In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, TargetEncoder
from sklearn.feature_extraction import FeatureHasher
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

In [None]:
# Read the OKCupid profiles into a DataFrame.
# Provenance: downloaded on January 18 from
# https://github.com/rudeboybert/JSE_OkCupid/blob/master/profiles_revised.csv.zip
profiles_csv = "profiles_revised.csv"
df = pd.read_csv(profiles_csv)

In [None]:
# Print pandas' summary of the loaded frame (columns, non-null counts, dtypes)
# before doing anything else with it.
df.info()

In [None]:
# Split first, explore second: hold out 20% of the rows as a test set.
# The fixed random_state keeps the split identical on every run.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Number of distinct values per candidate categorical feature (training split).
# "object" dtype is used as a stand-in for "categorical"; that assumption
# may not hold for every column.
object_cols = train.select_dtypes(include="object")
unique_counts = object_cols.nunique()

print("Feature     Count")
for col_name, n_unique in unique_counts.items():
    print(f"{col_name:12}{n_unique}")

In [None]:
# Distribution of relationship status in the training split.
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(train["status"].value_counts())

# One-hot encode status. Fit on the training split only (not the full df) so
# that the held-out test rows play no part in fitting the encoder.
# sparse_output=False + set_output("pandas") returns a dense DataFrame with
# one named column per category.
stat_enc = OneHotEncoder(sparse_output=False).set_output(transform="pandas")
stat_one_hot = stat_enc.fit_transform(train[["status"]])
stat_one_hot.head()

In [None]:
# What's going on with speaks? Peek at the first 10 raw values in the training split.
train["speaks"].head(10)

In [None]:
# Inspect how often each drinks level occurs in the training split
print(train["drinks"].value_counts())

# Ordinal encoding needs the levels spelled out in the intended order,
# from least to most frequent drinking.
drink_scale = [
    "not at all",
    "rarely",
    "socially",
    "often",
    "very often",
    "desperately",
]

# Values not in drink_scale are encoded as NaN instead of raising.
drink_enc = OrdinalEncoder(
    categories=[drink_scale],
    handle_unknown="use_encoded_value",
    unknown_value=np.nan,
)
drink_ord = drink_enc.fit_transform(train[["drinks"]])

In [None]:
# For demonstration purposes, simply discard rows with a missing speaks value
nona_speaks = train["speaks"].dropna()

# Each entry is a ", "-separated string; split it into a list of tokens and
# hash every token into one of 8 output columns.
speaks_tokens = nona_speaks.str.split(", ")
encoder = FeatureHasher(n_features=8, input_type="string")
encoded_speaks = encoder.fit_transform(speaks_tokens)

In [None]:
# Show the first five raw speaks strings next to their hashed rows
preview_values = nona_speaks[:5]
preview_rows = encoded_speaks[:5]
for raw_value, hashed_row in zip(preview_values, preview_rows):
    # densify the single sparse row so the printed vector is readable
    print(f"{raw_value}: {hashed_row.todense()}")

In [None]:
# income vs education: summary statistics of income within each education level.
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(train.groupby("education")["income"].describe())

# Target-encode education using income as a continuous target.
# fit_transform cross-fits internally on shuffled folds, so random_state is
# pinned to keep edu_feat reproducible across Restart & Run All.
# NOTE(review): if income uses a sentinel for "not reported", those rows pull
# the encodings down — confirm before interpreting the dollar values.
edu_encoder = TargetEncoder(target_type="continuous", random_state=42)
edu_feat = edu_encoder.fit_transform(train[["education"]], train["income"])

# Look at the target-encoded values, highest encoded income first
print(f"{'Education':35} Income")
encoded_levels = zip(edu_encoder.categories_[0], edu_encoder.encodings_[0])
for cat, enc in sorted(encoded_levels, key=lambda e: e[1], reverse=True):
    print(f"{cat:35} ${enc:,.2f}")

In [None]:
# Discretize income into three labelled buckets:
#   (-10000, 30000] -> "low", (30000, 80000] -> "medium", (80000, 100000] -> "high"
# retbins=True makes pd.cut return a (buckets, bin_edges) pair, so unpack it
# instead of binding the whole tuple to income_buckets.
# Values outside (-10000, 100000] fall in no bin and come back as NaN.
income_buckets, income_bin_edges = pd.cut(
    train["income"],
    bins=[-10000, 30000, 80000, 100000],
    labels=["low", "medium", "high"],
    retbins=True,
)
income_buckets

## Extra plots
Used in lecture slides, but how they're made isn't very exciting

In [None]:
# Discretize age into a boolean "under 30" feature stored on the training frame
train["u30"] = train["age"] < 30

# 40 evenly spaced bin edges between 10 and 80, shared by both histograms
bins = np.linspace(10, 80, 40)

# Overlay the two age groups on a single set of axes
is_under_30 = train["u30"]
ax = train.loc[is_under_30, "age"].hist(label="Under 30", bins=bins)
train.loc[~is_under_30, "age"].hist(label="30+", ax=ax, bins=bins)

ax.legend()
ax.set_xlabel("Age")
ax.set_ylabel("Frequency")

# Export the figure used in the lecture slides
plt.savefig("../../static/img/04-discrete.png")