# Reddit user gender classification

## Libraries and configuration

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.sparse as sp

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [None]:
# Matplotlib style sheets, applied in the order listed.
# NOTE(review): these names are not built-in matplotlib styles; presumably they come
# from an installed style package and/or a local .mplstyle file -- confirm they are
# available in the environment.
plt.style.use(["grid", "science", "notebook", "mylegend"])

# Directory (relative to the working directory) that holds the CSV files.
data_dir = "data"

## Load the training and test data

In [None]:
def load_data(load_test: bool = False) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Read the training CSV files found in ``data_dir`` and, on request, the test CSV.

    When ``load_test`` is false the test file is not read and an empty DataFrame is
    returned in its place. The result is the tuple ``(train_data, target, test_data)``.
    """
    features = pd.read_csv(f'{data_dir}/train_data.csv')
    labels = pd.read_csv(f'{data_dir}/train_target.csv')
    held_out = pd.read_csv(f'{data_dir}/test_data.csv') if load_test else pd.DataFrame()
    return features, labels, held_out

In [None]:
# Load the training frame, its targets and the test frame.
train_data, target, test_data = load_data(load_test=True)

# The count is computed outside the f-string: reusing the enclosing double quotes
# inside the replacement field is only accepted from Python 3.12 onwards.
n_authors = train_data["author"].unique().shape[0]
print(f"Number of authors in training set: {n_authors}")

## Feature extraction

In [None]:
def create_subreddit_idx(data: pd.DataFrame) -> pd.Series:
    """
    Build a lookup from subreddit name to a unique integer.

    The returned Series is indexed by the distinct values of ``data["subreddit"]``
    (in order of first appearance) and holds the integers 0..N-1.
    """
    unique_subs = data["subreddit"].unique()
    positions = np.arange(len(unique_subs))
    return pd.Series(positions, index=unique_subs)

In [None]:
def extract_subreddits(
    author_data: pd.DataFrame,
    subreddit_idx: pd.Series,
) -> sp.csr_array:
    """
    Encode the subreddits of a single author as a 1 x N sparse row, where N is
    ``len(subreddit_idx)``. For every subreddit ``s`` in ``author_data["subreddit"]``
    that is present in ``subreddit_idx``, column ``subreddit_idx[s]`` is set to 1.
    Subreddits that are missing from ``subreddit_idx`` are ignored.
    """
    posted = author_data["subreddit"]

    # keep only the subreddits that have a column assigned in the index
    known = posted[posted.isin(subreddit_idx.index)].to_numpy()

    # translate subreddit names into column positions
    columns = subreddit_idx.loc[known].to_numpy()

    # fill a dictionary-of-keys row, then hand it back in compressed sparse row format
    row = sp.dok_array((1, len(subreddit_idx)))
    for col in columns:
        row[0, col] = 1  # repeated subreddits simply rewrite the same cell
    return row.tocsr()

In [None]:
def extract_text(author_data: pd.DataFrame) -> str:
    """Concatenate every entry of the ``body`` column (cast to ``str``), separated by spaces."""
    posts = author_data["body"].astype(str)
    return " ".join(posts.tolist())

In [None]:
def vectorize_text(
    vectorizer: TfidfVectorizer,
    text: list[str],
    data_is_test: bool,
) -> sp.csr_array:
    """
    Turn the per-author texts into features with the given vectorizer.

    Test data goes through ``vectorizer.transform`` only, so the vectorizer is left
    as previously fitted; any other data goes through ``vectorizer.fit_transform``,
    which fits the vectorizer on ``text`` first.
    """
    apply = vectorizer.transform if data_is_test else vectorizer.fit_transform
    return apply(text)

In [None]:
def extract_features(
    data: pd.DataFrame,
    subreddit_idx: pd.Series,
    vectorizer: TfidfVectorizer,
    *,
    target: pd.DataFrame | None = None,
) -> tuple[sp.csr_matrix, pd.Series] | sp.csr_matrix:
    """
    Build the per-author feature matrix from the raw posts.

    Each row describes one author: the subreddit indicator columns produced by
    ``extract_subreddits`` come first, followed by the text features produced by
    ``vectorize_text``.

    Args:
        data: Posts with at least the columns ``author``, ``subreddit`` and ``body``.
        subreddit_idx: Mapping from subreddit name to column position.
        vectorizer: Text vectorizer. It is fitted when ``target`` is given and only
            applied when ``target`` is ``None``.
        target: Frame with the columns ``author`` and ``gender``. When given, the
            data is treated as training data and the rows of the feature matrix
            follow the order of ``target["author"]``. When ``None``, the data is
            treated as test data and rows follow ``data["author"].unique()``.

    Returns:
        ``X`` alone for test data, or ``(X, y)`` for training data, where ``y`` is
        ``target["gender"]``.

    Raises:
        KeyError: If an author that should get a row has no group in ``data``.
    """
    data_is_test = target is None

    # One pass over the per-author groups collects both kinds of raw features.
    subs_by_author: dict[str, sp.csr_array] = {}
    text_by_author: dict[str, str] = {}
    for author, group in data.groupby("author"):
        subs_by_author[author] = extract_subreddits(group, subreddit_idx)
        text_by_author[author] = extract_text(group)

    # The author sequence fixes the row order of every matrix built below.
    authors = data["author"].unique() if data_is_test else target["author"]

    # Authors as rows, subreddits they have posted in as columns.
    subs_matrix = sp.vstack([subs_by_author[author] for author in authors])

    author_text: list[str] = [text_by_author[author] for author in authors]
    text_features = vectorize_text(vectorizer, author_text, data_is_test)

    # Request CSR explicitly so the result format does not depend on scipy's default.
    X = sp.hstack([subs_matrix, text_features], format="csr")

    if data_is_test:
        return X
    y: pd.Series = target["gender"]
    return X, y

In [None]:
# Column positions for the subreddits, derived from the training data only.
subreddit_idx = create_subreddit_idx(train_data)

# TODO: max_features still needs to be tuned.
vectorizer = TfidfVectorizer(
    stop_words="english",
    max_df=0.95,
    max_features=10000,
)

In [None]:
# Passing `target` marks the data as training data, so the vectorizer is fitted here.
X, y = extract_features(
    train_data,
    subreddit_idx,
    vectorizer,
    target=target,
)
# X_test = extract_features(test_data, subreddit_idx, vectorizer)
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

## Model selection

Define a set of models to try on the dataset. Then, for each model, perform hyperparameter tuning using `GridSearchCV`. Finally, pick the best model overall.

In [None]:
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.naive_bayes import GaussianNB as NB
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neural_network import MLPClassifier as MLP

# GaussianNB only accepts dense input, while the feature matrix X is a scipy sparse
# matrix. MultinomialNB works on sparse input and fits these features, which are all
# non-negative (0/1 subreddit indicators and tf-idf weights).
from sklearn.naive_bayes import MultinomialNB

# Candidate models, keyed by the display name used in the final score table.
classifiers = {
    "LogReg": LogReg(),
    "SVM": SVC(probability=True),
    "KNN": KNN(),
    "Naive Bayes": MultinomialNB(),
    "Decision Tree": DT(),
    "Gradient Boosting": GBC(),
    "Random Forest": RF(),
    "MultiLayer Perceptron": MLP(),
}

# Show the default hyperparameters of every candidate.
for name, clf in classifiers.items():
    print(f"{name} -- parameters: {clf.get_params()}")

In [None]:
# One hyperparameter grid per classifier, in the same order as the entries of
# `classifiers` (the two are paired positionally below).
# NOTE(review): np.logspace returns 50 samples by default, so the SVM, Gradient
# Boosting and MLP grids contain a very large number of combinations -- consider
# passing a smaller `num` before running the search.
param_grids = [
    # LogReg
    {'C': np.logspace(0, 3)},
    # SVM
    {
        'C': np.logspace(0, 3),
        "kernel": ['linear', 'poly', 'rbf'],
        "degree": np.arange(2, 5),
        "gamma": ['scale', 'auto', 1.0e-3, 1.0e-4],
    },
    # KNN
    {'n_neighbors': np.arange(1, 10), "weights": ['uniform', 'distance']},
    # Naive Bayes (nothing to tune)
    {},
    # Decision Tree
    {'max_depth': np.arange(1, 10), "min_samples_split": np.arange(2, 10)},
    # Gradient Boosting
    {
        'n_estimators': np.arange(1, 100, 10),
        "learning_rate": np.logspace(-3, 0),
        "max_depth": np.arange(1, 5),
    },
    # Random Forest
    {'n_estimators': np.arange(1, 100, 10)},
    # MultiLayer Perceptron
    {
        'hidden_layer_sizes': [(100,), (200,), (300,)],
        "activation": ["logistic", "tanh", "relu"],
        "solver": ["adam", "sgd"],
        "alpha": np.logspace(-3, 0),
        "learning_rate": ["constant", "adaptive"],
        "learning_rate_init": np.logspace(-3, 0),
        "early_stopping": [True, False],
    },
]

best_clfs = {}
best_pars = {}
# zip() yields ((name, clf), param_grid), so the (name, clf) pair needs its own
# nested target. strict=True raises if the number of grids and classifiers differ
# instead of silently dropping the extra entries.
for (name, clf), param_grid in zip(classifiers.items(), param_grids, strict=True):
    search = GridSearchCV(clf, param_grid, cv=5, scoring='roc_auc')
    search.fit(X, y)
    best_clfs[name] = search.best_estimator_
    best_pars[name] = search.best_params_

# Cross-validated ROC AUC of the best configuration found for each classifier.
best_scores = {}
for name, clf in best_clfs.items():
    scores = cross_val_score(clf, X, y, cv=5, scoring='roc_auc')
    best_scores[name] = scores

In [None]:
# Size the name column from the longest classifier name plus padding. A fixed width
# of 21 equals len("MultiLayer Perceptron"), which left no gap before its score.
header = "Classifier"
name_width = max([len(header), *(len(name) for name in best_scores)]) + 2

print(f"{header:<{name_width}}Score")
for name, score in best_scores.items():
    print(f"{name:<{name_width}}{score.mean():.6f} +/- {score.std():.6f}")