# Reddit user gender classification

### Libraries and configuration

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import sparse

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
# Plot styling, applied in list order.
# NOTE(review): none of these are built-in matplotlib style names. 'grid', 'science'
# and 'notebook' presumably come from the SciencePlots package and 'mylegend' from a
# user-defined .mplstyle file -- confirm both are installed/registered before this runs.
plt.style.use(['grid', 'science', 'notebook', 'mylegend'])

# Directory (relative to the working directory) holding the CSV files read by load_data.
data_dir = 'data'

## Load the training and test data

In [None]:
def load_data(load_test: bool = False) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Read the dataset CSV files found under ``data_dir``.

    Args:
        load_test: When true, ``test_data.csv`` is read as well; otherwise an empty
            DataFrame is returned in its place.

    Returns:
        A ``(train_data, target, test_data)`` tuple of DataFrames.
    """
    features = pd.read_csv(f'{data_dir}/train_data.csv')
    labels = pd.read_csv(f'{data_dir}/train_target.csv')
    # The test file is optional: skip the read unless it was requested.
    held_out = pd.read_csv(f'{data_dir}/test_data.csv') if load_test else pd.DataFrame()
    return features, labels, held_out

In [None]:
# Only the training files are needed here; the (empty) test frame is discarded.
train_data, target, _ = load_data()

# Single quotes inside the double-quoted f-string: reusing the same quote character
# inside an f-string expression is a SyntaxError on Python < 3.12.
print(f"Number of authors in training set: {train_data['author'].unique().shape[0]}")

Number of authors in training set: 5000


## Feature extraction

In [5]:
def create_subreddit_idx(data: pd.DataFrame) -> pd.Series:
    """Assign a consecutive integer id to every distinct subreddit.

    The returned Series is indexed by subreddit name and holds the ids
    ``0 .. n-1``, numbered in the order the names come out of
    ``data["subreddit"].unique()``.
    """
    names = data["subreddit"].unique()
    ids = np.arange(len(names))
    return pd.Series(ids, index=names)

In [11]:
def extract_subreddits(
    author_data: pd.DataFrame,
    subreddit_idx: pd.Series,
) -> sparse.csr_array:
    """Encode the subreddits an author has posted in as a 1 x N indicator row.

    Args:
        author_data: Rows belonging to a single author; only the ``"subreddit"``
            column is read.
        subreddit_idx: Series indexed by subreddit name whose values are the column
            positions (see ``create_subreddit_idx``). ``N = len(subreddit_idx)``.

    Returns:
        A float64 CSR array of shape ``(1, N)`` with a 1 in the column of every
        subreddit the author has posted in. Subreddits that are not present in
        ``subreddit_idx`` (e.g. unseen at training time) are ignored, so data that
        was not used to build the index can still be encoded.
    """
    # Boolean mask over the index: which known subreddits did this author post in?
    posted = subreddit_idx.index.isin(author_data["subreddit"].unique())

    # np.unique sorts and de-duplicates, which yields canonical CSR column indices.
    columns = np.unique(subreddit_idx.to_numpy()[posted])

    # Build the single CSR row directly from (data, indices, indptr); this avoids
    # the element-by-element Python loop over an intermediate DOK container.
    values = np.ones(len(columns))
    indptr = np.array([0, len(columns)])
    return sparse.csr_array((values, columns, indptr), shape=(1, len(subreddit_idx)))

In [None]:
def extract_text(author_data: pd.DataFrame) -> str:
    """Concatenate every ``"body"`` entry of ``author_data`` into one string.

    Entries are converted to ``str`` first and joined with a single space, in
    row order.
    """
    posts = author_data["body"].astype(str)
    return " ".join(posts.tolist())

In [13]:
def vectorize_text(
    vectorizer: TfidfVectorizer, text: str | list[str]
) -> sparse.csr_array:
    """Turn raw documents into TF-IDF features using ``vectorizer``.

    The vectorizer is fitted on the first call (when it has no learned vocabulary
    yet) and only applied on later calls, so it must see the training corpus
    first; subsequent corpora are then projected onto the same vocabulary.

    Args:
        vectorizer: The TF-IDF vectorizer to fit and/or apply. It is mutated when
            it gets fitted.
        text: One document, or a list with one document per row of the result.

    Returns:
        A CSR array with one row per document.
    """
    # The vectorizer expects an iterable of documents; wrap a lone string so it is
    # treated as a single document rather than rejected.
    documents = [text] if isinstance(text, str) else list(text)

    # `vocabulary_` only exists once the vectorizer has been fitted.
    if hasattr(vectorizer, "vocabulary_"):
        tfidf = vectorizer.transform(documents)
    else:
        tfidf = vectorizer.fit_transform(documents)

    return sparse.csr_array(tfidf)

In [None]:
def extract_features(
    data: pd.DataFrame,
    subreddit_idx: pd.Series,
    vectorizer: TfidfVectorizer,
    *,
    target: pd.DataFrame | None = None,
) -> tuple[sparse.csr_matrix, pd.Series] | sparse.csr_matrix:
    """Build the per-author feature matrix (subreddit indicators + text features).

    Args:
        data: Rows with ``"author"``, ``"subreddit"`` and ``"body"`` columns.
        subreddit_idx: Subreddit name -> column index, as consumed by
            ``extract_subreddits``.
        vectorizer: Passed through to ``vectorize_text``.
        target: Optional frame with ``"author"`` and ``"gender"`` columns. When
            given, the rows of ``X`` follow the order of ``target["author"]``.
            When omitted, rows follow the sorted author order produced by
            ``data.groupby("author")``.

    Returns:
        ``X`` alone when ``target`` is None, otherwise ``(X, y)`` where ``y`` is
        ``target["gender"]``. ``X`` is in CSR format, with the subreddit columns
        first and the text columns after them.

    Raises:
        KeyError: If ``target`` lists an author that has no rows in ``data``.
    """
    # A single pass over the groups collects both per-author feature sources.
    subreddits_by_author: dict[str, sparse.csr_array] = {}
    text_by_author: dict[str, str] = {}
    for author, group in data.groupby("author"):
        subreddits_by_author[author] = extract_subreddits(group, subreddit_idx)
        text_by_author[author] = extract_text(group)

    # Row order: labelled authors when a target is supplied, otherwise every author
    # found in the data (dicts preserve the sorted groupby order).
    if target is None:
        authors = list(subreddits_by_author)
    else:
        authors = target["author"].tolist()

    # One row per author, one column per subreddit.
    subreddits_matrix = sparse.vstack(
        [subreddits_by_author[author] for author in authors]
    )

    author_text: list[str] = [text_by_author[author] for author in authors]
    text_features = vectorize_text(vectorizer, author_text)

    # hstack returns COO unless told otherwise; request CSR to match the annotation.
    X = sparse.hstack([subreddits_matrix, text_features], format="csr")

    if target is None:
        return X

    y: pd.Series = target["gender"]
    return X, y

In [None]:
# Split by author rather than by row: extract_features groups `train_data` by author
# (several rows per author) and looks authors up through target["author"], so the
# two frames cannot be split row-for-row against each other.
y_train, y_val = train_test_split(target, test_size=0.2, random_state=42)

# Each split keeps every row written by the authors assigned to it.
X_train = train_data[train_data["author"].isin(y_train["author"])]
X_val = train_data[train_data["author"].isin(y_val["author"])]

In [None]:
# Subreddit -> column index mapping, built from the complete training frame.
subreddit_idx = create_subreddit_idx(train_data)

# NOTE: max_features=10000 is a provisional value and still needs to be tuned.
vectorizer = TfidfVectorizer(
    max_df=0.95,
    stop_words="english",
    max_features=10000,
)

In [None]:
# Convert each split into (feature matrix, labels). The training split is processed
# first, then the validation split, both with the same subreddit_idx and vectorizer.
# NOTE: X_*/y_* are rebound here from the split frames to the extracted features, so
# re-running this cell without re-running the split cell would pass feature matrices
# back into extract_features.
X_train, y_train = extract_features(X_train, subreddit_idx, vectorizer, target=y_train)
X_val, y_val = extract_features(X_val, subreddit_idx, vectorizer, target=y_val)