In [18]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import os

import numpy as np
import pandas as pd
from scipy import sparse

In [2]:
data_dir = 'data'

# Load the train data

In [3]:
train_data = pd.read_csv(f"{data_dir}/train_data.csv", encoding="utf8")
train_data.head()

Unnamed: 0,author,subreddit,created_utc,body
0,Shamus_Aran,mylittlepony,1388534000.0,I don't think we'd get nearly as much fanficti...
1,Riddance,sex,1388534000.0,"Thanks. I made it up, that's how I got over my..."
2,Secret_Wizard,DragonsDogma,1388534000.0,Are you sure you aren't confusing Cyclops (the...
3,Penultimatum,malefashionadvice,1388534000.0,dont do this to me bro
4,7-SE7EN-7,todayilearned,1388534000.0,That's what we do when we can't find a mate


In [4]:
train_data.author.unique().shape

(5000,)

In [5]:
target = pd.read_csv(f"{data_dir}/train_target.csv", encoding="utf8")
target.head()  # 1 - male; 0 - female

Unnamed: 0,author,gender
0,RedThunder90,0
1,Lirkmor,1
2,In0chi,0
3,ProjectGrudge,0
4,TehTurtleHermit,0


# Feature Extraction

In [25]:
# Fix a stable column order for the subreddit features: every distinct subreddit seen
# in the training data gets one integer position.
subreddits = train_data.subreddit.unique()
# subreddits_map is a label-indexed lookup (subreddit name -> column index);
# extract_features uses it to translate subreddit names into matrix columns.
subreddits_map = pd.Series(np.arange(len(subreddits)), index=subreddits)

In [26]:
def extract_features(group: pd.DataFrame) -> sparse.csr_matrix:
    """
    Encode the subreddits an author has posted in as a 1xN binary indicator row.

    N is the number of entries in `subreddits_map` (one per distinct subreddit in the
    training data). Column j holds 1.0 if the author posted at least once in the
    subreddit mapped to index j and is left empty otherwise. Subreddits that are not
    in `subreddits_map` are ignored.

    Parameters
    ----------
    group : pd.DataFrame
        Rows belonging to a single author; must contain a 'subreddit' column.

    Returns
    -------
    sparse.csr_matrix
        A float64 matrix of shape (1, N).
    """
    posted_in = group['subreddit']
    # Keep only subreddits that own a column, so the label lookup below cannot fail
    # on a name that is missing from the map.
    known = posted_in[posted_in.isin(subreddits_map.index)].values

    # Translate names to column indexes. np.unique collapses repeated posts in the
    # same subreddit: the (data, (row, col)) constructor sums duplicate coordinates,
    # which would turn the 0/1 indicator into a post count.
    cols = np.unique(subreddits_map.loc[known].values)
    rows = np.zeros(cols.shape[0], dtype=np.intp)  # single-row matrix: every entry is in row 0
    ones = np.ones(cols.shape[0], dtype=np.float64)
    return sparse.csr_matrix((ones, (rows, cols)), shape=(1, len(subreddits_map)))

extract_features(train_data[train_data.author=='RedThunder90'])

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1 stored elements and shape (1, 3468)>

In [27]:
# Create a dictionary mapping the author to the sparse matrix of subreddits they have
# posted in

# Values are the 1 x N_subreddits CSR rows returned by extract_features (not
# DataFrames), keyed by author name.
features_dict: dict[str, sparse.csr_matrix] = {
    author: extract_features(group)
    for author, group in train_data.groupby('author')
}

In [28]:
# Generate a sparse matrix with the labelled authors as rows and the subreddits they
# have posted in as columns

# Rows are stacked in target.author order, so row i of X belongs to the i-th row of
# `target`. A labelled author with no entry in features_dict raises KeyError here.
X = sparse.vstack([features_dict[author] for author in target.author])
X

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 49152 stored elements and shape (5000, 3468)>

In [29]:
y = target.gender

In [31]:
def extract_text(group: pd.DataFrame) -> str:
    """
    Concatenate all the posts of an author into a single space-separated string.

    Missing bodies (None / NaN) are skipped so that the literal tokens "nan" or
    "None" are not injected into the author's text. Remaining values are cast to
    str, so non-string bodies are still included.

    Parameters
    ----------
    group : pd.DataFrame
        Rows belonging to a single author; must contain a 'body' column.

    Returns
    -------
    str
        The posts joined by single spaces, in row order; "" if there are none.
    """
    bodies = group['body'].dropna().astype(str)
    return " ".join(bodies)

extract_text(train_data[train_data.author == "RedThunder90"])

'I still prefer to buy foods either grown locally or where animals are treated better, but this definitely has me looking at organic food differently.'

In [32]:
# Create a dictionary mapping the author to the text of all their posts

# One pass over the per-author groups: author name -> all of their posts as one string.
text_dict: dict[str, str] = {
    name: extract_text(posts)
    for name, posts in train_data.groupby('author')
}

In [34]:
# Build a list aligned with target.author: position i holds all the text posted by
# the i-th labelled author

author_text = [text_dict[author] for author in target.author]
author_text[0]

'I still prefer to buy foods either grown locally or where animals are treated better, but this definitely has me looking at organic food differently.'

At this point I have two data structures:
- `X`: a sparse N_authors x N_subreddits matrix whose entry (i, j) is 1 if author i has posted in subreddit j
- `author_text`: a list of length N_authors containing in position i all the text posted by author i

# Model Selection

In [16]:
# YOUR CODE HERE

class Model:
    """Placeholder classifier that scores every sample with zeros for both classes."""

    def predict_proba(self, X):
        """Return an all-zero float array of shape (n_samples, 2), one row per row of X."""
        n_samples = X.shape[0]
        return np.zeros((n_samples, 2))
    
model = Model()

# Prepare the solution

In [17]:
test_data = pd.read_csv(f"{data_dir}/test_data.csv", encoding="utf8")

In [18]:
# Same encoding as for the training authors. extract_features looks columns up in the
# subreddits_map built from the training data, so the test rows share X's column layout.
# Values are CSR rows (not DataFrames), keyed by author name.
features_dict: dict[str, sparse.csr_matrix] = {
    author: extract_features(group)
    for author, group in test_data.groupby('author')
}

In [19]:
# Rows follow test_data.author.unique() order (order of first appearance), which is not
# the sorted order groupby iterates in; per-author outputs must reuse this same ordering.
X_test = sparse.vstack([features_dict[author] for author in test_data.author.unique()])
X_test

<15000x3468 sparse matrix of type '<class 'numpy.float64'>'
	with 144898 stored elements in Compressed Sparse Row format>

In [20]:
# Rebuild the author -> concatenated-posts lookup, this time from the test posts.
text_dict = {
    name: extract_text(posts)
    for name, posts in test_data.groupby('author')
}

In [21]:
author_text_test = [text_dict[author] for author in test_data.author.unique()]

In [22]:
author_text_test[0][:100]

"I hadn't ever heard of them before joining this subreddit. They're not really a big thing in the US,"

In [23]:
# Keep only the second column of predict_proba.
# NOTE(review): assumed to be the score for class 1 (male under the "1 - male; 0 - female"
# encoding) — confirm once the placeholder Model is replaced with a real classifier.
y_pred = model.predict_proba(X_test)[:,1]

In [24]:
# test_data.author.unique() is the same ordering used to stack X_test, so y_pred[i]
# lines up with the i-th author listed here.
solution = pd.DataFrame({"author": test_data.author.unique(), "gender": y_pred})
solution.head()

Unnamed: 0,author,gender
0,ejchristian86,0.0
1,ZenDragon,0.0
2,savoytruffle,0.0
3,hentercenter,0.0
4,rick-o-suave,0.0


In [25]:
solution.to_csv("submission.csv", index=False)

Now go to [Kaggle](https://www.kaggle.com/competitions/datamining2024/overview), click "Submit Prediction" and upload the file "submission.csv" to see the test score.