# Libraries

In [45]:
%matplotlib inline
from collections import Counter
from collections import defaultdict
import enum

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy import sparse
import re
import nltk

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit, cross_val_score
from sklearn.metrics import RocCurveDisplay
from sklearn.pipeline import Pipeline

import seaborn

# Load data

In [29]:
#####################
# Utility Functions #
#####################

def get_dataframes_from_csv(path_to_features, path_to_targets = None):
    """
    Read the features CSV and, when a path is supplied, the targets CSV.

    Parameters
    ----------
    path_to_features : path of the features CSV; read with encoding "utf8".
    path_to_targets : optional path of the targets CSV. Any falsy value
        (None, empty string) means "no targets file".

    Returns
    -------
    The features DataFrame alone when no targets path is given, otherwise
    the tuple (features DataFrame, targets DataFrame).
    """
    features = pd.read_csv(path_to_features, encoding="utf8")

    if path_to_targets:
        # Targets are read with pandas' default encoding, as before.
        targets = pd.read_csv(path_to_targets)
        return features, targets

    return features

def get_features_from_dataframe(features_dataframe) -> tuple[list, list, list, list]:
    """
    Get all feature columns as lists from DataFrame

    Parameters
    ----------
    features_dataframe : DataFrame exposing the columns `author`, `body`,
        `subreddit` and a timestamp column named `created_utc` (the name used
        by the rest of this notebook) or, failing that, `created_at`.

    Returns
    -------
    Tuple (usernames, comments, subreddits, created_at), one list per column,
    all in the DataFrame's row order.

    Raises
    ------
    AttributeError if one of the required columns is missing.
    """
    # The grouping code in this notebook aggregates `created_utc`, so prefer it;
    # `created_at` is kept as a fallback for frames that use the older name.
    if "created_utc" in features_dataframe.columns:
        timestamp_column = "created_utc"
    else:
        timestamp_column = "created_at"

    usernames = list(features_dataframe.author)
    comments = list(features_dataframe.body)
    subreddits = list(features_dataframe.subreddit)
    # getattr keeps the AttributeError raised for a missing column.
    created_at = list(getattr(features_dataframe, timestamp_column))
    return usernames, comments, subreddits, created_at

def get_targets_from_dataframe(features_dataframe, targets_dataframe) -> list[int]:
    """
    Build the list of targets aligned with the rows of the features DataFrame.

    Each row's `author` is looked up in an author -> gender mapping built from
    `targets_dataframe`. The result has one entry per features row, in row order.
    A features author missing from `targets_dataframe` raises KeyError.
    """
    gender_by_author : dict[str, int] = {
        author: gender
        for author, gender in zip(targets_dataframe.author, targets_dataframe.gender)
    }
    return [gender_by_author[author] for author in features_dataframe.author]

In [42]:
#############
# Execution #
#############

# Read the training comments and the author -> gender targets from disk.
training_features_dataframe, training_targets_dataframe = get_dataframes_from_csv(
    path_to_features="data/train_data.csv",
    path_to_targets="data/train_target.csv",
)

# One comment body per row, plus the target looked up from that row's author.
training_comments : list[str] = list(training_features_dataframe.body)
training_targets : list[int] = get_targets_from_dataframe(
    features_dataframe=training_features_dataframe,
    targets_dataframe=training_targets_dataframe,
)

# Peek at the first three comments and their targets.
print(training_comments[:3])
print(training_targets[:3])

["I don't think we'd get nearly as much fanfiction and pictures shipping Ban-Ban and Lyro. Just saying.", "Thanks. I made it up, that's how I got over my first heart break. ", "Are you sure you aren't confusing Cyclops (the easiest boss monster) for Ogres? I'm talking about [these guys](http://i.imgur.com/c3YKPdI.jpg)\n\nMaybe I'm just a bad player... But every time I faced one on my first playthrough, all my pawns ended up getting to 0 HP at least once and I could barely get an attack in once it started berserking."]
[0, 1, 0]


# Group by author

In [36]:
#####################
# Utility Functions #
#####################

def group_dataframe_by_author(features_dataframe):
    """
    Collapse the features DataFrame to one row per author.

    Within each author group, `subreddit` and `body` values are combined with
    `join_strings` and `created_utc` values with `join_ints`. `author` is
    kept as a regular column (`as_index=False`).
    """
    column_aggregators = {
        'subreddit': join_strings,
        'body': join_strings,
        'created_utc': join_ints,
    }
    grouped_by_author = features_dataframe.groupby('author', as_index=False)
    return grouped_by_author.agg(column_aggregators)

def join_strings(x : list[str]):
    """
    Join all elements of a list/iterable of strings with a white-space in-between.

    Parameters
    ----------
    x : list (or any iterable) of strings.

    Returns
    -------
    A single string; the empty string when `x` has no elements.

    Raises
    ------
    TypeError if any element is not a string.
    """
    # The annotation uses the built-in `list[...]` generic, like the rest of
    # this notebook: `typing.List` is never imported here, so referring to
    # `List` fails with NameError wherever annotations are evaluated eagerly.
    return ' '.join(x)

def join_ints(x : list[int]):
    """
    Join all elements of a list/iterable of ints with a comma in-between.

    Each element is converted with `str()` before joining, so any value with
    a string representation is accepted (a float such as 1.5 becomes "1.5").

    Parameters
    ----------
    x : list (or any iterable) of numbers.

    Returns
    -------
    A single comma-separated string; the empty string when `x` has no elements.
    """
    # The annotation uses the built-in `list[...]` generic, like the rest of
    # this notebook: `typing.List` is never imported here, so referring to
    # `List` fails with NameError wherever annotations are evaluated eagerly.
    return ','.join(map(str, x))


In [43]:
#############
# Execution #
#############

# Collapse the training rows to one row per author.
training_features_dataframe_groupby_author = group_dataframe_by_author(training_features_dataframe)

# Annotations use the built-in `list[...]` generic, like the earlier cells:
# `typing.List` is never imported in this notebook, and module-level
# annotations are evaluated at runtime, so `List[...]` raised NameError here.
training_comments_groupby_author : list[str] = list(training_features_dataframe_groupby_author.body)
training_targets_groupby_author : list[int] = get_targets_from_dataframe(training_features_dataframe_groupby_author, training_targets_dataframe)

# Peek at the grouped frame, the first joined comment string and the first five targets.
print(training_features_dataframe_groupby_author.head())
print(training_comments_groupby_author[:1])
print(training_targets_groupby_author[:5])

           author                                          subreddit  \
0          -Jared   AskReddit tall pics StarWars AskReddit AskReddit   
1         -Peeter                                             gainit   
2        -evasian  MouseReview MechanicalKeyboards jailbreak jail...   
3         -rubiks  AskWomen AskWomen AskWomen AskWomen AskWomen A...   
4  -true_neutral-                    mildlyinteresting todayilearned   

                                                body  \
0  Neil Diamond - Sweet Caroline +1 on the chirop...   
1                         Just read the FAQ, really.   
2  I just received my Deathadder Black Edition ye...   
3  AlunaGeorge - Best Be Believing\nArctic Monkey...   
4  &gt; Urban Ears Metis\n\nYMMV. I have had thre...   

                                         created_utc  
0  1390189315.0,1390189970.0,1390492589.0,1390496...  
1                                       1389962703.0  
2  1388678755.0,1388688144.0,1389891805.0,1389892...  
3  1389194

# Clean textual data

In [55]:
#####################
# Utility Constants #
#####################
class TokenPatterns(enum.Enum):
    """
    Regex token patterns intended for CountVectorizer and TfidfVectorizer.

    Members are plain Enum members; read a member's `.value` to obtain the
    pattern string itself.
    """
    # One token per maximal run of non-whitespace characters.
    TKN_ALL_BUT_WHITESPACE = r'[^\s]+'

    # Three or more ASCII letters, starting at a word boundary.
    TKN_3_OR_MORE_ALPHA = r'(?u)\b[A-Za-z]{3,}'

    # One or more ASCII letters, starting at a word boundary.
    TKN_1_OR_MORE_ALPHA = r'(?u)\b[A-Za-z]{1,}'

#####################
# Utility Functions #
#####################

In [None]:
#############
# Execution #
#############

# Generate targets

# Generate Part-Of-Speech data

# Generate subreddit data

# Generate timestamp data

# Generate CV-TFIDF comment data

# Generate username data

# Latent Semantic Analysis

# Fit together/separately and with different estimators

## Pipeline

## Grid Search and Cross-Validation

# Ensemble classifier

# Evaluate on Test Set