# Data Mining Challenge: *Reddit Gender Text-Classification* (MLP) 

### Modules

In [1]:
# Numpy & matplotlib for notebooks 
%pylab inline

# Pandas for data analysis and manipulation 
import pandas as pd 

# Sklearn 
from sklearn.preprocessing import StandardScaler # to standardize features by removing the mean and scaling to unit variance (z=(x-u)/s)
from sklearn.neural_network import MLPClassifier # Multi-layer Perceptron classifier which optimizes the log-loss function using LBFGS or sdg.
from sklearn.model_selection import train_test_split # to split arrays or matrices into random train and test subsets
from sklearn.model_selection import KFold # K-Folds cross-validator providing train/test indices to split data in train/test sets.
from sklearn.decomposition import PCA, TruncatedSVD # Principal component analysis (PCA); dimensionality reduction using truncated SVD.
from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import MultinomialNB # Naive Bayes classifier for multinomial models
from sklearn.feature_extraction.text import CountVectorizer # Convert a collection of text documents to a matrix of token counts
from sklearn.metrics import roc_auc_score as roc # Compute Area Under the Receiver Operating Characteristic Curve from prediction scores
from sklearn.metrics import roc_curve, auc # Compute ROC; Compute Area Under the Curve (AUC) using the trapezoidal rule

# Matplotlib
import matplotlib # Data visualization
import matplotlib.pyplot as plt 
import matplotlib.patches as mpatches  

# Seaborn
import seaborn as sns # Statistical data visualization (based on matplotlib)

Populating the interactive namespace from numpy and matplotlib


### Data Collection 

In [14]:
# Load the three CSV inputs: training rows, test rows and the training labels.
train_data = pd.read_csv("train_data.csv", encoding="utf8")
test_data = pd.read_csv("test_data.csv", encoding="utf8")
target = pd.read_csv("train_target.csv")

# Lookup table author -> gender, built by pairing the two target columns
# row by row (a repeated author keeps the value from its last row).
author_gender = dict(zip(target.author.to_numpy(), target.gender.to_numpy()))

### Data Manipulation 

In [6]:
# Collapse the training rows to one record per author:
# (author, space-separated string of that author's subreddits, gender label).
# groupby("author") iterates the groups in sorted author order.
per_author_records = [
    (name, posts.subreddit.str.cat(sep=" "), author_gender[name])
    for name, posts in train_data.groupby("author")
]

# Unpack the records into the parallel lists used by the later cells
a = [name for name, _, _ in per_author_records]            # authors
Xs = [subreddits for _, subreddits, _ in per_author_records]  # aggregated subreddit strings
y = [gender for _, _, gender in per_author_records]        # gender labels

# Lower-case the aggregated subreddit strings
clean_train_subreddits = list(map(str.lower, Xs))

In [None]:
# Preview only the first few authors instead of dumping the whole list
a[:5]

### Models Definition & Training

#### CountVectorizer

In [8]:
# Binary bag-of-words vectorizer over the per-author subreddit strings:
# with binary=True every non-zero count is stored as 1 (presence/absence).
vectorizer_ = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    binary=True,
)

# Learn the vocabulary and encode the training documents, then convert the
# sparse result to a dense array.
train_counts_sparse = vectorizer_.fit_transform(clean_train_subreddits)
train_data_subreddits = train_counts_sparse.toarray()

# Sanity check: number of vocabulary tokens present in row 1.
# (Not the last expression of the cell, so the value is not displayed.)
sum(train_data_subreddits[1])

# Labels as a NumPy array so they can be indexed with arrays of positions
y = np.array(y)

#### MLP Classifier

In [12]:
# Define the MLP classifier:
## Activation function for the hidden layers: rectified linear unit ("relu").
## Solver for weight optimization: "adam", a stochastic gradient-based optimizer.
## Alpha: strength of the L2 regularization term.
## Learning rate schedule: "invscaling" gradually decreases the learning rate at each time step t
##   using an inverse scaling exponent of power_t.
##   NOTE: scikit-learn only applies `learning_rate` when solver="sgd", so with "adam" this
##   setting has no effect; it is kept so the configuration matches the original run.
## Verbose: True in order to print progress messages to stdout.
## Early stopping: True in order to stop training when the validation score is not improving.
##   It automatically sets aside 10% of the training data as validation and terminates training
##   when the validation score is not improving by at least tol for n_iter_no_change consecutive epochs.
## max_iter caps the number of epochs; random_state is fixed for reproducible runs.

mlpClf = MLPClassifier(activation='relu', solver='adam',
                       alpha=0.05, learning_rate='invscaling', verbose=True,
                       early_stopping=True, max_iter=400, random_state=0)

# X_train / y_train were never defined in this notebook (they only existed as
# leftover kernel state), so a fresh "Restart & Run All" failed with NameError.
# Bind them to the full vectorized training matrix and its labels.
# NOTE(review): confirm this matches the split that was originally intended.
X_train = train_data_subreddits
y_train = np.asarray(y)

# K-fold splitter for cross-validation
kf = KFold(n_splits=8)

# Fit on each fold's training part and print the accuracy on its held-out part.
# Each fit() call re-trains the same estimator, so after the loop `mlpClf`
# holds the model fitted in the final iteration.
for train_indices, test_indices in kf.split(X_train):
    mlpClf.fit(X_train[train_indices], y_train[train_indices])
    print(mlpClf.score(X_train[test_indices], y_train[test_indices]))

Iteration 1, loss = 0.60899191
Validation score: 0.721461
Iteration 2, loss = 0.49872264
Validation score: 0.757991
Iteration 3, loss = 0.41251641
Validation score: 0.831050
Iteration 4, loss = 0.35127469
Validation score: 0.847032
Iteration 5, loss = 0.31168246
Validation score: 0.853881
Iteration 6, loss = 0.28519341
Validation score: 0.853881
Iteration 7, loss = 0.26592054
Validation score: 0.853881
Iteration 8, loss = 0.25054187
Validation score: 0.851598
Iteration 9, loss = 0.23895993
Validation score: 0.863014
Iteration 10, loss = 0.22934773
Validation score: 0.860731
Iteration 11, loss = 0.22117200
Validation score: 0.863014
Iteration 12, loss = 0.21420363
Validation score: 0.858447
Iteration 13, loss = 0.20825104
Validation score: 0.858447
Iteration 14, loss = 0.20350864
Validation score: 0.863014
Iteration 15, loss = 0.19891814
Validation score: 0.865297
Iteration 16, loss = 0.19472937
Validation score: 0.863014
Iteration 17, loss = 0.19118028
Validation score: 0.863014
Iterat

### Prediction 

In [15]:
# One space-separated subreddit string per test author
# (groupby("author") iterates the groups in sorted author order).
Xs_test = [
    posts.subreddit.str.cat(sep=" ")
    for _, posts in test_data.groupby("author")
]

# Peek at the second aggregated string (not displayed: not the last expression)
Xs_test[1]

# Lower-case the strings, mirroring the training preprocessing
clean_test_subreddits = list(map(str.lower, Xs_test))

# Encode with the vocabulary learned on the training data, then densify
test_counts_sparse = vectorizer_.transform(clean_test_subreddits)
test_data_subreddits = test_counts_sparse.toarray()

# Keep the second column of predict_proba, i.e. the probability of mlpClf.classes_[1]
class_probabilities = mlpClf.predict_proba(test_data_subreddits)
y_score = class_probabilities[:, 1]

# Number of scored authors (not displayed: not the last expression)
len(y_score)

# Persist the scores in NumPy binary format
np.save("y_testMLPs", y_score)