# Data Mining Challenge: *Reddit Gender Text-Classification* (MLP)

### Modules

In [1]:
# Numpy & matplotlib for notebooks 
%pylab inline

# Pandas for data analysis and manipulation 
import pandas as pd 

# Sklearn 
from sklearn.preprocessing import StandardScaler # to standardize features by removing the mean and scaling to unit variance (z=(x-u)/s)
from sklearn.neural_network import MLPClassifier # Multi-layer Perceptron classifier which optimizes the log-loss function using LBFGS or sdg.
from sklearn.model_selection import train_test_split # to split arrays or matrices into random train and test subsets
from sklearn.model_selection import KFold # K-Folds cross-validator providing train/test indices to split data in train/test sets.
from sklearn.decomposition import PCA, TruncatedSVD # Principal component analysis (PCA); dimensionality reduction using truncated SVD.
from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import MultinomialNB # Naive Bayes classifier for multinomial models
from sklearn.feature_extraction.text import CountVectorizer # Convert a collection of text documents to a matrix of token counts
from sklearn.metrics import roc_auc_score as roc # Compute Area Under the Receiver Operating Characteristic Curve from prediction scores
from sklearn.metrics import roc_curve, auc # Compute ROC; Compute Area Under the Curve (AUC) using the trapezoidal rule

# Matplotlib
import matplotlib # Data visualization
import matplotlib.pyplot as plt 
import matplotlib.patches as mpatches  

# Seaborn
import seaborn as sns # Statistical data visualization (based on matplotlib)

Populating the interactive namespace from numpy and matplotlib


### Data Collection 

In [2]:
# Load the training set, the test set and the training labels.

# Training and test data are read as UTF-8; later cells use their
# `author` and `subreddit` columns.
train_data = pd.read_csv("../input/dataset/train_data.csv", encoding="utf8")
test_data = pd.read_csv("../input/dataset/test_data.csv", encoding="utf8")

# Training labels: provides the `author` and `gender` columns.
target = pd.read_csv("../input/dataset/train_target.csv")

# Map every author to its gender label.
# zip() pairs the two columns positionally, so this does not depend on the
# DataFrame index and avoids a per-row label lookup; if an author appears
# more than once, the last row wins.
author_gender = dict(zip(target.author, target.gender))

### Data Manipulation 

In [3]:
# Per-author accumulators, filled in the order in which groupby yields the
# authors (sorted by author key, pandas' default):
#   Xs - all subreddits of the author joined into one space-separated string
#   y  - the author's gender label, looked up in author_gender
#   a  - the author name itself
Xs, y, a = [], [], []

for author, group in train_data.groupby("author"):
    Xs.append(group["subreddit"].str.cat(sep=" "))
    y.append(author_gender[author])
    a.append(author)

# Lower-case the aggregated subreddit strings (Xs itself is left untouched).
clean_train_subreddits = list(map(str.lower, Xs))

### Models Definition & Training

#### CountVectorizer

In [4]:
# Binary bag-of-subreddits encoder: every token of an author's aggregated
# subreddit string becomes one feature column, and binary=True stores 1 for
# a token that occurs at least once instead of its occurrence count.
vectorizer_ = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    binary=True,
)

# Learn the vocabulary from the training authors and encode them as a dense
# array with one row per author.
train_data_subreddits = vectorizer_.fit_transform(clean_train_subreddits).toarray()

# Labels as a NumPy array so they can be indexed with arrays of fold indices
# (e.g. y[train_indices]) during cross-validation.
y = np.array(y)

#### MLP Classifier

In [5]:
# Define the MLP classifier:
## Activation function for the hidden layer: rectified linear unit ("relu")
## Solver for weight optimization: "adam"
## Alpha: regularization parameter
## Learning rate schedule: intentionally not set. In scikit-learn the `learning_rate`
##   schedule (e.g. "invscaling") is only applied when solver="sgd"; with "adam" it is
##   ignored, so passing it would have no effect on training.
## Verbose: True, to print progress messages to stdout.
## Early stopping: True, to terminate training when the validation score is not improving.
##   It automatically sets aside 10% of the training data as validation and stops when the
##   validation score does not improve by at least tol for n_iter_no_change consecutive epochs.
## Random state: fixed seed so that repeated runs of this cell are reproducible.
mlpClf = MLPClassifier(
    activation="relu",
    solver="adam",
    alpha=0.05,
    verbose=True,
    early_stopping=True,
    max_iter=400,
    random_state=0,
)

# K-fold splitter for cross-validation; only the optional snippets below use it.
kfold = KFold(n_splits=10)

# Optional: manual training and validation on all K folds.
# for train_indices, test_indices in kfold.split(train_data_subreddits):
#     mlpClf.fit(train_data_subreddits[train_indices], y[train_indices])
#     print(mlpClf.score(train_data_subreddits[test_indices], y[test_indices]))

# Optional: cross-validated ROC-AUC. cross_val_score refits a fresh copy of the
# model on every fold and needs `from sklearn.model_selection import cross_val_score`.
# results = cross_val_score(mlpClf, train_data_subreddits, y, cv=kfold, scoring="roc_auc")
# print("roc = ", np.mean(results))

# Fit the final model on the full training set.
mlpClf.fit(train_data_subreddits, y)

Iteration 1, loss = 0.59613047
Validation score: 0.734000
Iteration 2, loss = 0.47953355
Validation score: 0.814000
Iteration 3, loss = 0.39179575
Validation score: 0.864000
Iteration 4, loss = 0.33398556
Validation score: 0.870000
Iteration 5, loss = 0.29788160
Validation score: 0.860000
Iteration 6, loss = 0.27413851
Validation score: 0.858000
Iteration 7, loss = 0.25758394
Validation score: 0.856000
Iteration 8, loss = 0.24291078
Validation score: 0.858000
Iteration 9, loss = 0.23275980
Validation score: 0.864000
Iteration 10, loss = 0.22383857
Validation score: 0.860000
Iteration 11, loss = 0.21650923
Validation score: 0.860000
Iteration 12, loss = 0.21024405
Validation score: 0.850000
Iteration 13, loss = 0.20492907
Validation score: 0.850000
Iteration 14, loss = 0.20017990
Validation score: 0.848000
Iteration 15, loss = 0.19573230
Validation score: 0.850000
Validation score did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.


MLPClassifier(activation='relu', alpha=0.05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=True, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='invscaling',
              learning_rate_init=0.001, max_fun=15000, max_iter=400,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=0, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=True,
              warm_start=False)

### Prediction 

In [6]:
# One space-separated subreddit string per test author; groupby yields the
# authors in sorted order, so every array below follows that same order.
Xs_test = [
    group["subreddit"].str.cat(sep=" ")
    for _, group in test_data.groupby("author")
]

# Same lower-casing as applied to the training strings.
clean_test_subreddits = list(map(str.lower, Xs_test))

# Encode with the vocabulary already fitted on the training data
# (transform only, no refit) and densify.
test_data_subreddits = vectorizer_.transform(clean_test_subreddits).toarray()

# Keep the second column of predict_proba as the score for each test author.
y_score = mlpClf.predict_proba(test_data_subreddits)[:, 1]

# Persist the scores; np.save appends the ".npy" extension.
np.save("y_testMLPs", y_score)