# Data Mining Challange: *Reddit Gender Text-Classification*

## Modules

In [1]:
# Numpy & matplotlib for notebooks 
%pylab inline

# Pandas 
import pandas as pd # Data analysis and manipulation 

# Sklearn 
from sklearn import utils
from sklearn.preprocessing import StandardScaler # to standardize features by removing the mean and scaling to unit variance (z=(x-u)/s)
from sklearn.neural_network import MLPClassifier # Multi-layer Perceptron classifier which optimizes the log-loss function using LBFGS or sdg.
from sklearn.model_selection import train_test_split # to split arrays or matrices into random train and test subsets
from sklearn.model_selection import KFold # K-Folds cross-validator providing train/test indices to split data in train/test sets.
from sklearn.decomposition import PCA, TruncatedSVD # Principal component analysis (PCA); dimensionality reduction using truncated SVD.
from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import MultinomialNB # Naive Bayes classifier for multinomial models
from sklearn.feature_extraction.text import CountVectorizer # Convert a collection of text documents to a matrix of token counts
from sklearn.metrics import roc_auc_score as roc # Compute Area Under the Receiver Operating Characteristic Curve from prediction scores
from sklearn.metrics import roc_curve, auc # Compute ROC; Compute Area Under the Curve (AUC) using the trapezoidal rule

# Matplotlib
import matplotlib # Data visualization
import matplotlib.pyplot as plt 
import matplotlib.patches as mpatches  

# Seaborn
import seaborn as sns # Statistical data visualization (based on matplotlib)

# Tqdm 
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

# Gensim 
import gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

# Regular Expressions
import re # String manipulation

# Nltk
import nltk # lemmatization
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk import pos_tag  
from nltk.corpus import wordnet as wn  
from nltk.stem.snowball import SnowballStemmer # stemmer

from bs4 import BeautifulSoup   
from collections import defaultdict

Populating the interactive namespace from numpy and matplotlib


## Data Loading and Manipulation

In [2]:
# Load datasets
train_data = pd.read_csv("../input/dataset/train_data.csv")
target = pd.read_csv("../input/dataset/train_target.csv")
test_data = pd.read_csv("../input/dataset/test_data.csv")

# Create author's gender dictionary
author_gender = {}
for i in range(len(target)):
    author_gender[target.author[i]] = target.gender[i]
    
# X is the list of aggregated comments   
X = []

# y is the list of genders
y = []

# Populate the dictionary with keys ("authors") and values ("gender")
for author, group in train_data.groupby("author"):
    X.append(group.body.str.cat(sep = " "))
    y.append(author_gender[author])

# Same thing with test dataset
X_test = []
authors_test = []
for author, group in test_data.groupby("author"):
    X_test.append(group.body.str.cat(sep = " "))
    authors_test.append(author)

## Preprocessing, Optimize Input for doc2vec Training

In [4]:
# Create pre-processing functions
def remove_number(text):
    num = re.compile(r'[-+]?[.\d]*[\d]+[:,.\d]*')
    return num.sub(r'NUMBER', text)

def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'URL',text)

def remove_repeat_punct(text):
    rep = re.compile(r'([!?.]){2,}')
    return rep.sub(r'\1 REPEAT', text)

def remove_elongated_words(text):
    rep = re.compile(r'\b(\S*?)([a-z])\2{2,}\b')
    return rep.sub(r'\1\2 ELONG', text)

def remove_allcaps(text):
    caps = re.compile(r'([^a-z0-9()<>\'`\-]){2,}')
    return caps.sub(r'ALLCAPS', text)

def transcription_smile(text):
    eyes = "[8:=;]"
    nose = "['`\-]"
    smiley = re.compile(r'[8:=;][\'\-]?[)dDp]')
    #smiley = re.compile(r'#{eyes}#{nose}[)d]+|[)d]+#{nose}#{eyes}/i')
    return smiley.sub(r'SMILE', text)

def transcription_sad(text):
    eyes = "[8:=;]"
    nose = "['`\-]"
    smiley = re.compile(r'[8:=;][\'\-]?[(\\/]')
    return smiley.sub(r'SADFACE', text)

def transcription_heart(text):
    heart = re.compile(r'<3')
    return heart.sub(r'HEART', text)

# Tags Part of Speech (POS), because the lemmatizer requires it
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

# Create lemmatizer
word_Lemmatized = WordNetLemmatizer()

def review_to_words1(raw_body):
    # remove html tags
    body_text = BeautifulSoup(raw_body).get_text() 
    #letters_only = re.sub("[^a-zA-Z]", " ", body_text) 
    # lowercase all text
    words = body_text.lower()
    # remove urls
    text = remove_URL(words)
    # remove numbers
    text = remove_number(text)
    # remove smiles
    text = transcription_sad(text)
    text = transcription_smile(text)
    text = transcription_heart(text)
    text = remove_elongated_words(text)
    words = remove_repeat_punct(text)
    # tokenizes and pass to lemmatizer, which lemmatizes taking tags into account (see before)
    words = word_tokenize(words)
    # we don't remove stop words, because doing it on combination with removing the 40 (trial & error estimated parameter) most utilized words (see below) decreases performance
    #stops = set(stopwords.words("english"))                  
    #meaningful_words = [w for w in words if not w in stops]
    Final_words = []
    for word, tag in pos_tag(words):
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    #if len(Final_words)<11: return -1
    # returns lemmatized texts as strings 
    return( " ".join(Final_words))   

In [5]:
clean_train_comments = [review_to_words1(x) for x in X]
clean_comments_test = [review_to_words1(x) for x in X_test]

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [6]:
# This function formats the input dor doc2vec
def label_sentences(corpus, label_type):
    labeled = []
    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        labeled.append(TaggedDocument(v.split(), [label]))
    return labeled

In [7]:
y = np.array(y)

In [8]:
# Create doc2vec input
X_train = label_sentences(clean_train_comments, 'Train')
X_test = label_sentences(clean_comments_test, 'Test')
all_data = X_train + X_test

## `doc2vec`: Model Definition and Training

In [9]:
# Define the model
# window: qhow many neighboring words should the moel look at
# negative :som words are negatively weighted
# min_count: once-appearing words are discarded
model_dbow = Doc2Vec(dm=1, vector_size=400, window=7, negative=5, min_count=1, alpha=0.065)
# creates the vocabulary. tdqm is the progress bar
model_dbow.build_vocab([x for x in tqdm(all_data)])
# trianing. The sub doc2vec is trained on training and test set
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 20000/20000 [00:00<00:00, 1082750.31it/s]
100%|██████████| 20000/20000 [00:00<00:00, 1277524.33it/s]
100%|██████████| 20000/20000 [00:00<00:00, 1186205.49it/s]
100%|██████████| 20000/20000 [00:00<00:00, 1188440.60it/s]
100%|██████████| 20000/20000 [00:00<00:00, 1336574.36it/s]
100%|██████████| 20000/20000 [00:00<00:00, 1210826.79it/s]
100%|██████████| 20000/20000 [00:00<00:00, 1377055.34it/s]
100%|██████████| 20000/20000 [00:00<00:00, 1275465.34it/s]
100%|██████████| 20000/20000 [00:00<00:00, 1296679.39it/s]
100%|██████████| 20000/20000 [00:00<00:00, 1333981.30it/s]
100%|██████████| 20000/20000 [00:00<00:00, 1310126.35it/s]
100%|██████████| 20000/20000 [00:00<00:00, 1289246.00it/s]
100%|██████████| 20000/20000 [00:00<00:00, 1360770.85it/s]
100%|██████████| 20000/20000 [00:00<00:00, 1350605.06it/s]
100%|██████████| 20000/20000 [00:00<00:00, 1229082.06it/s]
100%|██████████| 20000/20000 [00:00<00:00, 1330996.91it/s]
100%|██████████| 20000/20000 [00:00<00:00, 1280351.66it/

In [10]:
# Returns vectorized aggragated texts
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """
    Get vectors from trained doc2vec model
    :param doc2vec_model: Trained Doc2Vec model
    :param corpus_size: Size of the data
    :param vectors_size: Size of the embedding vectors
    :param vectors_type: Training or Testing vectors
    :return: list of vectors
    """
    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(0, corpus_size):
        prefix = vectors_type + '_' + str(i)
        vectors[i] = model.docvecs[prefix]
    return vectors
    
train_vectors_dbow = get_vectors(model_dbow, len(X_train), 400, 'Train')
test_vectors_dbow = get_vectors(model_dbow, len(X_test), 400, 'Test')

## Classifier: Model Definition and Validation

In [11]:
# Define MLP Classifier:
## Activation function for the hidden layer: "rectified linear unit function"
## Solver for weight optimization: "stochastic gradient-based optimizer"
## Alpha: regularization parameter
## Learning rate schedule for weight updates: "gradually decreases the learning rate at each time step t using an inverse scaling exponent of power_t"
## Verbose: "True" in order to print progress messages to stdout.
## Early stopping: "True" in order to use early stopping to terminate training when validation score is not improving. It automatically sets aside 10% of training data as validation and terminate training when validation score is not improving by at least tol for n_iter_no_change consecutive epochs.

mlpClf = MLPClassifier(solver = 'adam', activation= 'relu' ,alpha = 0.0005, verbose = True, early_stopping = True,
                         learning_rate = 'invscaling', max_iter = 400, random_state = 0)


# Final fit
mlpClf.fit(train_vectors_dbow, y)

Iteration 1, loss = 0.51712106
Validation score: 0.824000
Iteration 2, loss = 0.33308173
Validation score: 0.856000
Iteration 3, loss = 0.29067624
Validation score: 0.856000
Iteration 4, loss = 0.26098065
Validation score: 0.854000
Iteration 5, loss = 0.23977749
Validation score: 0.846000
Iteration 6, loss = 0.22093390
Validation score: 0.842000
Iteration 7, loss = 0.20381316
Validation score: 0.838000
Iteration 8, loss = 0.18785099
Validation score: 0.842000
Iteration 9, loss = 0.17403692
Validation score: 0.844000
Iteration 10, loss = 0.16122524
Validation score: 0.838000
Iteration 11, loss = 0.14877849
Validation score: 0.844000
Iteration 12, loss = 0.13735856
Validation score: 0.848000
Iteration 13, loss = 0.12766286
Validation score: 0.838000
Validation score did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.


MLPClassifier(activation='relu', alpha=0.0005, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=True, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='invscaling',
              learning_rate_init=0.001, max_fun=15000, max_iter=400,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=0, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=True,
              warm_start=False)

In [12]:
# predict test and save output
y_score = mlpClf.predict_proba(test_vectors_dbow)[:,1]
np.save("../working/y_testD2V",y_score)

## Appendix

We laso tried an XGB regressor, but these predictions will perform worse when submitting.

In [14]:
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

my_model1 = XGBRegressor(objective = "reg:logistic",n_estimators=2100, learning_rate=0.01, n_jobs=4,subsample = 0.9,
                       min_child_weight = 1,max_depth=4,gamma=1.5,colsample_bytree=0.6,random_state=0)
my_model1.fit(train_vectors_dbow, y)
y_scoreX = my_model1.predict(test_vectors_dbow)
np.save("../working/y_testD2VX",y_scoreX)