In [1]:
## more common imports
import pandas as pd
import numpy as np
from collections import Counter
import re

# language processing imports
import nltk
from gensim.corpora import Dictionary
# preprocessing imports
from sklearn.preprocessing import LabelEncoder

# model imports
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.word2vec import Word2Vec
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
# hyperparameter training imports
from sklearn.model_selection import GridSearchCV

# visualization imports
from IPython.display import display
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import base64
import io
%matplotlib inline
sns.set()  # defines the style

In [2]:
import math 
import operator
from tqdm import tqdm_notebook as tqdm 

### Definition of the functions used to preprocess comments

In [3]:
# Find and replace words that contain non-ASCII characters.
# The placeholder that replaces them is stored in a variable for later use.

our_special_word = 'qwerty'

def remove_ascii_words(df):
    """Replace every word of df['body'] that contains a non-ASCII character.

    Despite its name, the function targets *non*-ASCII words: each
    space-separated word holding at least one character with an
    ordinal >= 128 is replaced, as a whole word, by ``our_special_word``.
    The 'body' column of ``df`` is updated on the frame itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'body' column. Any index is accepted (rows are visited
        by position, not by integer label). Entries that are not strings are
        left untouched.

    Returns
    -------
    list of str
        The replaced words, one entry per occurrence, in row order.
    """
    non_ascii_words = []
    cleaned_bodies = []
    for text in df['body']:
        if not isinstance(text, str):
            # e.g. NaN for a missing comment: nothing to split, keep as is
            cleaned_bodies.append(text)
            continue
        tokens = text.split(' ')
        for position, word in enumerate(tokens):
            if any(ord(character) >= 128 for character in word):
                non_ascii_words.append(word)
                # Replace this token only; a substring replace on the whole
                # text would also corrupt other words sharing the characters.
                tokens[position] = our_special_word
        cleaned_bodies.append(' '.join(tokens))
    df['body'] = cleaned_bodies
    return non_ascii_words


In [4]:
def get_good_tokens(sentence):
    """Strip unwanted characters from the tokens of one sentence.

    Every character other than digits, ASCII letters, '!' and '?' is removed
    from each token; tokens that end up empty are discarded.

    sentence: iterable of string tokens.
    Returns a list with the cleaned, non-empty tokens in their original order.
    """
    cleaned_tokens = []
    for token in sentence:
        stripped = re.sub('[^0-9A-Za-z!?]+', '', token)
        if stripped:
            cleaned_tokens.append(stripped)
    return cleaned_tokens

In [5]:
# Here we transform the documents into sentences for the word2vec model.
# We wrap this in a function so that the same code can be reused later when we prepare the submission.
def w2v_preprocessing(df):
    """Run every word2vec preprocessing step on ``df``.

    The frame is modified directly and nothing is returned:
      * 'body' is lowercased;
      * 'document_sentences' holds each body split on '.';
      * 'tokenized_sentences' holds, per document, a list of sentences, each
        a list of cleaned tokens (sentences left without tokens are dropped).
    """
    df['body'] = df['body'].str.lower()
    df['document_sentences'] = df['body'].str.split('.')  # one list of raw sentences per document

    tokenized_documents = []
    for document in df['document_sentences']:
        kept_sentences = []
        for raw_sentence in document:
            # tokenize, then strip unwanted characters from every token
            good_tokens = get_good_tokens(nltk.word_tokenize(raw_sentence))
            if good_tokens:  # skip sentences with no usable token
                kept_sentences.append(good_tokens)
        tokenized_documents.append(kept_sentences)
    df['tokenized_sentences'] = tokenized_documents



### Load and preprocess train_data and test_data

In [13]:
# Load the gzip-compressed training CSV
train_data = pd.read_csv("train_data.csv.gz", compression="gzip", encoding="utf8")

In [14]:
# Replace the non-ASCII words of train_data['body'] and keep the list of replaced words
non_ascii_words = remove_ascii_words(train_data)

In [15]:
# Lowercase the bodies and add the 'document_sentences' and 'tokenized_sentences' columns to train_data
w2v_preprocessing(train_data)

In [59]:
# Flatten the per-comment sentence groups into one list of tokenized sentences
sentences = [sentence
             for sentence_group in train_data.tokenized_sentences
             for sentence in sentence_group]

print("Number of sentences: {}.".format(len(sentences)))
print("Number of texts: {}.".format(len(train_data)))

Number of sentences: 773082.
Number of texts: 296042.


In [6]:
# Load the gzip-compressed test CSV
test_data = pd.read_csv("test_data.csv.gz", compression="gzip", encoding="utf8")

(1107946, 4)


Unnamed: 0,author,subreddit,created_utc,body
0,ejchristian86,TwoXChromosomes,1388534000.0,I hadn't ever heard of them before joining thi...
1,ZenDragon,gaming,1388534000.0,"At 7680 by 4320 with 64x AA, right?"
2,savoytruffle,AskReddit,1388534000.0,bite me


In [60]:
# Flatten the per-comment sentence groups of the test set into one list of tokenized sentences.
# NOTE: this needs test_data.tokenized_sentences, which is created by the
# w2v_preprocessing(test_data) cell located further down in this notebook.
sentences2 = [sentence
              for sentence_group in test_data.tokenized_sentences
              for sentence in sentence_group]

print("Number of sentences: {}.".format(len(sentences2)))
print("Number of texts: {}.".format(len(test_data)))

Number of sentences: 2710101.
Number of texts: 1107056.


In [61]:
# Using both train and test data to train w2v
# NOTE: this rebinds `sentences`, so re-running the cell appends `sentences2` once more
sentences = sentences + sentences2

In [62]:
%%time
# Set values for various parameters
num_features = 200    # Word vector dimensionality
min_word_count = 3    # Minimum word count
num_workers = 8       # Number of threads to run in parallel
context = 6           # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model.
# Word2Vec is a shallow, two-layer neural network trained to reconstruct the linguistic contexts of words.
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names (renamed in gensim 4) - confirm the installed version.
W2Vmodel = Word2Vec(sentences=sentences,
                    sg=1,        # train with the skip-gram algorithm
                    hs=0,        # no hierarchical softmax: negative sampling is used instead (see `negative`)
                    workers=num_workers,
                    size=num_features,
                    min_count=min_word_count,  # ignore words with a frequency lower than min_word_count
                    window=context,  # maximum distance of the words considered as context
                    sample=downsampling,
                    negative=5,  # number of negative samples
                    iter=6)      # number of passes over the corpus

Wall time: 5min 3s


In [41]:
def get_w2v_features(w2v_model, sentence_group):
    """Average the word vectors of a sentence group into one feature vector.

    Parameters
    ----------
    w2v_model : trained word2vec model
        Must expose ``vector_size`` and ``wv`` (the keyed vectors, supporting
        ``word in wv`` and ``wv[word]``).
    sentence_group : iterable of lists of str
        The tokenized sentences of one text. May be empty.

    Returns
    -------
    numpy.ndarray
        Float32 vector of length ``w2v_model.vector_size`` holding the mean of
        the vectors of all words known to the model. It is all zeros when the
        group is empty or none of its words is in the vocabulary.
    """
    # Query the keyed vectors directly: membership is a dictionary lookup, so
    # there is no need to rebuild a set of the whole vocabulary on every call,
    # and indexing `wv` avoids the deprecated `model[word]` access.
    word_vectors = w2v_model.wv

    featureVec = np.zeros(w2v_model.vector_size, dtype="float32")

    # Number of in-vocabulary words found in the text
    nwords = 0
    # Walk the sentences one by one (no concatenation needed, so an empty
    # group is handled naturally) and sum the vectors of the known words.
    for sentence in sentence_group:
        for word in sentence:
            if word in word_vectors:
                featureVec = np.add(featureVec, word_vectors[word])
                nwords += 1

    # Divide the sum by the number of words to get the average
    if nwords > 0:
        featureVec = np.divide(featureVec, nwords)
    return featureVec


In [69]:
# Cast every entry of 'body' to str (remove_ascii_words calls .split on each entry)
train_data.body = train_data['body'].apply(str)

In [70]:
# Replace the non-ASCII words of the training comments and report how many were replaced
non_ascii_words = remove_ascii_words(train_data)

# This cell processes train_data, so the message names the train data
print("Replaced {} words with characters with an ordinal >= 128 in the train data.".format(
    len(non_ascii_words)))

Replaced 1396 words with characters with an ordinal >= 128 in the test data.


In [71]:
# Lowercase, sentence-split and tokenize the training comments
w2v_preprocessing(train_data)

In [72]:
# Drop, in place, the rows whose 'tokenized_sentences' list is empty
train_data.drop(train_data[train_data.tokenized_sentences.str.len() == 0].index, inplace= True) 


In [73]:
# One averaged word2vec vector per training comment
train_data['w2v_features'] = [get_w2v_features(W2Vmodel, sen_group)
                              for sen_group in train_data.tokenized_sentences]

  app.launch_new_instance()


In [74]:
# Drop the metadata columns and the intermediate sentence columns before saving
trainw2v = train_data.drop(columns = ['subreddit','created_utc','document_sentences', 'tokenized_sentences'])

In [75]:
# NOTE: the file is written in pickle format despite its '.csv' extension
trainw2v.to_pickle('TrainW2V.csv')  # vectorized train_data

#### Vectorize the test set as well

In [7]:
# Cast every entry of 'body' to str (remove_ascii_words calls .split on each entry)
test_data.body = test_data['body'].apply(str)

In [8]:
# Replace the non-ASCII words of the test comments and report how many were replaced
non_ascii_words = remove_ascii_words(test_data)

print("Replaced {} words with characters with an ordinal >= 128 in the test data.".format(
    len(non_ascii_words)))

Replaced 33975 words with characters with an ordinal >= 128 in the test data.


In [9]:
# Lowercase, sentence-split and tokenize the test comments
w2v_preprocessing(test_data)

In [77]:
# Drop, in place, the rows whose 'tokenized_sentences' list is empty
test_data.drop(test_data[test_data.tokenized_sentences.str.len() == 0].index, inplace= True) 

In [78]:
# One averaged word2vec vector per test comment
test_data['w2v_features'] = [get_w2v_features(W2Vmodel, sen_group)
                             for sen_group in test_data.tokenized_sentences]

  app.launch_new_instance()


In [79]:
# Drop the metadata columns and the intermediate sentence columns before saving
testw2v = test_data.drop(columns = ['subreddit','created_utc','document_sentences', 'tokenized_sentences'])

In [80]:
# NOTE: the file is written in pickle format despite its '.csv' extension
testw2v.to_pickle('TestW2V.csv')