In [94]:
from __future__ import absolute_import, division, print_function, unicode_literals

%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%load_ext autoreload
%autoreload 2
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm

from helpers import count_unique_words, count_unique_ngrams, \
            build_unique_ngrams, create_sentence_vectors, create_sentence_vectors_submission

import sys

import tensorflow as tf
from tensorflow import keras

import gensim   # Not sure whether it is better to use gensim or tensorflow :/
import logging
from gensim.models.phrases import Phrases, Phraser

import multiprocessing

from gensim.models import Word2Vec
 
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
 

sys.path.append('../')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
from clean_helpers import *

take_full = True
test_locally = True
create_new_text_files = True
ngrams = 1

# Specify here what cleaning functions you want to use
cleaning_options = ['clean_new_line', 'remove_stopwords', 'clean_tags',
                    'clean_punctuation', 'remove_numbers', 'lemmatize', 'remove_saxon_genitive',
                    ]


clean = {
    "clean_new_line": clean_new_line,
    "lowercase": lowercase,
    "lemmatize": lemmatize,
    "remove_stopwords": remove_stopwords,
    "translate": perform_translation,
    "clean_punctuation": clean_punctuation,
    "clean_tags" : clean_tags,
    "remove_numbers": remove_numbers,
    "remove_saxon_genitive": remove_saxon_genitive,
    "gensim_simple": gensim_clean   # not a good idea to use it I think! It cleans everything which is not alphabetic (special char, numbers and so on)
}


# algorithm_used = ""
# algorithm = {
#     "naive_bayes": ,
#     "logistic_regression": ,
#     "svm": ,
#     "lstm":,
#     "fasttext":,
#     "cnn": ,
# }

# options = []
# additional_options = {
#     "count_frequency": ,
#     "count_ngrams": ,
    
# }

In [3]:
input_file_pos = 'Data/train_pos.txt'
if take_full:
    input_file_pos = 'Data/train_pos_full.txt'
  
input_file_neg = 'Data/train_neg.txt'
if take_full:
    input_file_neg = 'Data/train_neg_full.txt'
    
list_of_pos_sentences = []
with open(input_file_pos, 'r') as f:
    for line in f:
        list_of_pos_sentences.append(line)
 
list_of_neg_sentences = []
with open(input_file_neg, 'r') as f:
    for line in f:
        list_of_neg_sentences.append(line)

In [4]:
from data_handling import build_sentences

df = build_sentences(list_of_pos_sentences, list_of_neg_sentences)

print("unique words = {}".format(count_unique_words(df)))

unique words = 592563


In [5]:
# Perform all the cleaning options selected

for clean_option in cleaning_options:
    counter_of_occurrences = 0
    %time df = clean[clean_option](df)
    print(clean_option)
    print(df.head())
    print("unique words = {}".format(count_unique_words(df)))
    print("################################\n\n")
    

df.head()

CPU times: user 4.16 s, sys: 107 ms, total: 4.26 s
Wall time: 4.28 s
clean_new_line
                                            sentence  label
0  <user> i dunno justin read my mention or not ....      1
1  because your logic is so dumb , i won't even c...      1
2  " <user> just put casper in a box ! " looved t...      1
3  <user> <user> thanks sir > > don't trip lil ma...      1
4  visiting my brother tmr is the bestest birthda...      1
unique words = 592563
################################


The number of scipy stopwords is 179
CPU times: user 9.86 s, sys: 258 ms, total: 10.1 s
Wall time: 10.3 s
remove_stopwords
                                            sentence  label
0  <user> dunno justin read mention . justin god ...      1
1    logic dumb , even crop name photo . tsk . <url>      1
2  " <user> put casper box ! " looved battle ! #c...      1
3  <user> <user> thanks sir > > trip lil mama ......      1
4  visiting brother tmr bestest birthday gift eve...      1
unique words = 5

Unnamed: 0,sentence,label
0,dunno justin read mention justin god know hope...,1
1,logic dumb even crop name photo tsk,1
2,put casper box ! looved battle ! #crakkbitch,1
3,thanks sir trip lil mama ... keep doin ya thang !,1
4,visiting brother tmr bestest birthday gift eve...,1


In [None]:
count_unique_words(df)

In [None]:
count_unique_ngrams(df, ngrams)

In [303]:
ngrams_list = []
for n in range(1, ngrams+1):
    ngrams_list.extend(build_unique_ngrams(df, n))

In [304]:
len(ngrams_list)

832786

In [305]:
counter_ngrams = [0 for i in range(0, ngrams+1)]
for el in ngrams_list:
    for i in range(1, ngrams+1):
        if len(el.split()) == i:
            counter_ngrams[i] += 1
counter_ngrams

[0, 100493, 732293]

In [19]:
if test_locally:
    train_test_split = 0.8
    permut = np.random.permutation(df.shape[0])
    train_x = df.iloc[permut[: int(df.shape[0]*train_test_split)]]['sentence']
    train_y = df.iloc[permut[: int(df.shape[0]*train_test_split)]]['label']
    test_x = df.iloc[permut[int(df.shape[0]*train_test_split): ]]['sentence']
    test_y = df.iloc[permut[int(df.shape[0]*train_test_split): ]]['label']
    
    train_y = train_y.where(train_y == 1, 0) 
    test_y = test_y.where(test_y == 1, 0)

In [20]:
print(train_x[:10])

1204694    dancing stage tyga rap rack city ! word lastni...
1763863                                  well i'm back sleep
1573572    god know grieving thing help paperback story p...
56768                                    sooo love pic ! ! !
2427677                   sitting waiting email ticketek ...
993725     smoke session room b .. girl room friday start...
787029     good morning asking god show favor morning ble...
219660     thank letting u know daily matter count ! cele...
69424                                 rt i'll try sometime !
1653757                  aye good point gcse like month away
Name: sentence, dtype: object


In [8]:
sentences = [row.split() for row in train_x]
len(sentences)

2000000

In [9]:
from collections import defaultdict
word_freq = defaultdict(int)
for s in sentences:
    for i in s:
        word_freq[i] += 1
print(len(word_freq))  # As we can see, the words are less than the original ones.
print(count_unique_words(df.iloc[permut[: int(df.shape[0]*train_test_split)]]))

# The discrepancy among the two might be due to the nan? Actually shouldn't make much difference

462834
462835


In [10]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['!', '...', '?', "i'm", 'rt', 'love', 'u', 'like', '..', 'get']

In [22]:
word_vector_size = 300

In [10]:
logging.root.level = logging.ERROR   # Should reduce logging


w2v_model = Word2Vec(min_count=1,
                     window=10,
                     size=word_vector_size,
                     negative=5,
                     workers=4,
                     sg=1)    ## Careful here: it should work better with sg=1 for big data

In [11]:
w2v_model.build_vocab(sentences, progress_per=100000)

In [12]:
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=8)

(119500488, 133938192)

In [13]:
w2v_model.save("models/word2vec/300_8_epochs_sg_neg_5_win_10.model")

In [9]:
w2v_model = Word2Vec.load("models/word2vec/300_8_epochs_sg_neg_5_win_10.model")

2019-12-12 13:45:26,765 : INFO : loading Word2Vec object from models/word2vec/300_8_epochs_sg_neg_5_win_10.model
2019-12-12 13:45:27,351 : INFO : loading wv recursively from models/word2vec/300_8_epochs_sg_neg_5_win_10.model.wv.* with mmap=None
2019-12-12 13:45:27,351 : INFO : loading vectors from models/word2vec/300_8_epochs_sg_neg_5_win_10.model.wv.vectors.npy with mmap=None
2019-12-12 13:45:28,643 : INFO : setting ignored attribute vectors_norm to None
2019-12-12 13:45:28,644 : INFO : loading vocabulary recursively from models/word2vec/300_8_epochs_sg_neg_5_win_10.model.vocabulary.* with mmap=None
2019-12-12 13:45:28,649 : INFO : loading trainables recursively from models/word2vec/300_8_epochs_sg_neg_5_win_10.model.trainables.* with mmap=None
2019-12-12 13:45:28,653 : INFO : loading syn1neg from models/word2vec/300_8_epochs_sg_neg_5_win_10.model.trainables.syn1neg.npy with mmap=None
2019-12-12 13:45:29,798 : INFO : setting ignored attribute cum_table to None
2019-12-12 13:45:29,798 

In [284]:
len(w2v_model.wv.vocab)

87485

In [82]:
w2v_model.wv.most_similar("italy")

[("#i'mabelieber", 0.6168533563613892),
 ('france', 0.5785192251205444),
 ('1dx', 0.5747607350349426),
 ('ciaooo', 0.5639715790748596),
 ('ahhaahah', 0.5627542734146118),
 ('bulgaria', 0.5615280866622925),
 ('1dxxx', 0.5610278248786926),
 ('portugal', 0.5598273873329163),
 ('estonia', 0.5575746297836304),
 ('lithuania', 0.5546209812164307)]

In [24]:
%%time
# Here we create the sentences, by averaging the word vectors in each sentence.
sentence_train_x, sentence_train_y = create_sentence_vectors(train_x[:500000], train_y[:500000], word_vector_size, w2v_model)
sentence_test_x, sentence_test_y = create_sentence_vectors(test_x[:500000], test_y[:500000], word_vector_size, w2v_model)

CPU times: user 42.4 s, sys: 4.76 s, total: 47.2 s
Wall time: 47.5 s


In [35]:
%reset_selective sentences

Once deleted, variables cannot be recovered. Proceed (y/[n])?   y


In [37]:
print(sentence_train_x.shape)
print(sentence_train_y.shape)
print(sentence_test_x.shape)
print(sentence_test_y.shape)
# print(sentence_train_x[:2])
print(sentence_train_y[:10])

(499117, 300)
(499117, 2)
(499157, 300)
(499157, 2)
[[0. 1.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]]


In [38]:
# now perform training on the new features vectors.

# Build a "deep" neural network with 2 hidden layers. When we see that it somehow works,
# we can start doing some cross validation on it.

for layer_size in range(15, 31, 5):
    
    print("\n\nStarting with number of layers: {}".format(layer_size))
    model = keras.Sequential([
        keras.layers.InputLayer(input_shape=(sentence_train_x.shape[1],)),   # the input shape is the number of words in the bow dictionary
        keras.layers.Dense(layer_size, activation='relu'),
        keras.layers.Dense(layer_size, activation='relu'),
        keras.layers.Dense(2, activation='softmax')   # Only 0 and 1
    ])
    model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

    overfitting_occurrences = 0
    best_accuracy = 0
    for iteration in range(50):
        
        model.fit(x=sentence_train_x,
                  y=sentence_train_y,
                  validation_data=(sentence_test_x,  sentence_test_y),
                  epochs=1, use_multiprocessing=True)
        # evaluate the test error
        acc = model.evaluate(sentence_test_x,  sentence_test_y, verbose=2)[1]
        if acc < best_accuracy:
            overfitting_occurrences += 1
            if overfitting_occurrences > 2:
                print("###############################")
                print("###############################")
                print("Overfitting, best accuracy with {}".format(best_accuracy))
                break
        else:
            overfitting_occurrences = 0
            best_accuracy = acc

            # save the model otherwise
            model.save("models/model_sentence_rep_small_{}.model".format(layer_size))




Starting with number of layers: 15
 - 10s - loss: 0.4289 - acc: 0.7932
 - 9s - loss: 0.4231 - acc: 0.7977
 - 10s - loss: 0.4192 - acc: 0.7998
 - 9s - loss: 0.4156 - acc: 0.8015
 - 9s - loss: 0.4125 - acc: 0.8031
 - 9s - loss: 0.4127 - acc: 0.8032
 - 9s - loss: 0.4198 - acc: 0.7990
 - 10s - loss: 0.4120 - acc: 0.8039
 - 9s - loss: 0.4117 - acc: 0.8043
 - 9s - loss: 0.4125 - acc: 0.8030
 - 9s - loss: 0.4124 - acc: 0.8030
 - 9s - loss: 0.4120 - acc: 0.8039
###############################
###############################
Overfitting, best accuracy with 0.8042579889297485


Starting with number of layers: 20
 - 10s - loss: 0.4251 - acc: 0.7963
 - 10s - loss: 0.4186 - acc: 0.8000
 - 10s - loss: 0.4138 - acc: 0.8026
 - 10s - loss: 0.4159 - acc: 0.8012
 - 10s - loss: 0.4173 - acc: 0.8007
 - 10s - loss: 0.4112 - acc: 0.8036
 - 10s - loss: 0.4112 - acc: 0.8047
 - 10s - loss: 0.4126 - acc: 0.8041
 - 10s - loss: 0.4118 - acc: 0.8038
 - 10s - loss: 0.4088 - acc: 0.8059
 - 10s - loss: 0.4092 - acc:

In [55]:
model_star = keras.Sequential([
        keras.layers.InputLayer(input_shape=(sentence_train_x.shape[1],)),   # the input shape is the number of words in the bow dictionary
        keras.layers.Dense(layer_size, activation='relu'),
        keras.layers.Dense(layer_size, activation='relu'),
        keras.layers.Dense(2, activation='softmax')   # Only 0 and 1
    ])

model_star.load_weights('models/model_sentence_rep_small_30.model')

model_star.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])


In [56]:
print(model_star)

<tensorflow.python.keras.engine.sequential.Sequential object at 0x7ffa408c1610>


In [89]:
model_star.evaluate(sentence_test_x,  sentence_test_y, verbose=2)[1]
model_star.predict(sentence_test_x)

 - 11s - loss: 0.4045 - acc: 0.8079


array([[4.0681679e-02, 9.5931840e-01],
       [9.9999571e-01, 4.2639813e-06],
       [7.2958276e-02, 9.2704177e-01],
       ...,
       [3.4315959e-01, 6.5684032e-01],
       [6.1612552e-01, 3.8387451e-01],
       [6.4644866e-02, 9.3535513e-01]], dtype=float32)

In [90]:
# We have to get the test dataset and clean it as we have done with the training dataset
df_test = []
with open("Data/test_data.txt", 'r') as f:
    for l in f:
        id_ = l.split(",")[0]
        # it is a csv, but you have to keep other commas (only the first one is relevant)
        sentence = ",".join(l.split(",")[1:])
        df_test.append({
            "label": int(id_),
            "sentence": sentence
        })
df_test = pd.DataFrame(df_test)
df_test.head()

Unnamed: 0,label,sentence
0,1,sea doo pro sea scooter ( sports with the port...
1,2,<user> shucks well i work all week so now i ca...
2,3,i cant stay away from bug thats my baby\n
3,4,<user> no ma'am ! ! ! lol im perfectly fine an...
4,5,"whenever i fall asleep watching the tv , i alw..."


In [91]:
for clean_option in cleaning_options:
        df_test = clean[clean_option](df_test)
        print(clean_option)
        print(df_test.head())
        print("################################\n\n")
    

clean_new_line
                                            sentence  label
0  sea doo pro sea scooter ( sports with the port...      1
1  <user> shucks well i work all week so now i ca...      2
2            i cant stay away from bug thats my baby      3
3  <user> no ma'am ! ! ! lol im perfectly fine an...      4
4  whenever i fall asleep watching the tv , i alw...      5
################################


The number of scipy stopwords is 179
remove_stopwords
                                            sentence  label
0  sea doo pro sea scooter ( sports portable sea-...      1
1  <user> shucks well work week can't come cheer ...      2
2                      cant stay away bug thats baby      3
3  <user> ma'am ! ! ! lol im perfectly fine conta...      4
4  whenever fall asleep watching tv , always wake...      5
################################


clean_tags
                                            sentence  label
0  sea doo pro sea scooter ( sports portable sea-...      1
1  shucks 

In [97]:
sentence_submission_x = create_sentence_vectors_submission(df_test['sentence'],
                                                           word_vector_size,
                                                           w2v_model)

the number of zero sentences (the sentences which have 0 words in our vocabulary) is 22


In [98]:
model_star.predict(sentence_submission_x)

array([[0.96396494, 0.03603501],
       [0.6632302 , 0.33676985],
       [0.7554848 , 0.24451518],
       ...,
       [0.9986198 , 0.00138015],
       [0.05644046, 0.9435596 ],
       [0.9798126 , 0.02018739]], dtype=float32)

In [101]:
predictions = []
for el in model_star.predict(sentence_submission_x):
    predictions.append(-1 if el[0] > el[1] else 1)

print(predictions[:10])

results = pd.DataFrame({
    "Id": df_test['label'],
    "Prediction": predictions
})

results.head(20)

[-1, -1, -1, 1, -1, -1, -1, 1, 1, 1]


Unnamed: 0,Id,Prediction
0,1,-1
1,2,-1
2,3,-1
3,4,1
4,5,-1
5,6,-1
6,7,-1
7,8,1
8,9,1
9,10,1


In [102]:
results.to_csv('Submission.csv', index=False)