In [63]:
from __future__ import absolute_import, division, print_function, unicode_literals

%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%load_ext autoreload
%autoreload 2
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm

from helpers import count_unique_words, count_unique_ngrams

import sys

import tensorflow as tf
from tensorflow import keras

sys.path.append('../')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [55]:
from clean_helpers import *

# Notebook-wide switches.
take_full = False              # True -> load the *_full training files instead of the samples
test_locally = True            # guards the vectorising / splitting / cross-validation cells
create_new_text_files = True   # NOTE(review): not read in the cells visible here - confirm usage

# Cleaning steps to run, applied in exactly this order.
# Each entry has to be one of the keys of `clean` below.
cleaning_options = [
    'clean_new_line',
    'remove_stopwords',
    'clean_tags',
    'clean_punctuation',
    'remove_numbers',
    'lemmatize',
    'remove_saxon_genitive',
]

# Option name -> cleaning function (all of them come from clean_helpers).
clean = {
    "clean_new_line":        clean_new_line,
    "lowercase":             lowercase,
    "lemmatize":             lemmatize,
    "remove_stopwords":      remove_stopwords,
    "translate":             perform_translation,
    "clean_punctuation":     clean_punctuation,
    "clean_tags":            clean_tags,
    "remove_numbers":        remove_numbers,
    "remove_saxon_genitive": remove_saxon_genitive,
}


# algorithm_used = ""
# algorithm = {
#     "naive_bayes": ,
#     "logistic_regression": ,
#     "svm": ,
#     "lstm":,
#     "fasttext":,
#     "cnn": ,
# }

# options = []
# additional_options = {
#     "count_frequency": ,
#     "count_ngrams": ,
    
# }

In [56]:
def _read_lines(path):
    """Return every line of the text file at `path` as a list (line endings kept)."""
    # Explicit encoding: without it the decoding depends on the machine's locale.
    with open(path, 'r', encoding='utf-8') as f:
        return f.readlines()


# `take_full` selects the complete training files instead of the smaller samples.
if take_full:
    input_file_pos = 'Data/train_pos_full.txt'
    input_file_neg = 'Data/train_neg_full.txt'
else:
    input_file_pos = 'Data/train_pos.txt'
    input_file_neg = 'Data/train_neg.txt'

# One list entry per line of the corresponding file.
list_of_pos_sentences = _read_lines(input_file_pos)
list_of_neg_sentences = _read_lines(input_file_neg)

In [57]:
from data_handling import build_sentences

# Build the working DataFrame from the two sentence lists; later cells read its
# 'sentence' and 'label' columns.
# NOTE(review): presumably positives are labelled 1 and negatives -1 - confirm in data_handling.
df = build_sentences(list_of_pos_sentences, list_of_neg_sentences)

# Vocabulary size before any cleaning step has run.
print("unique words = {}".format(count_unique_words(df)))

unique words = 125642


In [58]:
# Apply every selected cleaning step, one after another, in the order of cleaning_options.
# NOTE: `df` is overwritten on each pass, so re-running this cell cleans data that has
# already been cleaned; re-run the cell that builds `df` first to start from the raw text.

for clean_option in cleaning_options:
    # NOTE(review): counter_of_occurrences is not read anywhere in this cell - confirm it is still needed
    counter_of_occurrences = 0
    # %time prints the CPU / wall time taken by this step
    %time df = clean[clean_option](df)
    # Report the step name, a preview of the result and the remaining vocabulary size
    print(clean_option)
    print(df.head())
    print("unique words = {}".format(count_unique_words(df)))
    print("################################\n\n")
    

# Preview of the data after the last cleaning step
df.head()

CPU times: user 252 ms, sys: 7.99 ms, total: 260 ms
Wall time: 260 ms
clean_new_line
                                            sentence  label
0  <user> i dunno justin read my mention or not ....      1
1  because your logic is so dumb , i won't even c...      1
2  " <user> just put casper in a box ! " looved t...      1
3  <user> <user> thanks sir > > don't trip lil ma...      1
4  visiting my brother tmr is the bestest birthda...      1
unique words = 114427
################################


179
CPU times: user 591 ms, sys: 15.9 ms, total: 606 ms
Wall time: 606 ms
remove_stopwords
                                            sentence  label
0  <user> dunno justin read mention . justin god ...      1
1    logic dumb , even crop name photo . tsk . <url>      1
2  " <user> put casper box ! " looved battle ! #c...      1
3  <user> <user> thanks sir > > trip lil mama ......      1
4  visiting brother tmr bestest birthday gift eve...      1
unique words = 114257
#########################

Unnamed: 0,sentence,label
0,dunno justin read mention justin god know hope...,1
1,logic dumb even crop name photo tsk,1
2,put casper box ! looved battle ! #crakkbitch,1
3,thanks sir trip lil mama .. keep doin ya thang !,1
4,visiting brother tmr bestest birthday gift eve...,1


In [9]:
# Token frequency table, restricted to tokens that are not purely alphabetic.
# Works on a copy so that `df` itself is left untouched.
df_copy = df.copy()

# One list of space-separated tokens per sentence; the raw text column is then dropped.
df_copy['word'] = df_copy.sentence.apply(lambda sentence: sentence.split(" "))
df_copy = df_copy.drop(columns="sentence")

# One row per token. reset_index() exposes the originating row label as the 'index'
# column, which is what gets counted (and sorted on) below.
df_exploded = df_copy.explode("word").reset_index()

# Occurrences per token, most frequent first.
word_counts = df_exploded.groupby("word").count()
df_grouped = word_counts.sort_values(by='index', ascending=False).reset_index()

# str.isalpha() is False for punctuation, contractions, emoticons, hashtags, ...
is_non_alpha = df_grouped['word'].apply(lambda word: not word.isalpha())
df_non_alpha = df_grouped[is_non_alpha]

df_non_alpha.head(20)

Unnamed: 0,word,index,label
0,!,83074,83074
1,..,40967,40967
2,?,26418,26418
3,i'm,13656,13656
8,.,9456,9456
13,&,7799,7799
29,can't,4905,4905
33,*,4519,4519
37,<3,4361,4361
51,i'll,3205,3205


In [10]:
# Distinct-word count of the current `df` (this cell sits after the cleaning loop).
count_unique_words(df)

103170

In [11]:
# Distinct n-gram count of the current `df`.
# NOTE(review): the second argument is presumably the n-gram length (3 = trigrams) - confirm in helpers.
count_unique_ngrams(df, 3)

1023297

In [13]:
from helpers import create_labelled_file
# Number of folds; read by the cross-validation cell further down.
k_folds = 5

In [50]:
from sklearn.feature_extraction.text import CountVectorizer

if test_locally:    
    # Create the bag of words
    # The token_pattern is used in order to avoid the preprocessor to remove special characters (like smiles)
    # or hashtags (Twitter!)
    vectorizer = CountVectorizer(token_pattern = '[a-zA-Z0-9$&+,:;=?@#|<>.^*()%!-]+')  # The vectorizer is used to create the bag of words

    %time X = vectorizer.fit_transform(df['sentence'])
    Y = df['label']

In [61]:
# Sanity checks on the bag of words: matrix shape, number of distinct vocabulary entries
# present in the first sentence, the sentence itself, and the column index of one hashtag.
print(X.shape)

first_row = X[0].toarray()[0]               # dense count vector of the first sentence
counter = np.count_nonzero(first_row > 0)   # how many vocabulary entries occur in it
print(counter)

print(df.iloc[0].sentence)
print(vectorizer.vocabulary_.get('#crakkbitch'))

(200000, 102408)
9
dunno justin read mention justin god know hope follow #believe
2723


In [97]:
# Divide the bag-of-words matrix and the labels into a train and a test part.
# CAREFUL HERE: when working with neural nets, we need to convert -1, 1 labels into 0, 1
if test_locally:
    train_test_split = 0.7                          # fraction of the rows used for training
    n_train = int(X.shape[0] * train_test_split)    # split point, computed once

    # Seeded generator so that re-running the cell reproduces the same split.
    permut = np.random.RandomState(42).permutation(X.shape[0])
    train_rows, test_rows = permut[:n_train], permut[n_train:]

    train_x = X[train_rows]
    test_x = X[test_rows]

    # `permut` holds row *positions*. Plain Y[...] on a Series looks values up by index
    # *label*, which only coincides with the position for a default 0..n-1 index, so the
    # labels are selected positionally with .iloc to stay aligned with the rows of X.
    train_y = Y.iloc[train_rows]
    test_y = Y.iloc[test_rows]

    ## Convert all -1 into 0! (where() keeps the entries equal to 1 and replaces the rest by 0)
    train_y = train_y.where(train_y == 1, 0)
    test_y = test_y.where(test_y == 1, 0)

    print(train_x.shape)
    print(test_x.shape)
    print(train_y)

(140000, 102408)
(60000, 102408)
112683    0
144413    0
70056     1
128329    0
184592    0
         ..
106638    0
56853     1
63980     1
62727     1
138761    0
Name: label, Length: 140000, dtype: int64


In [101]:
# Small feed-forward classifier on top of the bag-of-words vector.
n_features = train_x.shape[1]   # the input shape is the number of words in the bow dictionary

model = keras.Sequential()
model.add(keras.layers.InputLayer(input_shape=(n_features,)))
model.add(keras.layers.Dense(50, activation='relu'))      # single hidden layer
model.add(keras.layers.Dense(2, activation='softmax'))    # Only 0 and 1

In [102]:
# Configure training: SGD optimizer, cross-entropy over the two softmax outputs, and
# accuracy reported by fit/evaluate. The labels are the integer classes 0/1 produced
# by the train/test split cell.
model.compile(optimizer='sgd',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# Train the same model in 5 rounds of 5 epochs each and score it on the held-out
# split after every round, to see how the test metrics evolve while training continues.
# The loop variable is deliberately not called `iter`: at notebook level that name would
# shadow the builtin iter() for every cell run afterwards.
for training_round in range(5):
    # train for 5 epochs the model
    model.fit(train_x, train_y, epochs=5)
    # evaluate the test error
    model.evaluate(test_x, test_y, verbose=2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
 - 13s - loss: 0.4465 - acc: 0.7865
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
 - 13s - loss: 0.4295 - acc: 0.7947
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
 - 13s - loss: 0.4227 - acc: 0.7994
Epoch 1/5
Epoch 2/5
Epoch 3/5

In [96]:
# Score the trained model on the held-out split; evaluate() yields the loss followed by
# the accuracy metric requested in model.compile.
test_loss, test_acc = model.evaluate(test_x,  test_y, verbose=2)

 - 13s - loss: 0.4323 - acc: 0.7965


In [None]:
if test_locally:
    # Do cross validation on the bag of words, using the neural network.
    # For every epoch budget a fresh network is trained on k_folds - 1 folds and scored on
    # the remaining fold; df_precisions maps the budget to the list of per-fold accuracies
    # (the metric configured below is accuracy, the `precisions` name is kept as-is).
    cv_labels = Y.where(Y == 1, 0).values   # neural nets need 0/1 labels instead of -1/1
    df_precisions = {}
    for epochs in tqdm(range(10, 20, 2)):
        precisions = []
        # Shuffled folds with a fixed seed, so every epoch budget is scored on the same splits.
        # NOTE(review): rows look ordered by class (positives first); unshuffled folds
        # would then be badly unbalanced - confirm against build_sentences.
        folds = KFold(n_splits=k_folds, shuffle=True, random_state=0)
        for train_idx, val_idx in folds.split(X):
            # Same architecture and training setup as the single-split model of this notebook;
            # rebuilt for every fold so that no weights leak from one fold to the next.
            fold_model = keras.Sequential([
                keras.layers.InputLayer(input_shape=(X.shape[1],)),
                keras.layers.Dense(50, activation='relu'),
                keras.layers.Dense(2, activation='softmax'),
            ])
            fold_model.compile(optimizer='sgd',
                               loss='sparse_categorical_crossentropy',
                               metrics=['accuracy'])
            fold_model.fit(X[train_idx], cv_labels[train_idx], epochs=epochs, verbose=0)
            fold_loss, fold_accuracy = fold_model.evaluate(X[val_idx], cv_labels[val_idx], verbose=0)
            precisions.append(fold_accuracy)

        df_precisions[epochs] = precisions
        print(np.array(precisions).mean())