In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals

%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%load_ext autoreload
%autoreload 2
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm

from helpers import count_unique_words, count_unique_ngrams, build_unique_ngrams

import sys

import tensorflow as tf
from tensorflow import keras

import gensim   # Not sure whether it is better to use gensim or tensorflow :/
import logging
 
# Route INFO-level (and above) log records to the notebook output, prefixed with timestamp and level.
# Presumably enabled so gensim's training progress is visible - TODO confirm.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
 

# Add the parent directory to the module search path.
# NOTE(review): this executes after `from helpers import ...` earlier in this cell, so it cannot
# influence that import; it only affects imports made afterwards - confirm which modules need it.
sys.path.append('../')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [19]:
from clean_helpers import *

# ---- Run configuration -------------------------------------------------------
take_full = False              # True -> read the *_full.txt training files instead of the small ones
test_locally = True            # True -> hold out part of the data as a local test set
create_new_text_files = True   # NOTE(review): not referenced in the visible cells - confirm where it is used
ngrams = 2                     # n-gram order passed to the n-gram counting / building cells

# Names of the cleaning steps to run; they are applied in exactly this order.
cleaning_options = [
    'clean_new_line',
    'remove_stopwords',
    'clean_tags',
    'clean_punctuation',
    'remove_numbers',
    'lemmatize',
    'remove_saxon_genitive',
]

# Dispatch table: option name -> function applied to the DataFrame.
clean = {
    "clean_new_line":        clean_new_line,
    "lowercase":             lowercase,
    "lemmatize":             lemmatize,
    "remove_stopwords":      remove_stopwords,
    "translate":             perform_translation,
    "clean_punctuation":     clean_punctuation,
    "clean_tags":            clean_tags,
    "remove_numbers":        remove_numbers,
    "remove_saxon_genitive": remove_saxon_genitive,
    # Probably not a good idea to use this one: it strips everything that is
    # not alphabetic (special characters, numbers and so on).
    "gensim_simple":         gensim_clean,
}


# algorithm_used = ""
# algorithm = {
#     "naive_bayes": ,
#     "logistic_regression": ,
#     "svm": ,
#     "lstm":,
#     "fasttext":,
#     "cnn": ,
# }

# options = []
# additional_options = {
#     "count_frequency": ,
#     "count_ngrams": ,
    
# }

In [4]:
def _read_lines(path):
    """Return all lines of the text file at `path`, trailing newlines included.

    The encoding is given explicitly because `open()` otherwise falls back to the
    platform's locale encoding, which makes the result machine-dependent.
    Assumes the corpus files are UTF-8 - TODO confirm.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return f.readlines()


# `take_full` switches between the small and the full training corpus.
if take_full:
    input_file_pos = 'Data/train_pos_full.txt'
    input_file_neg = 'Data/train_neg_full.txt'
else:
    input_file_pos = 'Data/train_pos.txt'
    input_file_neg = 'Data/train_neg.txt'

# One list entry per line of the corresponding file.
list_of_pos_sentences = _read_lines(input_file_pos)
list_of_neg_sentences = _read_lines(input_file_neg)

In [5]:
from data_handling import build_sentences

# Combine the positive and negative sentence lists into one DataFrame.
df = build_sentences(list_of_pos_sentences, list_of_neg_sentences)

# Report the vocabulary size before any cleaning is applied.
unique_word_count = count_unique_words(df)
print("unique words = {}".format(unique_word_count))

unique words = 114427


In [6]:
# Perform all the cleaning options selected

for clean_option in cleaning_options:
    counter_of_occurrences = 0
    %time df = clean[clean_option](df)
    print(clean_option)
    print(df.head())
    print("unique words = {}".format(count_unique_words(df)))
    print("################################\n\n")
    

df.head()

CPU times: user 257 ms, sys: 7.95 ms, total: 264 ms
Wall time: 264 ms
clean_new_line
                                            sentence  label
0  <user> i dunno justin read my mention or not ....      1
1  because your logic is so dumb , i won't even c...      1
2  " <user> just put casper in a box ! " looved t...      1
3  <user> <user> thanks sir > > don't trip lil ma...      1
4  visiting my brother tmr is the bestest birthda...      1
unique words = 114427
################################


The number of scipy stopwords is 179
CPU times: user 622 ms, sys: 3.63 ms, total: 626 ms
Wall time: 626 ms
remove_stopwords
                                            sentence  label
0  <user> dunno justin read mention . justin god ...      1
1    logic dumb , even crop name photo . tsk . <url>      1
2  " <user> put casper box ! " looved battle ! #c...      1
3  <user> <user> thanks sir > > trip lil mama ......      1
4  visiting brother tmr bestest birthday gift eve...      1
unique words =

Unnamed: 0,sentence,label
0,dunno justin read mention justin god know hope...,1
1,logic dumb even crop name photo tsk,1
2,put casper box ! looved battle ! #crakkbitch,1
3,thanks sir trip lil mama ... keep doin ya thang !,1
4,visiting brother tmr bestest birthday gift eve...,1


In [8]:
# Number of unique words in `df` at this point of the notebook.
count_unique_words(df)

100499

In [9]:
# Unique n-gram count for n = `ngrams` (set in the configuration cell).
# Presumably counts n-grams of exactly that order - TODO confirm against helpers.
count_unique_ngrams(df, ngrams)

734350

In [33]:
# Gather the unique n-grams of every order from 1 up to `ngrams` into one flat list.
ngrams_list = [
    gram
    for order in range(1, ngrams + 1)
    for gram in build_unique_ngrams(df, order)
]

In [34]:
# Size of the combined list holding the n-grams of every order from 1 to `ngrams`.
len(ngrams_list)

834847

In [37]:
# Tally the collected n-grams by their number of whitespace-separated words.
# Slot k holds the count of k-word entries; slot 0 exists only so that the
# index equals the word count and is never incremented.
counter_ngrams = [0] * (ngrams + 1)
for gram in ngrams_list:
    word_count = len(gram.split())
    if 1 <= word_count <= ngrams:
        counter_ngrams[word_count] += 1
counter_ngrams

[0, 100498, 734349]

In [None]:
# Hold out part of the data for local evaluation.
if test_locally:
    train_test_split = 0.7   # fraction of the rows that goes into the training set
    split_seed = 42          # fixed seed so the same split is produced on every run

    n_rows = df.shape[0]
    n_train = int(n_rows * train_test_split)

    # Shuffle the row positions once, then cut the shuffled order at `n_train`.
    permut = np.random.RandomState(split_seed).permutation(n_rows)
    train_rows = df.iloc[permut[:n_train]]
    test_rows = df.iloc[permut[n_train:]]

    train_x, train_y = train_rows['sentence'], train_rows['label']
    test_x, test_y = test_rows['sentence'], test_rows['label']

In [None]:
# Word2Vec expects every sentence as a list of tokens. Passing the raw strings
# makes it iterate over characters, so the vocabulary would contain single
# letters only. Non-string entries (e.g. NaN) become empty token lists.
train_tokens = [sentence.split() if isinstance(sentence, str) else [] for sentence in train_x]

model_w2v = gensim.models.Word2Vec(train_tokens, size=150, window=10, min_count=1, workers=8, iter=10,
                                   sg=1)  # sg=1 selects the skip-gram architecture

# Sanity check: each lookup raises KeyError if the word is missing from the vocabulary.
model_w2v.wv['computer']
model_w2v.wv['airplane']
model_w2v.wv['twitter']

# `most_similar` is a method of the keyed vectors and takes the query word;
# `wv['computer']` is a plain numpy array and has no such method.
model_w2v.wv.most_similar('computer')


In [None]:
for i in range (10, 100, 10):
    for j in range(1, 10, 1):
        # i iterates over the iterations for the neural net
        # j iterates over the iterations for the word2vec model
        model_w2v = gensim.models.Word2Vec(train_x, size=150, window=10, min_count=1, workers=8, iter=j,
                                          sg=1)  # sg is for skip gram
        
        
        
        print("test with {} hidden layers".format(i))
        model = keras.Sequential([
            keras.layers.InputLayer(input_shape=(train_x.shape[1],)),   # the input shape is the number of words in the bow dictionary
            keras.layers.Dense(i, activation='sigmoid'),
            keras.layers.Dense(2, activation='softmax')   # Only 0 and 1
        ])
        model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
        best_accuracy = 0
        for iteration in range(20):
            model.fit(train_x, train_y, epochs=1, batch_size=16)
            # evaluate the test error
            if not use_partial_test:
                acc = model.evaluate(test_x,  test_y, verbose=2)[1]
            else:
                permut_test = np.random.permutation(len(test_x))
                acc = model.evaluate(test_x[permut_test[:10000]], test_y[permut_test[:10000]])

            if acc < best_accuracy:
                print("Overfitting, best with iter = {}".format(iteration))
                print("#################################\n#################################")
                break
            best_accuracy = acc

            # save the model otherwise
            %time model.save("models/model_small_{}.model".format(i))


