# Preprocessing + pretrained GloVe embeddings + 6 LSTM models + XGBoost classifier

In [None]:
pip install nltk 

In [None]:
import nltk
nltk.download('words')

In [None]:
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
pip install wordninja

In [None]:
!pip install contractions

In [None]:
#!apt install -qq enchant #if it's necessary
!pip install pyenchant

In [None]:
!pip install emot

In [None]:
#Basic Libraries
import pandas as pd
import numpy as np 
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
import spacy


#Sklearn library
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import sklearn.metrics as metrics

#xgboost
from xgboost import XGBClassifier

#Build the LSTM model
import tensorflow as tf
import pickle as cPickle

from tensorflow import keras
from keras.preprocessing.text import  Tokenizer
from keras.utils import pad_sequences

#for expansion the contractions for instance I'll or I've been to I will and I have been and not only 
import contractions 

#import emoticons in order to replace them with appropriate words
from emot.emo_unicode import EMOTICONS_EMO # For EMOTICONS

#read html files 
import requests

#split the 
import wordninja

In [None]:

def _fetch_lexicon(url, first_word, timeout=30):
    """Download an opinion-lexicon file and return its words as a set.

    Args:
        url: address of the plain-text lexicon (one word per line after a header).
        first_word: first real entry of the list; everything before it is header text.
        timeout: seconds to wait for the server before giving up.

    Returns:
        set of lexicon words, so the per-token ``in`` checks done during cleaning
        are O(1) instead of a linear scan over a list.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if ``first_word`` is not present in the downloaded text.
    """
    rsp = requests.get(url, timeout=timeout)
    # fail loudly instead of parsing an HTML error page as a word list
    rsp.raise_for_status()
    # splitlines() + strip() copes with both "\n" and "\r\n" line endings
    lines = [line.strip() for line in rsp.text.splitlines()]
    return {word for word in lines[lines.index(first_word):] if word}


url_positive = "https://ptrckprry.com/course/ssd/data/positive-words.txt"
positive_words = _fetch_lexicon(url_positive, 'a+')

url_negative = "https://ptrckprry.com/course/ssd/data/negative-words.txt"
negative_words = _fetch_lexicon(url_negative, '2-faced')


In [None]:
# (compiled regex, replacement) pairs applied in order by expansion_patterns().
# Compiled once here rather than on every call.
_EXPANSION_RULES = [
    (re.compile(regex_exp, re.IGNORECASE), replacement)
    for (regex_exp, replacement) in [
        (' nd ', ' and '), (' wa ', ' was '), (' donnow ', ' do not know '),
        # replacement keeps the leading space so the previous word is not glued on
        (" i'ts ", ' it is '),
        (' dem ', ' them '), (' #+ha+ha ', ' haha '), (' n+a+h+ ', ' no '),
        (' n+a+ ', ' no '),
        # replacement keeps the leading space (was 'wow ', which merged with the previous word)
        (' w+o+w+', ' wow '),
        ('y+a+y+', 'yay'),
        # character class is [ea]; the former [e,a] also accepted a literal comma
        ('y+[ea]+s+', 'yes'),
        (' ya ', ' you '), ('n+o+', 'no'), ('a+h+', 'ah'), ('muah', 'kiss'),
        (' y+u+p+ ', ' yes '), (' y+e+p+ ', ' yes '),
        (' ima ', ' i am going to '), (' woah ', ' wow '), (' wo ', ' wow '), (' aw ', ' cute '),
        (' lmao ', ' haha '), (' lol ', ' haha '),
    ]
]


def expansion_patterns(text):
    """Normalise common tweet slang and elongated words in ``text``.

    Every rule is applied case-insensitively, one after another, over the whole
    string. Rules written with surrounding spaces only match tokens that have a
    space on both sides, so tokens at the very start or end of ``text`` are not
    affected by them.

    Args:
        text: tweet text (str).

    Returns:
        The text with all rules applied.
    """
    for pattern, replacement in _EXPANSION_RULES:
        text = pattern.sub(replacement, text)
    return text

In [None]:

# Module-level resources shared by the cleaning pipeline (clean_data).
 
# Extra emoticon -> words mappings added to the emot lookup table.
EMOTICONS_EMO[':d'] = 'laughing '
EMOTICONS_EMO['<3'] = 'red heart'

# The spaCy English model is loaded only to obtain its default stop-word list.
sp = spacy.load('en_core_web_sm')
spacy_stopwords = sp.Defaults.stop_words
# Counter is used here as a dict keyed by stop word; clean_data only tests membership.
stopwords_dict = Counter(spacy_stopwords)
# WordNet lemmatizer instance used by clean_data when lemmatization is requested.
lemmatizer =WordNetLemmatizer()

# Number of tweets cleaned so far; advanced by increment() and printed every 10000 tweets.
COUNT = 0

def increment():
    """Advance the module-level COUNT progress counter by one."""
    global COUNT
    COUNT += 1


#cleaning "pipeline"  
# Patterns used by clean_data, compiled once. Raw strings keep `\d` and `\.` from
# being parsed as (invalid) string escape sequences.
_NUMBER_RE = re.compile(r'(\d+(\.\d+)?)')
_TAG_RE = re.compile(r'<[^<]+?>')
_DOT_COMMA_RE = re.compile(r'[.,]')


def clean_data(text, stopwords, lemmatization):
    """Run one raw tweet through the cleaning pipeline and return the cleaned text.

    Steps, in order: emoticon -> words, casefold, drop punctuation tokens, strip
    numbers, strip <tags>, strip dots/commas, slang expansion, contraction
    expansion, optional stop-word removal, hashtag splitting, optional
    lemmatization, sentiment-lexicon tagging, removal of tokens shorter than
    three characters.

    Args:
        text: raw tweet (str).
        stopwords: if truthy, drop tokens present in the module-level ``stopwords_dict``.
        lemmatization: if truthy, lemmatize each token with the module-level ``lemmatizer``.

    Returns:
        The cleaned tweet as a single-space separated string.

    Side effects:
        Calls increment() and prints COUNT every 10000 processed tweets.
    """
    # map emoticons to words, e.g. 'text <3 :d' -> 'text red heart laughing';
    # done before casefolding, so tokens must match the table's keys exactly
    text = ' '.join(EMOTICONS_EMO.get(word, word) for word in text.split())

    # perform casefolding
    text = text.casefold()

    # drop stand-alone punctuation tokens; `in string.punctuation` is a substring
    # test, so a token is removed only if it appears contiguously in that constant
    text = ' '.join(tok for tok in text.split() if tok not in string.punctuation)

    # remove all numbers (integers and decimals) from inside every token,
    # e.g. '#5words 625' -> '#words'; the hashtag itself is handled further down
    text = ' '.join(_NUMBER_RE.sub('', word) for word in text.split()).strip()

    # remove tags such as <user> and <url>
    text = _TAG_RE.sub('', text)

    # remove every dot and comma
    text = _DOT_COMMA_RE.sub('', text)

    # slang / elongated-word normalisation
    text = expansion_patterns(text)

    # expand contractions token by token (e.g. "i'll" -> "i will")
    text = ' '.join(contractions.fix(tok) for tok in text.split())

    if stopwords:
        # remove the stop words
        text = ' '.join(word for word in text.split() if word not in stopwords_dict)

    # split hashtags into their component words, e.g. '#thinkhappythoughts' -> 'think happy thoughts'
    text = ' '.join(
        ' '.join(wordninja.split(tok[1:])) if tok.startswith('#') else tok
        for tok in text.split()
    )

    if lemmatization:
        text = ' '.join(lemmatizer.lemmatize(tok) for tok in text.split())

    # append a 'positive' / 'negative' marker after every word found in the
    # corresponding opinion lexicon (https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html#datasets)
    text = ' '.join(word + " positive" if word in positive_words else word for word in text.split())
    text = ' '.join(word + " negative" if word in negative_words else word for word in text.split())

    # keep only tokens of three or more characters
    text = ' '.join(tok for tok in text.split() if len(tok) > 2)

    # progress report every 10000 tweets
    increment()
    if COUNT % 10000 == 0:
        print(COUNT)
    return text.strip()


#Load the data and run the preprocessor pipeline 
class Preprocessor:
    """Namespace for loading the tweet files and running them through clean_data."""

    def __init__(self):
        """Init function
        """

    @staticmethod
    def _read_lines(path, skip_blank=False):
        """Return the lines of a UTF-8 text file without their trailing newline.

        Args:
            path: file to read.
            skip_blank: if True, whitespace-only lines are dropped.
        """
        with open(path, encoding='utf-8') as f:
            lines = [line.rstrip('\n') for line in f]
        if skip_blank:
            lines = [line for line in lines if line.strip()]
        return lines

    @staticmethod
    def load_data(preprocessed=True, Train_data=True, data_dir="../data"):
        """Load the training or test tweets and clean every tweet.

        Args:
            preprocessed: unused; kept so existing calls keep working.
            Train_data: if truthy, load train_pos.txt / train_neg.txt, label them
                1.0 / 0.0 and shuffle with a fixed seed; otherwise load
                test_data.txt and strip the leading "<id>," from each line.
            data_dir: directory that contains the data files.

        Returns:
            DataFrame with a cleaned ``tweets`` column (plus ``label`` for training data).
        """
        if Train_data:
            # Tweets are read line by line instead of with pd.read_fwf: read_fwf
            # infers a fixed column width from the first rows and cuts longer
            # tweets, and it converts NA-like strings to NaN.
            pos_data = pd.DataFrame(
                {"tweets": Preprocessor._read_lines(f"{data_dir}/train_pos.txt", skip_blank=True)})
            pos_data["label"] = 1.0

            neg_data = pd.DataFrame(
                {"tweets": Preprocessor._read_lines(f"{data_dir}/train_neg.txt", skip_blank=True)})
            neg_data["label"] = 0.0

            data = pd.concat([pos_data, neg_data], ignore_index=True)
            # shuffle the merged data reproducibly
            np.random.seed(500)
            data = data.iloc[np.random.permutation(len(data))]
        else:
            # every line is kept (even blank ones) so row i still belongs to test id i+1
            data = pd.DataFrame(
                Preprocessor._read_lines(f"{data_dir}/test_data.txt"), columns=['tweets'])
            # drop the "<id>," prefix
            data.tweets = data.tweets.apply(lambda x: x[x.find(',') + 1:])

        data['tweets'] = data['tweets'].apply(
            lambda x: clean_data(x, stopwords=True, lemmatization=True))
        return data

In [None]:

# Load the test tweets (Train_data=False) and run them through the cleaning pipeline.
# It takes ~1 minute to run.
X_test = Preprocessor.load_data(preprocessed=True,Train_data=False)


#powerful preprocessing
#,... currently workn out ... <user> park ) #anycompany ? 
#currently workn park company

#OR
#3417,<user> loool 7yaaatii mbyn alejtehaad hhh :p p mnn jddd <3 3 , abshrk ana b3d praise is due to allah =) ) :p p
#loool yaaatii mbyn alejtehaad hhh mnn jddd red heart positive positive abshrk ana praise positive allah happy positive face smiley

In [None]:
X_test

In [None]:
# Load, shuffle and clean the full training set.
# It takes ~2 hours to complete the preprocessing phase on the entire 2.5M tweets.
X_full = Preprocessor.load_data(preprocessed=True,Train_data=True)
# The following cells read `X_full_preprocessed`. Its only other definition is in
# commented-out code, so bind it here to keep a fresh-kernel "Run All" from
# failing with NameError.
X_full_preprocessed = X_full

In [None]:
#so we run it in two different accounts in colab and then combine the two preprocessed sets 
#X_positive = pd.read_csv('/content/drive/MyDrive/X_positive_preprocessed.csv')
#X_negative = pd.read_csv('/content/drive/MyDrive/X_negative_preprocessed.csv')
#X_full_preprocessed = pd.concat([X_positive, X_negative], ignore_index=True)
#np.random.seed(500)
#shuffle the merge data
#X_full_preprocessed = X_full_preprocessed.iloc[np.random.permutation(len(X_full_preprocessed))]

In [None]:
#X_full_preprocessed.tweets=X_full_preprocessed.tweets.astype(str)

In [None]:
X_full_preprocessed

In [None]:
# Training targets: the label column (load_data assigns 1.0 to positive and 0.0 to negative tweets).
y_train=X_full_preprocessed.label

In [None]:
# filters="" disables the tokenizer's own character filtering; the tweets were already cleaned.
tokenizer = Tokenizer(filters="")
# Build the word -> integer index from the cleaned training tweets.
tokenizer.fit_on_texts(X_full_preprocessed.tweets)

X_train = tokenizer.texts_to_sequences(X_full_preprocessed.tweets)# convert each word to an integer based on the tokenizer

In [None]:
# One more than the number of indexed words (presumably because index 0 is reserved for padding — confirm).
vocab_size=len(tokenizer.word_index)+1 
max_len=40 # max length of each tweet is set to 40 tokens (padding and truncation are applied)
# NOTE(review): only `padding` is set to 'post'; the truncation side uses the library default — confirm which end is dropped.
X_train = pad_sequences(X_train, padding='post'  ,maxlen=max_len)

In [None]:
# NOTE(review): this rebinds X_test from the cleaned DataFrame to the tokenizer output,
# so re-running this cell presumably fails (the new value is read via `.tweets` here) — confirm.
X_test = tokenizer.texts_to_sequences(X_test.tweets)# convert each word to an integer based on the tokenizer

In [None]:
# Pad the test sequences to max_len with the same settings as the training sequences.
X_test = pad_sequences(X_test, padding='post'  ,maxlen=max_len)

In [None]:
#X_train

In [None]:
#X_test

**USE THE PRETRAINED GLOVE EMBEDDINGS FOR OUR TEXT-TO-VECTOR REPRESENTATIONS, SINCE THEY ARE MORE POWERFUL**

In [None]:

#retrieve the pretrained embeddings and store them as a dictionary: word -> float32 vector
embeddings_index = {}
#f = open('/content/drive/MyDrive/glove.twitter.27B.200d.txt', encoding='utf-8')
# `with` closes the file even if parsing a line raises
with open('../data/glove.twitter.27B.200d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # skip blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs


In [None]:
#embeddings_index.get('luka') # for instance

In [None]:

# Build the (vocab_size, 200) embedding table: row i holds the pretrained GloVe
# vector of the word the tokenizer mapped to index i.
embedding_matrix = np.zeros((vocab_size, 200))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # no pretrained vector for this word: its row stays all-zero
        continue
    embedding_matrix[i] = embedding_vector
print(embedding_matrix.shape)
#cPickle.dump([embedding_matrix],open('/content/drive/MyDrive/embedding_matrix_full_preprocessing.dat', 'wb'))#write the embedding_matrix in file


In [None]:
#embedding_matrix

**TRAIN A GLOVE MODEL FOR THE EMBEDDINGS ON OUR CORPUS**

In [None]:
"""
# instantiate the corpus
corpus = Corpus() 
# this will create the word co occurence matrix 
corpus.fit(X_200.tweets, window=1000)

# instantiate the model
glove_model = Glove(no_components=200, learning_rate=0.1)

# and fit over the corpus matrix
glove_model.fit(corpus.matrix, epochs=20, no_threads=32)

# finally we add the vocabulary to the model
glove_model.add_dictionary(corpus.dictionary)
"""

In [None]:
"""
#form our embedding matrix for each word that appears in our dataset based on trained glove embeddings on our corpus
embedding_matrix = np.zeros((max_features + 1, 200))
for word, i in word_index.items():
    if i > max_features:
        continue
    embedding_vector = glove_model.word_vectors[glove_model.dictionary[word]]
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
           
"""


**DIFFERENT NEURAL NETWORK MODELS**

In [None]:
from keras.layers import Dense
from keras.layers import Flatten
from keras.models import Sequential
from keras.metrics import Precision, Recall
from keras.layers import Embedding, SpatialDropout1D , Conv1D
from keras.layers import Bidirectional, LSTM, Dense, Dropout,Masking,Activation
from keras.optimizers import RMSprop

import tensorflow as tf
from tensorflow.keras.optimizers import Adam  # SGD, RMSprop

In [None]:
# MODEL 1: frozen GloVe embedding -> bidirectional LSTM(512) -> Dense(64, relu) -> dropout -> sigmoid unit
#vocab_length=max_features+1
embedding_size = 200
num_of_words = 40  # train_sequences.shape[1]; not used when building this model

# trainable=False because the weights are the downloaded GloVe vectors
embedding_layer = Embedding(vocab_size, embedding_size, weights=[embedding_matrix],
                            input_length=max_len, trainable=False, mask_zero=True)
model1 = Sequential([
    embedding_layer,
    # masks timesteps whose embedding row is all zeros (padding, and words that had
    # no pretrained GloVe vector), so the LSTM does not train on them
    Masking(mask_value=0.0),
    Bidirectional(LSTM(512)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1),
    Activation('sigmoid'),
])

model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(model1.summary())

In [None]:
#tf.keras.utils.plot_model(model1, show_shapes=True)

In [None]:
# Train model 1 for EPOCHS epochs with batches of BATCH_SIZE tweets; validation_split=0.1 is passed to fit.
EPOCHS=6
BATCH_SIZE=1024

model1.fit(X_train,y_train , batch_size = BATCH_SIZE, epochs = EPOCHS, validation_split = 0.1)

In [None]:
#model1.save('/content/drive/MyDrive/model_1',save_format="h5")

**MODEL 2**

In [None]:
# MODEL 2 (has potential for more accurate predictions):
# frozen GloVe embedding -> LSTM(100) -> Dense(64, no activation) -> sigmoid output
#vocab_length=max_features+1
embedding_size = 200
num_of_words = 40  # train_sequences.shape[1]; not used when building this model

# trainable=False because the weights are the downloaded GloVe vectors
embedding_layer2 = Embedding(vocab_size, embedding_size, weights=[embedding_matrix],
                             input_length=max_len, trainable=False)
model2 = Sequential([
    embedding_layer2,
    LSTM(100),
    Dense(64),
    Dense(1, activation='sigmoid'),
])

model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(model2.summary())

In [None]:
#tf.keras.utils.plot_model(model2, show_shapes=True)

In [None]:
# Train model 2 for EPOCHS epochs with batches of BATCH_SIZE tweets; validation_split=0.1 is passed to fit.
EPOCHS=6
BATCH_SIZE=128

model2.fit(X_train,y_train , batch_size = BATCH_SIZE, epochs = EPOCHS, validation_split = 0.1)

In [None]:
#model2.save('/content/drive/MyDrive/model_2',save_format="h5")

**MODEL 3**

In [None]:
# MODEL 3: frozen GloVe embedding -> LSTM(1024) -> three Dense(512, relu) layers
# (dropout 0.4 before each) -> sigmoid output
#vocab_length=max_features+1
embedding_size = 200
num_of_words = 40  # train_sequences.shape[1]; not used when building this model

# trainable=False because the weights are the downloaded GloVe vectors
embedding_layer3 = Embedding(vocab_size, embedding_size, weights=[embedding_matrix],
                             input_length=max_len, trainable=False)
model3 = Sequential([
    embedding_layer3,
    LSTM(1024),
    Dropout(0.4),
    Dense(512, activation='relu'),
    Dropout(0.4),
    Dense(512, activation='relu'),
    Dropout(0.4),
    Dense(512, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(model3.summary())

In [None]:
#tf.keras.utils.plot_model(model3, show_shapes=True)

In [None]:
# Train model 3 for EPOCHS epochs with batches of BATCH_SIZE tweets; validation_split=0.1 is passed to fit.
EPOCHS=6
BATCH_SIZE=128

model3.fit(X_train,y_train , batch_size = BATCH_SIZE, epochs = EPOCHS, validation_split = 0.1)

In [None]:
#model3.save('/content/drive/MyDrive/model_3',save_format="h5")

**MODEL 4**

In [None]:
# MODEL 4: frozen GloVe embedding -> three stacked LSTMs (512, 512, 265) with
# dropout 0.3 after the first -> Dense(64, relu) -> dropout -> sigmoid unit

#vocab_length=max_features+1
embedding_size = 200
num_of_words = 40  # train_sequences.shape[1]; not used when building this model

# trainable=False because the weights are the downloaded GloVe vectors.
# The name embedding_layer2 is shared with the model 2 cell and is rebound here.
embedding_layer2 = Embedding(vocab_size, embedding_size, weights=[embedding_matrix],
                             input_length=max_len, trainable=False, mask_zero=True)
model4 = Sequential([
    embedding_layer2,
    # masks timesteps whose embedding row is all zeros (padding, and words that had
    # no pretrained GloVe vector), so the LSTMs do not train on them
    Masking(mask_value=0.0),
    LSTM(512, return_sequences=True),
    Dropout(0.3),
    LSTM(512, return_sequences=True),
    # NOTE(review): 265 units may be a typo for 256 — confirm
    LSTM(265),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1),
    Activation('sigmoid'),
])

model4.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(model4.summary())

In [None]:
#tf.keras.utils.plot_model(model4, show_shapes=True)

In [None]:
# Train model 4 for EPOCHS epochs with batches of BATCH_SIZE tweets; validation_split=0.1 is passed to fit.
EPOCHS=6
BATCH_SIZE=1024
model4.fit(X_train,y_train , batch_size = BATCH_SIZE, epochs = EPOCHS, validation_split = 0.1)


In [None]:
#model4.save('/content/drive/MyDrive/model_4',save_format="h5")

**MODEL 5**

In [None]:

# MODEL 5: frozen GloVe embedding -> Conv1D(64, kernel 3) -> max pooling -> LSTM(256)
# -> Dense(64, relu) -> dropout -> sigmoid unit
# MaxPooling1D is not part of the notebook's keras.layers import cell, so it is
# imported here; without it this cell fails with NameError.
from keras.layers import MaxPooling1D

#vocab_length=max_features+1
num_of_words = 40  # train_sequences.shape[1]; used as the embedding input length below
embedding_size = 200

# trainable=False because the weights are the downloaded GloVe vectors
embedding_layer = Embedding(vocab_size, embedding_size, weights=[embedding_matrix],
                            input_length=num_of_words, trainable=False, mask_zero=True)
model5 = Sequential()
model5.add(embedding_layer)
# masks timesteps whose embedding row is all zeros (padding, and words that had no
# pretrained GloVe vector)
# NOTE(review): the convolution/pooling layers may not propagate this mask to the LSTM — confirm
model5.add(Masking(mask_value=0.0))
model5.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model5.add(MaxPooling1D(pool_size=2))
model5.add(LSTM(256))
model5.add(Dense(64, activation='relu'))
model5.add(Dropout(0.5))
model5.add(Dense(1))
model5.add(Activation('sigmoid'))

model5.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(model5.summary())

In [None]:
#tf.keras.utils.plot_model(model5, show_shapes=True)

In [None]:
# Train model 5 for EPOCHS epochs with batches of BATCH_SIZE tweets; validation_split=0.1 is passed to fit.
EPOCHS=6
BATCH_SIZE=1024
model5.fit(X_train,y_train , batch_size = BATCH_SIZE, epochs = EPOCHS, validation_split = 0.1)

In [None]:
#model5.save('/content/drive/MyDrive/model_5',save_format="h5")

**MODEL 6**

In [None]:

# MODEL 6: frozen GloVe embedding -> bidirectional LSTM(1024) -> three
# Dense(512, relu) layers (dropout 0.4 after each) -> sigmoid unit

#vocab_length=max_features+1
embedding_size = 200
num_of_words = 40  # train_sequences.shape[1]; not used when building this model

# trainable=False because the weights are the downloaded GloVe vectors
embedding_layer = Embedding(vocab_size, embedding_size, weights=[embedding_matrix],
                            input_length=max_len, trainable=False)
model6 = Sequential([
    embedding_layer,
    Bidirectional(LSTM(1024)),
    Dense(512, activation='relu'),
    Dropout(0.4),
    Dense(512, activation='relu'),
    Dropout(0.4),
    Dense(512, activation='relu'),
    Dropout(0.4),
    Dense(1),
    Activation('sigmoid'),
])

model6.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(model6.summary())

In [None]:
#tf.keras.utils.plot_model(model6, show_shapes=True)

In [None]:
# Train model 6 for EPOCHS epochs with batches of BATCH_SIZE tweets; validation_split=0.1 is passed to fit.
EPOCHS=6
BATCH_SIZE=1024

model6.fit(X_train,y_train , batch_size = BATCH_SIZE, epochs = EPOCHS, validation_split = 0.1)

In [None]:
#model6.save('/content/drive/MyDrive/model_6',save_format="h5")

In [None]:
# Retrieve every network's predictions on the training and test matrices; they
# become the input features of the XGBoost classifier that combines the models.
train1, test1 = model1.predict(X_train, batch_size=128), model1.predict(X_test)
train2, test2 = model2.predict(X_train, batch_size=128), model2.predict(X_test)
train3, test3 = model3.predict(X_train, batch_size=128), model3.predict(X_test)
train4, test4 = model4.predict(X_train, batch_size=128), model4.predict(X_test)
train5, test5 = model5.predict(X_train, batch_size=128), model5.predict(X_test)
train6, test6 = model6.predict(X_train, batch_size=128), model6.predict(X_test)


In [None]:
# Combine the training and testing predictions of all six networks, one column per
# model, as features for the XGBoost classifier.
# Model 6's predictions are computed in the previous cell but were left out of the stack.
train = np.hstack((train1, train2, train3, train4, train5, train6))
test = np.hstack((test1, test2, test3, test4, test5, test6))

In [None]:

# Fit an XGBoost classifier on the stacked network predictions, then map its
# test predictions to the submission labels: below 0.5 -> -1, otherwise 1.
import xgboost as xgb
model = xgb.XGBClassifier()
model.fit(train, y_train)
y_pred = [-1 if y_p < 0.5 else 1 for y_p in model.predict(test)]


In [None]:
# Submission ids are 1-based. They are derived from the number of predictions
# instead of a hard-coded 10000, so the two frames always have the same length.
index = pd.DataFrame({'Id': range(1, len(y_pred) + 1)})
predictions = pd.DataFrame(y_pred, columns=["Prediction"])

In [None]:
# Place the Id and Prediction columns side by side (axis=1), aligned on the row index.
predictions_final=pd.concat([index, predictions], ignore_index=False,axis=1)

In [None]:
predictions_final

In [None]:
# Write the submission file without the DataFrame index column.
# NOTE(review): the leading '/' makes this an absolute path at the filesystem root — confirm that is intended.
predictions_final.to_csv('/submission_last.csv',index=False)