In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns


import re 
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.optimizers import Adam


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### Load data 

In [2]:
train = pd.read_csv('./merged data/train.csv')
train.shape

(16200, 2)

In [3]:
test = pd.read_csv('./merged data/test.csv')
test.head()

Unnamed: 0,text,label
0,i am most certainly an acquired taste but late...,3
1,i only do unwillingly and always leaves me fee...,3
2,i feel this weekend is going to be a slutty one,2
3,i was feeling cold towards to my partner altho...,3
4,i found myself giggling and clapping my hands ...,1


In [4]:
val= pd.read_csv('./merged data/val.csv')
val.head()

Unnamed: 0,text,label
0,i can feel her pissed off attitude towards me ...,3
1,ive explained that he is very creative and lov...,1
2,i was feeling a bit nostalgic and typed all th...,2
3,i feel like our society has programmed little ...,1
4,i feel a little frantic because i know peoples...,4


### Data Preprocessing



1- clean any HTML tags in the text

In [12]:
def clean_html(text):
    
    clean = re.compile('<.*?>')
    return re.sub(clean, '',text)
    
train['text']=train['text'].apply(clean_html)
test['text']=test['text'].apply(clean_html)
val['text']=val['text'].apply(clean_html)

train.head()

Unnamed: 0,text,label
0,i am feeling humorous i put cold callers on hold,1
1,i feel sooooooooper vain taking pics of myself...,0
2,i still feel it does the genre a disservice wh...,1
3,i still didnt see a difference in the way my p...,0
4,i wont feel regretful,0


2- convert all the text into lower case 

In [13]:
def convert_lower(text):
    return text.lower()

train['text']=train['text'].apply(convert_lower)
test['text']=test['text'].apply(convert_lower)
val['text']=val['text'].apply(convert_lower)
train.head()

Unnamed: 0,text,label
0,i am feeling humorous i put cold callers on hold,1
1,i feel sooooooooper vain taking pics of myself...,0
2,i still feel it does the genre a disservice wh...,1
3,i still didnt see a difference in the way my p...,0
4,i wont feel regretful,0


3- clean the Tag sign and Tag name (ex:@Paula)

In [14]:
def cleaning_tags(text):
    return ' '.join(re.sub("([@#][A-Za-z0-9_]+)|(\w+:\/\/\S+)"," ", text).split())
train['text']=train['text'].apply(cleaning_tags)
test['text']=test['text'].apply(cleaning_tags)
val['text']=val['text'].apply(cleaning_tags)
train.head()

Unnamed: 0,text,label
0,i am feeling humorous i put cold callers on hold,1
1,i feel sooooooooper vain taking pics of myself...,0
2,i still feel it does the genre a disservice wh...,1
3,i still didnt see a difference in the way my p...,0
4,i wont feel regretful,0


4- clean all the punctuations  

In [15]:
english_punctuations = string.punctuation
punctuations_list = english_punctuations
def cleaning_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)
train['text']=train['text'].apply(cleaning_punctuations)
test['text']=test['text'].apply(cleaning_punctuations)
val['text']=val['text'].apply(cleaning_punctuations)
train.head()

Unnamed: 0,text,label
0,i am feeling humorous i put cold callers on hold,1
1,i feel sooooooooper vain taking pics of myself...,0
2,i still feel it does the genre a disservice wh...,1
3,i still didnt see a difference in the way my p...,0
4,i wont feel regretful,0


In [16]:
##def cleaning_repeating_char(text):
 #   return re.sub(r'([a-z])\1+', r'\1', text)
#train['text']=train['text'].apply(cleaning_repeating_char)
#test['text']=test['text'].apply(cleaning_repeating_char)
#val['text']=val['text'].apply(cleaning_repeating_char)
#train.head()

5- clean the urls founded in the tweets


In [17]:
def cleaning_URLs(data):
    return re.sub('((www.[^s]+)|(https?://[^s]+))',' ',data)
train['text']=train['text'].apply(cleaning_URLs)
test['text']=test['text'].apply(cleaning_URLs)
val['text']=val['text'].apply(cleaning_URLs)
train.head()

Unnamed: 0,text,label
0,i am feeling humorous i put cold callers on hold,1
1,i feel sooooooooper vain taking pics of myself...,0
2,i still feel it does the genre a disservice wh...,1
3,i still didnt see a difference in the way my p...,0
4,i wont feel regretful,0


6- clean all numbers 

In [18]:
def cleaning_numbers(data):
    return re.sub('[0-9]+', '', data)
train['text']=train['text'].apply(cleaning_numbers)
test['text']=test['text'].apply(cleaning_numbers)
val['text']=val['text'].apply(cleaning_numbers)
train.head()

Unnamed: 0,text,label
0,i am feeling humorous i put cold callers on hold,1
1,i feel sooooooooper vain taking pics of myself...,0
2,i still feel it does the genre a disservice wh...,1
3,i still didnt see a difference in the way my p...,0
4,i wont feel regretful,0


7-remove stopwords from data

In [30]:
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
train['text'] = train['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
test['text'] = test['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

val['text'] = val['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

train.head()

[nltk_data] Downloading package stopwords to /home/paula/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,text,label,length
0,feeling humorous put cold callers hold,1,38
1,feel sooooooooper vain taking pics last hour,0,44
2,still feel genre disservice stories resolved a...,1,56
3,still didnt see difference way pores look didn...,0,95
4,wont feel regretful,0,19


8- stemming words in data

In [31]:
from nltk.stem.porter import PorterStemmer
ps= PorterStemmer()
y=[]

def stem_words(text):
    for i in text:
        y.append(ps.stem(i))
    z=y[:]
    y.clear()
    return z
train['text']=train['text'].apply(stem_words)
test['text']=test['text'].apply(stem_words)
val['text']=val['text'].apply(stem_words)
train.head()

Unnamed: 0,text,label,length
0,"[f, e, e, l, i, n, g, , h, u, m, o, r, o, u, ...",1,38
1,"[f, e, e, l, , s, o, o, o, o, o, o, o, o, p, ...",0,44
2,"[s, t, i, l, l, , f, e, e, l, , g, e, n, r, ...",1,56
3,"[s, t, i, l, l, , d, i, d, n, t, , s, e, e, ...",0,95
4,"[w, o, n, t, , f, e, e, l, , r, e, g, r, e, ...",0,19


9- join back after stemming

In [32]:
def joinback2(list_input):
    return "".join(list_input)
    


train['text']=train['text'].apply(joinback2)
test['text']=test['text'].apply(joinback2)
val['text']=val['text'].apply(joinback2)
train.head()

Unnamed: 0,text,label,length
0,feeling humorous put cold callers hold,1,38
1,feel sooooooooper vain taking pics last hour,0,44
2,still feel genre disservice stories resolved a...,1,56
3,still didnt see difference way pores look didn...,0,95
4,wont feel regretful,0,19


### Set vocublary size and find the max length of sentence in data set

In [33]:
vocab_size = 10000

In [34]:
train["length"] = [len(i) for i in train["text"]]


In [35]:
train["length"].max()
len_sentence = 150


### use one hot encoding to encode the data set into numerical features 

In [37]:
def text_prepare(data,coulmn) :
    one_hot_word = [one_hot(word, n=vocab_size) for word in data[coulmn]]
    embeddec_doc = pad_sequences(sequences=one_hot_word,
                              maxlen=len_sentence,
                              padding="pre")
    print(data.shape)
    return embeddec_doc

### split data to be prepared for model

In [38]:
x_train=text_prepare(train, "text")
x_validate=text_prepare(test, "text")
x_test=text_prepare(val, "text")

(16200, 3)
(2200, 2)
(2200, 2)


In [39]:
y_train=train["label"]
y_validate=test["label"]
y_test=val["label"]

### Enocde the target labels 

In [40]:
enc = OneHotEncoder()
y_train = np.array(y_train)
y_train = enc.fit_transform(y_train.reshape(-1,1)).toarray()
y_test = np.array(y_test)
y_validate = np.array(y_validate)

y_test = enc.fit_transform(y_test.reshape(-1,1)).toarray()
y_validate = enc.fit_transform(y_validate.reshape(-1,1)).toarray()

In [41]:
y_train.shape

(16200, 7)

In [42]:
x_train.shape

(16200, 150)

### Build the deep learning model 

1- build model structure 

In [43]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=150, input_length=len_sentence))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(64, activation="sigmoid"))
model.add(Dropout(0.2))
model.add(Dense(7, activation="softmax"))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


2- compile the model

In [44]:
model.compile(optimizer="Adam", loss = "categorical_crossentropy", metrics=["accuracy"])


3- prepare the early stopping 

In [47]:
es = EarlyStopping(monitor = 'val_loss', mode = 'min', verbose = 1, patience = 5)
mc = ModelCheckpoint('./model.h5', monitor = 'val_accuracy', mode = 'max', verbose = 1, save_best_only = True)

4- fit the model

In [None]:
hist = model.fit(x_train, y_train, epochs = 25, batch_size = 64, validation_data=(x_validate, y_validate),verbose = 1, callbacks= [es, mc])

Train on 16200 samples, validate on 2200 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/25
Epoch 2/25