In [396]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer ,ENGLISH_STOP_WORDS

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import re 

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, LSTM, Embedding, Bidirectional

In [397]:
train = pd.read_csv('./merged data/train.csv')
train.shape

(17500, 2)

In [398]:
test = pd.read_csv('./merged data/test.csv')
test.head()

Unnamed: 0,text,label
0,im feeling rather rotten so im not very ambiti...,0
1,im updating my blog because i feel shitty,0
2,i never make her separate from me because i do...,0
3,i left with my bouquet of red and yellow tulip...,1
4,i was feeling a little vain when i did this one,0


In [399]:
val= pd.read_csv('./merged data/val.csv')
val.head()

Unnamed: 0,text,label
0,im feeling quite sad and sorry for myself but ...,0
1,i feel like i am still looking at a blank canv...,0
2,i feel like a faithful servant,2
3,i am just feeling cranky and blue,3
4,i can have for a treat or if i am feeling festive,1


# Prepare data for model 


In [400]:
import re
def clean_html(text):
    
    clean = re.compile('<.*?>')
    return re.sub(clean, '',text)
    
train['text']=train['text'].apply(clean_html)
test['text']=test['text'].apply(clean_html)
val['text']=val['text'].apply(clean_html)

train.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


In [401]:
def convert_lower(text):
    return text.lower()

train['text']=train['text'].apply(convert_lower)
test['text']=test['text'].apply(convert_lower)
val['text']=val['text'].apply(convert_lower)
train.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


In [402]:
import re
def cleaning_tags(text):
    return ' '.join(re.sub("([@#][A-Za-z0-9_]+)|(\w+:\/\/\S+)"," ", text).split())
train['text']=train['text'].apply(cleaning_tags)
test['text']=test['text'].apply(cleaning_tags)
val['text']=val['text'].apply(cleaning_tags)
train.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


In [403]:
import string
english_punctuations = string.punctuation
punctuations_list = english_punctuations
def cleaning_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)
train['text']=train['text'].apply(cleaning_punctuations)
test['text']=test['text'].apply(cleaning_punctuations)
val['text']=val['text'].apply(cleaning_punctuations)
train.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


In [404]:
def cleaning_repeating_char(text):
    return re.sub(r'([a-z])\1+', r'\1', text)
train['text']=train['text'].apply(cleaning_repeating_char)
test['text']=test['text'].apply(cleaning_repeating_char)
val['text']=val['text'].apply(cleaning_repeating_char)
train.head()

Unnamed: 0,text,label
0,i didnt fel humiliated,0
1,i can go from feling so hopeles to so damned h...,0
2,im grabing a minute to post i fel gredy wrong,3
3,i am ever feling nostalgic about the fireplace...,2
4,i am feling grouchy,3


In [405]:
def cleaning_URLs(data):
    return re.sub('((www.[^s]+)|(https?://[^s]+))',' ',data)
train['text']=train['text'].apply(cleaning_URLs)
test['text']=test['text'].apply(cleaning_URLs)
val['text']=val['text'].apply(cleaning_URLs)
train.head()

Unnamed: 0,text,label
0,i didnt fel humiliated,0
1,i can go from feling so hopeles to so damned h...,0
2,im grabing a minute to post i fel gredy wrong,3
3,i am ever feling nostalgic about the fireplace...,2
4,i am feling grouchy,3


In [406]:
def cleaning_numbers(data):
    return re.sub('[0-9]+', '', data)
train['text']=train['text'].apply(cleaning_numbers)
test['text']=test['text'].apply(cleaning_numbers)
val['text']=val['text'].apply(cleaning_numbers)
train.head()

Unnamed: 0,text,label
0,i didnt fel humiliated,0
1,i can go from feling so hopeles to so damned h...,0
2,im grabing a minute to post i fel gredy wrong,3
3,i am ever feling nostalgic about the fireplace...,2
4,i am feling grouchy,3


In [407]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
train['text'] = train['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
test['text'] = test['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

val['text'] = val['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

train.head()

[nltk_data] Downloading package stopwords to /home/paula/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,text,label
0,didnt fel humiliated,0
1,go feling hopeles damned hopeful around someon...,0
2,im grabing minute post fel gredy wrong,3
3,ever feling nostalgic fireplace wil know stil ...,2
4,feling grouchy,3


In [408]:
from nltk.stem.porter import PorterStemmer
ps= PorterStemmer()
y=[]

def stem_words(text):
    for i in text:
        y.append(ps.stem(i))
    z=y[:]
    y.clear()
    return z
train['text']=train['text'].apply(stem_words)
test['text']=test['text'].apply(stem_words)
val['text']=val['text'].apply(stem_words)
train.head()

Unnamed: 0,text,label
0,"[d, i, d, n, t, , f, e, l, , h, u, m, i, l, ...",0
1,"[g, o, , f, e, l, i, n, g, , h, o, p, e, l, ...",0
2,"[i, m, , g, r, a, b, i, n, g, , m, i, n, u, ...",3
3,"[e, v, e, r, , f, e, l, i, n, g, , n, o, s, ...",2
4,"[f, e, l, i, n, g, , g, r, o, u, c, h, y]",3


In [409]:
def joinback2(list_input):
    return "".join(list_input)
    


train['text']=train['text'].apply(joinback2)
test['text']=test['text'].apply(joinback2)
val['text']=val['text'].apply(joinback2)
train.head()

Unnamed: 0,text,label
0,didnt fel humiliated,0
1,go feling hopeles damned hopeful around someon...,0
2,im grabing minute post fel gredy wrong,3
3,ever feling nostalgic fireplace wil know stil ...,2
4,feling grouchy,3


In [420]:
vocab_size = 10000

In [421]:
train["length"] = [len(i) for i in train["text"]]


In [424]:
train["length"].max()
len_sentence = 150


In [425]:
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
def text_prepare(data,coulmn) :
    one_hot_word = [one_hot(word, n=vocab_size) for word in data[coulmn]]
    embeddec_doc = pad_sequences(sequences=one_hot_word,
                              maxlen=len_sentence,
                              padding="pre")
    print(data.shape)
    return embeddec_doc

In [426]:
x_train=text_prepare(train, "text")
x_validate=text_prepare(test, "text")
x_test=text_prepare(val, "text")

(17500, 3)
(3000, 2)
(2500, 2)


In [434]:
y_train=train["label"]
y_validate=test["label"]
y_test=val["label"]

In [436]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
y_train = np.array(y_train)
y_train = enc.fit_transform(y_train.reshape(-1,1)).toarray()
y_test = np.array(y_test)
y_validate = np.array(y_validate)

y_test = enc.fit_transform(y_test.reshape(-1,1)).toarray()
y_validate = enc.fit_transform(y_validate.reshape(-1,1)).toarray()

In [440]:
y_train.shape

(17500, 7)

In [438]:
x_train.shape

(17500, 150)

In [428]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.optimizers import Adam

In [441]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=150, input_length=len_sentence))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(64, activation="sigmoid"))
model.add(Dropout(0.2))
model.add(Dense(7, activation="softmax"))

In [442]:
model.compile(optimizer="Adam", loss = "categorical_crossentropy", metrics=["accuracy"])


In [443]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint


es = EarlyStopping(monitor = 'val_loss', mode = 'min', verbose = 1, patience = 5)
mc = ModelCheckpoint('./model.h5', monitor = 'val_accuracy', mode = 'max', verbose = 1, save_best_only = True)

In [444]:
hist = model.fit(x_train, y_train, epochs = 25, batch_size = 64, validation_data=(x_validate, y_validate),verbose = 1, callbacks= [es, mc])

Train on 17500 samples, validate on 3000 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/25

KeyboardInterrupt: 

SyntaxError: invalid character in identifier (<ipython-input-445-a1ef3cb54e3b>, line 1)

In [391]:
vocab_size = 10000
train["length"].max()

222

In [392]:
len_sentence = 150


In [394]:
X_train.shape


(17500,)

In [395]:
X_train.head()

0                                 didnt fel humiliated
1    go feling hopeles damned hopeful around someon...
2               im grabing minute post fel gredy wrong
3    ever feling nostalgic fireplace wil know stil ...
4                                       feling grouchy
Name: text, dtype: object