# 1. IMPORTING LIBRARIES

In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.text import Tokenizer

In [3]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV

# 2. LOADING DATA

In [6]:
# Load the tweet dataset (expects 'Tweets.csv' in the working directory)
# and preview the first rows.
df = pd.read_csv('Tweets.csv')
df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [7]:
# Column dtypes and non-null counts, to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27481 non-null  object
 1   text           27480 non-null  object
 2   selected_text  27480 non-null  object
 3   sentiment      27481 non-null  object
dtypes: object(4)
memory usage: 858.9+ KB


# 3. CLEAN THE DATA

In [8]:
# Number of missing values per column.
df.isnull().sum()

textID           0
text             1
selected_text    1
sentiment        0
dtype: int64

In [9]:
# Drop every row that has a missing value. This mutates df in place, so all
# later cells work on the reduced frame.
df.dropna(inplace=True)

# 4. GENERATE X VALUES BY SENTENCE VECTOR & TEXT NORMALIZATION

In [10]:
# Column used as the model input: either 'text' or 'selected_text'.
# The author reports better results with 'selected_text'.
feature_name = 'selected_text'

## A. CLEAN THE SENTENCE

In [11]:
def test_insult(w):
    """ test if the word w is potentially an insult. We look if '***' until '*********' is in w """
    insult = ['***']
    if insult[0] in w:
        return True
    for i in range(1, 7):
        insult.append(insult[i-1]+'*')
        if insult[i] in w:
            return True
    return False 

In [12]:
def clean_words(w):
    """Normalise a single word.

    Returns the literal string 'insult' when test_insult(w) flags the word
    as censored. Otherwise the word is lower-cased and every character that
    is not an ASCII letter 'a'-'z' is replaced by a single space, so the
    result has one character per character of the lower-cased word.
    """
    if test_insult(w):
        return 'insult'

    # One pass with join instead of growing a string character by character.
    return "".join(ch if 'a' <= ch <= 'z' else " " for ch in w.lower())

In [13]:
def clean_sentence(s):
    """Clean every whitespace-separated word of the sentence ``s``.

    Each word is passed through clean_words and the results are joined back
    with single spaces.

    Returns the cleaned sentence, or None when no letter survives the
    cleaning, so that the caller can remove such rows with dropna().
    """
    cleaned = " ".join(clean_words(word) for word in s.split())

    # clean_words replaces each non-letter by a space, so a sentence made only
    # of punctuation or digits comes back as a run of spaces of any length,
    # not just '' or ' '. Treat every whitespace-only result as empty.
    if not cleaned.strip():
        return None

    return cleaned

In [14]:
# Clean every sentence of the feature column and store the result in a new
# column. clean_sentence returns None for sentences it considers empty, which
# appear as nulls in 'filtered_text'.
df['filtered_text'] = df[feature_name].apply(clean_sentence)

In [15]:
# Re-count nulls: 'filtered_text' is null where clean_sentence returned None.
df.isnull().sum()

textID           0
text             0
selected_text    0
sentiment        0
filtered_text    1
dtype: int64

In [16]:
# Remove the rows whose cleaned text is null (in place).
df.dropna(inplace=True)

In [17]:
# Display the frame to check the new 'filtered_text' column against the raw text.
df

Unnamed: 0,textID,text,selected_text,sentiment,filtered_text
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,i d have responded if i were going
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooo sad
2,088c60f138,my boss is bullying me...,bullying me,negative,bullying me
3,9642c003ef,what interview! leave me alone,leave me alone,negative,leave me alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,sons of insult
...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,d lost
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,don t force
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,yay good for both of you
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,but it was worth it insult


## B. SENTENCE VECTOR WITH TOKENIZER

In [18]:
# Fit a Tokenizer on the cleaned sentences and turn each sentence into a list
# of integer word indices.
ltext = list(df['filtered_text'])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(ltext)
sequences = tokenizer.texts_to_sequences(ltext)
# +1 presumably because Keras word indices start at 1 and index 0 is left for
# padding — confirm against the Tokenizer documentation.
vocab_size = len(tokenizer.word_index) + 1
# Pad every sequence to the length of the longest one so X is rectangular.
max_sequence_length = max(len(sequence) for sequence in sequences)
X = pad_sequences(sequences, maxlen=max_sequence_length)

# 4. CREATE SENTIMENT'S VECTOR COLUMN, Y

In [19]:
# Raw sentiment labels as an array, in the same row order as df.
sentiments = df['sentiment'].values

In [20]:
# Map each sentiment label to a class index. The index is also the position of
# the 1 in the one-hot row, and must match the order of the label list used in
# predict().
sentiment_dict = {'positive': 0, 'negative': 1, 'neutral': 2}
Vsentiments = np.array([sentiment_dict[s] for s in sentiments])
# One-hot encode by indexing an identity matrix rather than pd.get_dummies:
# the result always has one column per class (even if a class is absent from
# the data) and a stable uint8 dtype, whereas get_dummies returns bool columns
# on pandas >= 2.0.
Y = np.eye(len(sentiment_dict), dtype=np.uint8)[Vsentiments]

In [21]:
# Inspect the one-hot label matrix (one row per tweet, one column per class).
Y

array([[0, 0, 1],
       [0, 1, 0],
       [0, 1, 0],
       ...,
       [1, 0, 0],
       [1, 0, 0],
       [0, 0, 1]], dtype=uint8)

# 5. FIRST MODEL FITTING AND PREDICTING 

In [22]:
# Hold out 20% of the samples for testing; random_state is fixed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2 , random_state=42) 

In [23]:
# Baseline LSTM classifier: 100-dimensional word embedding -> LSTM(128) ->
# 3-way softmax (one output per sentiment class).
model = Sequential([
    Embedding(vocab_size, 100, input_length=max_sequence_length),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(3, activation='softmax'),
])

In [24]:
# Categorical cross-entropy matches the one-hot labels in Y.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [25]:
# batch_size: adjust to what your GPU/CPU can handle.
batch_size = 32 
# Increasing epochs increases the execution time.
epochs = 5

In [26]:
# Train the baseline model with the batch_size configured in the previous cell
# (it was previously repeated here as a literal 32, so changing the variable
# had no effect).
# NOTE: the test split is also passed as validation data, so the per-epoch
# validation metrics and the final test metrics come from the same samples.
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=epochs, batch_size=batch_size)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x18871de3fd0>

In [27]:
# Evaluate the baseline on the held-out split; the values are kept to compare
# with the tuned model later.
first_loss, first_accuracy = model.evaluate(X_test, y_test)
# The loss is a cross-entropy value, not a fraction, so it is printed as-is;
# only the accuracy is shown as a percentage.
print("first loss : {:.4f}, first accuracy : {:.2f}%".format(first_loss, first_accuracy*100))

first loss : 53.30%, first accuracy : 82.84%


# 6. MODEL OPTIMIZATION

## A. FIND THE BEST PARAMETERS

In [28]:
# Use only the first 1/k of the training set for the grid search to keep its
# running time manageable.
k = 12
small_X_train = X_train[0:len(X_train)//k]
small_y_train = y_train[0:len(y_train)//k]

In [29]:
def create_lstm_model(hidden_nodes=128, dropout_rate=0.2, optimizer='adam',
                      recurrent_dropout=0.2, embedding_dim=100):
    """Build and compile the Embedding -> LSTM -> softmax sentiment classifier.

    Parameters
    ----------
    hidden_nodes : int
        Number of units in the LSTM layer.
    dropout_rate : float
        Dropout applied to the LSTM inputs.
    optimizer : str or optimizer instance
        Optimizer passed to ``model.compile``.
    recurrent_dropout : float
        Dropout applied to the LSTM recurrent state (previously fixed at 0.2).
    embedding_dim : int
        Size of the word embedding vectors (previously fixed at 100).

    Returns
    -------
    A compiled Sequential model with 3 softmax outputs, using categorical
    cross-entropy loss and reporting accuracy.

    Relies on the notebook-level ``vocab_size`` and ``max_sequence_length``.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_sequence_length))
    model.add(LSTM(hidden_nodes, dropout=dropout_rate, recurrent_dropout=recurrent_dropout))
    model.add(Dense(3, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [30]:
# Candidate values for the grid search. Note that batch_size and epochs are
# rebound here from single numbers to lists of candidates.
dropout_rate = [0.2, 0.3, 0.5]   
hidden_nodes = [32, 64, 128]  
optimizer = ['adam']
batch_size = [32,16,64]  
epochs = [1,2] # the search takes much longer if values are added or increased

In [31]:
# Base estimator for the grid search. The arguments of create_lstm_model
# (hidden_nodes, dropout_rate, optimizer) are declared on the wrapper so that
# GridSearchCV can set them. They get valid scalar defaults here rather than
# the candidate lists, so the estimator is also usable on its own; every value
# is overridden by param_grid during the search.
model = KerasClassifier(
    model=create_lstm_model,
    hidden_nodes=128,
    dropout_rate=0.2,
    optimizer='adam',
    batch_size=32,
    epochs=1,
    verbose=0,
)
param_grid = {
    'hidden_nodes': hidden_nodes, 
    'dropout_rate': dropout_rate, 
    'optimizer': optimizer,
    'batch_size': batch_size, 
    'epochs': epochs
}
# Exhaustive search over param_grid with 3-fold cross-validation.
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)

In [32]:
# Display the configured (not yet fitted) search object.
grid

In [33]:
# Run the search on the reduced training set (this is the slow cell).
grid_result = grid.fit(small_X_train,small_y_train)

In [34]:
# Best hyper-parameter combination and its mean cross-validated score.
print("Best parameters: ", grid_result.best_params_)
print("Best score: ", grid_result.best_score_)

Best parameters:  {'batch_size': 16, 'dropout_rate': 0.2, 'epochs': 2, 'hidden_nodes': 128, 'optimizer': 'adam'}
Best score:  0.7132650049636448


## B. FINAL MODEL

In [35]:
# Reuse the hyper-parameters selected by the grid search.
dropout_rate = grid_result.best_params_['dropout_rate']
hidden_nodes = grid_result.best_params_['hidden_nodes']
optimizer = grid_result.best_params_['optimizer']
batch_size = grid_result.best_params_['batch_size']
# Take epochs from the search result as well, instead of a hard-coded 2 that
# would go stale whenever the epochs grid is changed.
epochs = grid_result.best_params_['epochs']

In [36]:
# Build the final model with the same factory that was grid-searched, so its
# architecture cannot drift from the one the best parameters were found for.
# create_lstm_model already compiles the model; the compile call in the next
# cell simply re-applies the same settings.
model = create_lstm_model(
    hidden_nodes=hidden_nodes,
    dropout_rate=dropout_rate,
    optimizer=optimizer,
)

In [37]:
# Compile with the optimizer chosen by the grid search.
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [38]:
# Train the final model on the full training set with the selected batch size;
# the test split is again used as validation data.
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=epochs, batch_size=batch_size)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1880197e130>

In [39]:
# Loss and accuracy of the final model on the held-out split.
final_loss, final_accuracy = model.evaluate(X_test, y_test)



In [40]:
# The loss is a cross-entropy value, not a fraction, so it is printed as-is;
# only the accuracy is shown as a percentage.
print("final loss: {:.4f}, final accuracy : {:.2f}%".format(final_loss, final_accuracy*100))
# Accuracy difference with the first model, in percentage points.
print("optimization gain : {:.2f}%". format(100*(final_accuracy-first_accuracy)))

final loss: 43.12%, final accuracy : 83.82%
optimization gain : 0.98%


# 7. CONCLUSION

In [41]:
tok_ = tokenizer 
def predict(s):
    """Print the sentiment predicted for the sentence ``s``.

    The sentence is cleaned with clean_sentence, encoded with the already
    fitted tokenizer (its vocabulary is not updated), padded to
    max_sequence_length and passed to the notebook-level ``model``.

    Returns the string "empty sentence" when there is nothing to classify
    (``s`` is None, or no letter is left after cleaning). Otherwise the
    result is printed and None is returned.
    """
    # Order must match the class indices used to build the labels
    # (positive=0, negative=1, neutral=2).
    sentiment = ['positive', 'negative', 'neutral']

    # Guard before cleaning: clean_sentence calls s.split(), which fails on None.
    if s is None:
        return "empty sentence"

    # Clean the sentence, then test the *cleaned* text: that is the value which
    # can be None or blank, and the tokenizer cannot encode None.
    new_s = clean_sentence(s)
    if new_s is None or not new_s.strip():
        return "empty sentence"

    # Encode and pad exactly like the training data.
    sequence = tok_.texts_to_sequences([new_s])
    x = pad_sequences(sequence, maxlen=max_sequence_length)

    # Prediction: index of the most probable class.
    prob_ = model.predict(x)
    pos = np.argmax(prob_)

    # Result
    print("'{}' seems to be {}.". format(s, sentiment[pos]))

In [46]:
# A few manual checks of predict() on hand-written sentences.
predict("I love eat")  
predict("I hate you")  
predict("we are sad")
predict("I have to go")
predict("I really loved the movie, it was fantastic!")

'I love eat' seems to be positive.
'I hate you' seems to be negative.
'we are sad' seems to be negative.
'I have to go' seems to be neutral.
'I really loved the movie, it was fantastic!' seems to be positive.
