Import libraries....

In [None]:
import pandas as pd
from string import punctuation
from collections import Counter
from keras.utils import to_categorical
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from keras.models import Sequential
from keras import layers
from keras import regularizers
from keras import backend as K
from keras.callbacks import ModelCheckpoint
from keras.models import load_model


Read data, Remove empty data

In [None]:
# Load the training set, drop rows with missing values, then renumber the index
# so that positional access (0..n-1) works in the later cells.
df = pd.read_csv("sentiment_dataset_train.csv")
# dropna() returns a new DataFrame; without assigning it back nothing is removed.
df = df.dropna()
df.reset_index(drop=True, inplace=True)
df.head()

In [None]:
# Count how many reviews fall into each rating class to check the class balance.
df['rating'].value_counts()  # all classes are distributed almost equally.

Data Cleaning

Objective of <b>clean</b> function:
<br>- Create a new column of clean reviews so that it is easy to train on</br>
<br>- Convert reviews to lower case</br>
<br>- Remove special characters</br>

<br>- Very few reviews contain emojis in their text, so we do not need to remove them.</br>


In [None]:
def clean(Data_frame):
    """Add a ``Clean_data`` column of normalised review text and return it.

    Every entry of ``Data_frame['review']`` is lower-cased, stripped of all
    characters in ``string.punctuation``, and has its whitespace (including
    newlines) collapsed to single spaces. Missing reviews become empty
    strings instead of raising.

    Parameters
    ----------
    Data_frame : pandas.DataFrame
        Frame with a ``review`` column. It is modified in place: the
        ``Clean_data`` column is created or overwritten.

    Returns
    -------
    pandas.Series
        The ``Data_frame['Clean_data']`` column, one cleaned string per row.
    """
    # One translation table removes every punctuation character in a single pass.
    strip_punctuation = str.maketrans('', '', punctuation)

    cleaned = []
    # Iterate over the values directly so a non-default index cannot break the lookup.
    for review in Data_frame['review']:
        text = '' if pd.isna(review) else str(review)
        text = text.lower().translate(strip_punctuation)
        # Join the pieces back into plain text; storing the repr of a list would
        # leak brackets and quotes into the tokens used downstream.
        cleaned.append(' '.join(text.split()))

    # Assign the whole column at once; element-wise chained assignment
    # (frame[col][i] = value) is not guaranteed to write into the frame.
    Data_frame['Clean_data'] = cleaned
    return Data_frame['Clean_data']

In [None]:
# Clean every training review; clean() also stores the result in df['Clean_data'].
Reviews = clean(df)

In [None]:
df.head() # check that the new clean-data column has been added to our data

# Objective of prepare_data function:

<br>- Create vocabulary (Bag of words)</br>
<br>- Convert words into integers (Tokenize)</br>
<br>- Converts labels into categories with keras to_categorical function</br>
<br>- Identify the average length of reviews so we can remove very short and very long reviews.</br>
<br>- Convert clean reviews into integers and pad the reviews with average length of reviews.</br>
<br>- Split the created data into train and validation.</br>
<br>- Return ready to go data in LSTM</br>


In [None]:
def prepare_data(Reviews, Label, seq_length, train_test_split, training):
    """Convert cleaned reviews into fixed-length integer sequences for the LSTM.

    Steps: build a word -> integer vocabulary from ``Reviews``, encode every
    review, plot and print the distribution of review lengths, left-pad with
    zeros (or truncate) each review to ``seq_length``, and optionally split
    the result into training and validation parts.

    Parameters
    ----------
    Reviews : iterable of str
        Cleaned review texts (the ``Clean_data`` column).
    Label : array-like
        Rating column; one-hot encoded with ``to_categorical``. Only used
        when ``training`` is true.
    seq_length : int
        Length every encoded review is padded or truncated to.
    train_test_split : float
        Fraction of the rows that goes into the training part, e.g. 0.8
        means 80% training and 20% validation. Only used when ``training``
        is true.
    training : bool
        True to return a train/validation split with labels; False to
        return all features with placeholder zeros for the other values.

    Returns
    -------
    tuple
        ``(train_x, train_y, valid_x, valid_y)``. When ``training`` is
        false, ``train_x`` holds all features and the other three are ``0``.

    Notes
    -----
    NOTE(review): word ids start at 0, which is also the padding value, so
    the first word of the vocabulary is indistinguishable from padding.
    NOTE(review): the vocabulary is rebuilt from ``Reviews`` on every call,
    so ids produced by separate calls (e.g. training data vs. test data)
    do not refer to the same words.
    """
    # Tokenise once and reuse the tokens for both the vocabulary and the encoding.
    tokenized = [review.split() for review in Reviews]

    # Counter keeps first-seen order, so ids follow the order of first appearance.
    count_words = Counter(word for tokens in tokenized for word in tokens)
    vocab_to_int = {w: i for i, w in enumerate(count_words)}
    print('length of vocab:', len(vocab_to_int))

    reviews_int = [[vocab_to_int[w] for w in tokens] for tokens in tokenized]

    # Plot and summarise review lengths to help choose seq_length.
    reviews_len = pd.Series([len(r) for r in reviews_int])
    reviews_len.hist()
    # A string passed to plt.legend is read as one label per character; use a title.
    plt.title('Average length of reviews')
    plt.show()
    print(reviews_len.describe())

    # Left-pad short reviews with zeros; keep only the first seq_length ids of long ones.
    features = np.zeros((len(reviews_int), seq_length), dtype=int)
    for i, review in enumerate(reviews_int):
        kept = review[:seq_length]
        if kept:
            features[i, seq_length - len(kept):] = kept

    if training:
        # Labels are only needed for training, so encode them here.
        label = to_categorical(Label)
        split_at = int(train_test_split * len(features))
        train_x, valid_x = features[:split_at], features[split_at:]
        train_y, valid_y = label[:split_at], label[split_at:]
        print('Total train reviews:', len(train_x))
        print('Total train labels:', len(train_y))
    else:
        train_x = features
        train_y = 0
        print('Total train reviews:', len(train_x))
        print(train_y)
        valid_x = 0
        valid_y = 0

    return train_x, train_y, valid_x, valid_y

In [None]:
# Encode the cleaned training reviews as sequences of length 220 and split them
# 80% / 20% into training and validation sets.
train_x, train_y,  valid_x, valid_y =prepare_data(Reviews, df['rating'], 220, train_test_split=0.8, training=True)

![title](Structure-of-the-LSTM-cell-and-equations-that-describe-the-gates-of-an-LSTM-cell.png)

In [None]:
# Word embedding followed by two stacked LSTM layers and a softmax classifier.
model1 = Sequential([
    # 58649 is presumably the vocabulary size reported by prepare_data -- TODO confirm.
    layers.Embedding(58649, 64),
    # The first LSTM returns the full sequence so the second LSTM can consume it.
    layers.LSTM(20, dropout=0.2, return_sequences=True),
    layers.LSTM(20, dropout=0.2),
    # 6 output units; presumably one per one-hot rating class -- TODO confirm.
    layers.Dense(6, activation='softmax'),
])

model1.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

# Optional checkpointing, currently disabled:
#checkpoint1 = ModelCheckpoint("best_model1_1.hdf5", monitor='val_accuracy', verbose=1,save_best_only=True, mode='auto', period=1,save_weights_only=False)
model1.fit(
    train_x,
    train_y,
    batch_size=32,
    epochs=30,
    validation_data=(valid_x, valid_y),
)
model1.save('LSTM')

# Testing..

In [None]:
# Load the test set and run it through the same cleaning step as the training data;
# clean() also adds a 'Clean_data' column to df_1.
df_1 = pd.read_csv('sentiment_dataset_test.csv')  #load test data

clean_review = clean(df_1)  # clean test data


In [None]:
prepare_data =prepare_data(clean_review, Label=0, seq_length = 220, train_test_split =1, training=False)
#prepare test data

In [None]:
model = load_model('LSTM')  #load model

In [None]:
class_probabilities = model.predict(prepare_data[0])

In [None]:
# The predicted class is the index of the highest probability in each row.
# Assigning the whole column at once keeps the predictions as integers and avoids
# chained assignment (df[col][i] = value), which is not guaranteed to write into
# the DataFrame.
df_1['Predicted_review'] = np.argmax(class_probabilities, axis=1)

In [None]:
df_1.head() # check that the prediction column has been added to the test dataframe

In [None]:
# Write the test data together with the predictions to disk.
df_1.to_csv('test_result.csv') # save result as a csv file

## Ways to improve ML model

<b>Multiple algorithms</b><br>
<li>Choosing the right machine learning algorithm is the ideal approach to achieving higher accuracy, but it is easier said than done.</li>

<li>This intuition comes with experience and incessant practice. Some algorithms are better suited to a particular type of data sets than others. Hence, we should apply all relevant models and check the performance.</li>



<b>Algorithm Tuning</b><br>
<li>We know that machine learning algorithms are driven by parameters. These parameters strongly influence the outcome of the learning process.</li>

<li>The objective of parameter tuning is to find the optimum value for each parameter to improve the accuracy of the model. To tune these parameters, you must have a good understanding of their meaning and their individual impact on the model. You can repeat this process with a number of well-performing models.</li>

<li>For example: In random forest, we have various parameters like max_features, number_trees, random_state, oob_score and others. Intuitive optimization of these parameter values will result in better and more accurate models.</li>

## Deployment 

<b>The workflow can be broken down into following basic steps:</b>
<li>Training a machine learning model on a local system.</li>
<li>Wrapping the inference logic into a flask application.</li>
<li>Using docker to containerize the flask application.</li>
<li>Hosting the docker container on an AWS ec2 instance and consuming the web-service.</li>

<b>Training a machine learning model on a local system</b><br>
<li>The model file generated after training is stored as a pickle file which is a serialized format for storing objects. (In the repo, the file is named ‘trained_model.pkl’)</li>
<li>The inference call (.predict()) requires 4 features per test sample in the form of a numpy array.</li>

<b>Wrapping the inference logic into a flask web service</b><br>
<li>Convert the local code into functions and build a REST API.</li>

<b>Using docker to containerize the flask application.</b><br>
<li>Up to this point, we have a web-service that runs locally. Our ultimate intention is to be able to run this piece of code on a cloud virtual machine.</li>