In [None]:
import pandas as pd
from string import punctuation
from collections import Counter
from keras.utils import to_categorical
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from keras.models import Sequential
from keras import layers
from keras import regularizers
from keras import backend as K
from keras.callbacks import ModelCheckpoint

In [None]:
# Load the sentiment CSV into a DataFrame and preview its first rows.
DATA_PATH = "sentiment_dataset_dev.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Data Cleaning

In [None]:
def clean(Data_frame):
    """Add a ``Clean_data`` column holding a normalised copy of each review.

    Every value of ``Data_frame['review']`` is lower-cased, stripped of all
    characters in ``string.punctuation`` and has its newlines replaced by
    spaces, so each row ends up with exactly one string.

    Behaviour worth knowing:
      * Rows are processed through the Series itself, so the frame may carry
        any index (filtered, shuffled, non-integer); nothing relies on the
        labels being 0..n-1.
      * A review containing ``'\\n'`` stays a single string; it is not split
        into a multi-element list, which cannot be stored in one cell.
      * Missing reviews (NaN/None) become ``""`` instead of failing on
        ``.lower()``.

    Parameters
    ----------
    Data_frame : pandas.DataFrame
        Frame with a ``review`` column. It is modified in place.

    Returns
    -------
    None
    """
    # Build the translation table once for the whole column rather than
    # testing every character of every review against `punctuation`.
    strip_punctuation = str.maketrans('', '', punctuation)

    def _clean_review(review):
        """Normalise one review value to a single cleaned string."""
        if pd.isna(review):
            return ""
        text = str(review).lower().translate(strip_punctuation)
        # Newlines are not punctuation, so they survive the step above.
        return text.replace('\n', ' ')

    Data_frame['Clean_data'] = Data_frame['review'].map(_clean_review)

In [None]:
# Adds the 'Clean_data' column to df in place; clean() has no return value.
clean(df)

In [None]:
df.head() # check the data frame: it should now include the 'Clean_data' column

Tokenize

In [None]:
# Concatenate every cleaned review into one space-separated corpus string
# and preview its first 9000 characters.
all_text = ' '.join(df['Clean_data'])
print(all_text[:9000])

In [None]:
# Split the corpus on whitespace and count how often each distinct word occurs.
words = all_text.split()
count_words = Counter(words)
print("Total_words:", len(count_words))   # number of DISTINCT words
# Show only the most frequent entries: printing the whole Counter emits one
# entry per vocabulary word and floods the cell output.
print("count_words:", count_words.most_common(20))
total_words = len(words)   # number of word occurrences in the corpus


In [None]:
# Map each distinct word to an integer id, numbered from 0 in the order the
# words were first counted.
# NOTE(review): id 0 is therefore a real word, while pad_features also fills
# with 0 -- confirm whether padding should get its own reserved id.
vocab_to_int = dict(zip(count_words, range(len(count_words))))

In [None]:
# Vocabulary size: the number of distinct words that received an id.
len(vocab_to_int)

In [None]:
# Encode every cleaned review as the list of integer ids of its words,
# looked up in vocab_to_int (one inner list per review, in row order).
reviews_int = [
    [vocab_to_int[token] for token in cleaned.split()]
    for cleaned in df['Clean_data']
]
print(reviews_int[:3])

In [None]:
# One-hot encode the 'rating' column.
# NOTE(review): presumably ratings are small non-negative integers (the model
# further down ends in a 6-unit softmax) -- confirm against the dataset.
label = to_categorical( df['rating'])

In [None]:
# Distribution of review lengths (in tokens): histogram plus summary statistics.
reviews_len = list(map(len, reviews_int))
length_series = pd.Series(reviews_len)
length_series.hist()
plt.show()
length_series.describe()

In [None]:
def pad_features(reviews_int, seq_length):
    """Build a fixed-width integer matrix from variable-length id lists.

    Row ``i`` of the result holds ``reviews_int[i]``: sequences shorter than
    ``seq_length`` are left-padded with 0, longer ones keep only their first
    ``seq_length`` ids.
    """
    features = np.zeros((len(reviews_int), seq_length), dtype=int)

    # Each `row` is a view into `features`, so writing to it fills the matrix.
    for row, review in zip(features, reviews_int):
        kept = review[:seq_length]
        if len(kept) > 0:
            # Right-align the ids; the untouched leading cells stay 0.
            row[seq_length - len(kept):] = kept

    return features

In [None]:
# Pad/truncate every encoded review to a common length.
# NOTE(review): 220 is presumably chosen from the length distribution above -- confirm.
SEQ_LENGTH = 220
Features = pad_features(reviews_int, SEQ_LENGTH)

In [None]:
# Train/validation split: the first 90% of rows train the model, the rest
# validate it. No shuffling is done, so the split follows the row order of
# Features and label.
split_frac = 0.9
split_at = int(split_frac * len(Features))

train_x, valid_x = Features[:split_at], Features[split_at:]
train_y, valid_y = label[:split_at], label[split_at:]


In [None]:
# Two stacked LSTM layers on top of a trainable word embedding, ending in a
# 6-way softmax trained against the one-hot `label` matrix.
model1 = Sequential()
model1.add(layers.Embedding(len(vocab_to_int), 50))   # one 50-d vector per word id
# return_sequences=True so the second LSTM receives the whole sequence,
# not just the last hidden state.
model1.add(layers.LSTM(20, dropout=0.5, return_sequences=True))
model1.add(layers.LSTM(20, dropout=0.2))
# NOTE(review): the 6 output units must equal the width of `label` -- confirm.
model1.add(layers.Dense(6, activation='softmax'))

model1.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

# Keep the full model from the epoch with the best validation accuracy.
# `period` is intentionally not passed: it is deprecated in favour of
# `save_freq`, whose default ('epoch') already checkpoints after every epoch.
# NOTE(review): recent Keras releases may also require a '.keras'/'.h5'
# filename here and in model1.save() -- confirm against the installed version.
checkpoint1 = ModelCheckpoint("best_model1.hdf5", monitor='val_accuracy', verbose=1,
                              save_best_only=True, mode='auto', save_weights_only=False)
model1.fit(train_x, train_y, epochs=50, validation_data=(valid_x, valid_y),
           callbacks=[checkpoint1], batch_size=32)
# Saves the model as it is after the final epoch, which is not necessarily
# the best-scoring one kept by the checkpoint above.
model1.save('LSTM_1')

The purpose of this model is to understand the workflow and the suitability of the approach. In LSTM.py this idea is developed into clean, easy-to-use functions. Since test.csv has more data, the model's test accuracy for that file is 95.23% and its validation accuracy is 72.35%.