# Improvement

### In this Section
- The idea in this section is to have a roughly equal ratio of positive and negative reviews in the training data, to address the class imbalance.
- The original data has 72% of positive reviews and 28% of negative reviews.
- I have collected 8040 reviews and created a train set of 3000 reviews, of which 53% (1600) are positive and 47% (1400) are negative.
- I have also created a validation set of 1200 reviews to perform validation.


In [1]:
import pandas as pd
import numpy as np
import os
import numpy as np
import pandas as pd
import random
import string
random.seed(123)
import datetime as dt

# import warnings
# warnings.filterwarnings('ignore','RuntimeWarning')

import nltk
import re
from nltk.corpus import stopwords


from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Show full review text in DataFrame output. None is the supported "no truncation"
# value (pandas >= 1.0); the old -1 sentinel is deprecated and rejected by newer pandas.
pd.set_option('display.max_colwidth', None)

In [2]:
# Lookup table from an English contraction to its expanded form.
# Used as the default mapping of expand_contractions().
CONTRACTION_MAP = {
    "ain't": 'is not',
    "aren't": 'are not',
    "can't": 'cannot',
    "can't've": 'cannot have',
    "'cause": 'because',
    "could've": 'could have',
    "couldn't": 'could not',
    "couldn't've": 'could not have',
    "didn't": 'did not',
    "doesn't": 'does not',
    "don't": 'do not',
    "hadn't": 'had not',
    "hadn't've": 'had not have',
    "hasn't": 'has not',
    "haven't": 'have not',
    "he'd": 'he would',
    "he'd've": 'he would have',
    "he'll": 'he will',
    "he'll've": 'he will have',
    "he's": 'he is',
    "how'd": 'how did',
    "how'd'y": 'how do you',
    "how'll": 'how will',
    "how's": 'how is',
    "I'd": 'I would',
    "I'd've": 'I would have',
    "I'll": 'I will',
    "I'll've": 'I will have',
    "I'm": 'I am',
    "I've": 'I have',
    "i'd": 'i would',
    "i'd've": 'i would have',
    "i'll": 'i will',
    "i'll've": 'i will have',
    "i'm": 'i am',
    "i've": 'i have',
    "isn't": 'is not',
    "it'd": 'it would',
    "it'd've": 'it would have',
    "it'll": 'it will',
    "it'll've": 'it will have',
    "it's": 'it is',
    "let's": 'let us',
    "ma'am": 'madam',
    "mayn't": 'may not',
    "might've": 'might have',
    "mightn't": 'might not',
    "mightn't've": 'might not have',
    "must've": 'must have',
    "mustn't": 'must not',
    "mustn't've": 'must not have',
    "needn't": 'need not',
    "needn't've": 'need not have',
    "o'clock": 'of the clock',
    "oughtn't": 'ought not',
    "oughtn't've": 'ought not have',
    "shan't": 'shall not',
    "sha'n't": 'shall not',
    "shan't've": 'shall not have',
    "she'd": 'she would',
    "she'd've": 'she would have',
    "she'll": 'she will',
    "she'll've": 'she will have',
    "she's": 'she is',
    "should've": 'should have',
    "shouldn't": 'should not',
    "shouldn't've": 'should not have',
    "so've": 'so have',
    "so's": 'so as',
    "that'd": 'that would',
    "that'd've": 'that would have',
    "that's": 'that is',
    "there'd": 'there would',
    "there'd've": 'there would have',
    "there's": 'there is',
    "they'd": 'they would',
    "they'd've": 'they would have',
    "they'll": 'they will',
    "they'll've": 'they will have',
    "they're": 'they are',
    "they've": 'they have',
    "to've": 'to have',
    "wasn't": 'was not',
    "we'd": 'we would',
    "we'd've": 'we would have',
    "we'll": 'we will',
    "we'll've": 'we will have',
    "we're": 'we are',
    "we've": 'we have',
    "weren't": 'were not',
    "what'll": 'what will',
    "what'll've": 'what will have',
    "what're": 'what are',
    "what's": 'what is',
    "what've": 'what have',
    "when's": 'when is',
    "when've": 'when have',
    "where'd": 'where did',
    "where's": 'where is',
    "where've": 'where have',
    "who'll": 'who will',
    "who'll've": 'who will have',
    "who's": 'who is',
    "who've": 'who have',
    "why's": 'why is',
    "why've": 'why have',
    "will've": 'will have',
    "won't": 'will not',
    "won't've": 'will not have',
    "would've": 'would have',
    "wouldn't": 'would not',
    "wouldn't've": 'would not have',
    "y'all": 'you all',
    "y'all'd": 'you all would',
    "y'all'd've": 'you all would have',
    "y'all're": 'you all are',
    "y'all've": 'you all have',
    "you'd": 'you would',
    "you'd've": 'you would have',
    "you'll": 'you will',
    "you'll've": 'you will have',
    "you're": 'you are',
    "you've": 'you have',
}

In [3]:
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Replace English contractions in ``text`` with their expanded forms.

    Typographic apostrophes (’) are normalised to ASCII (') first, then every
    key of ``contraction_mapping`` found in the text (case-insensitively) is
    replaced by its mapped value. When the matched text starts with a letter,
    that letter is kept so the original capitalisation survives
    ("Can't" -> "Cannot").

    Parameters
    ----------
    text : str
        The text to expand.
    contraction_mapping : dict of str -> str
        Contraction -> expansion lookup table.

    Returns
    -------
    str
        ``text`` with apostrophes normalised and contractions expanded.
    """
    expanded_text = re.sub("’", "'", text)
    if not contraction_mapping:
        return expanded_text

    # Regex alternation accepts the first alternative that matches, so longer
    # contractions must come first; otherwise "can't've" is consumed as
    # "can't" and leaves a dangling "'ve". Keys are escaped so they are
    # always treated as literal text.
    ordered_keys = sorted(contraction_mapping, key=len, reverse=True)
    contractions_pattern = re.compile(
        '({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
        flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        expanded_contraction = (contraction_mapping.get(match)
                                or contraction_mapping.get(match.lower()))
        if not expanded_contraction:
            # No usable expansion for this spelling: leave the text untouched.
            return match
        first_char = match[0]
        if first_char.isalpha():
            # Keep the case of the first letter as written in the text.
            return first_char + expanded_contraction[1:]
        # Contractions such as "'cause" start with an apostrophe; copying it
        # over the expansion would turn "because" into "'ecause".
        return expanded_contraction

    return contractions_pattern.sub(expand_match, expanded_text)

In [4]:
# Function to Preprocess the Reviews
def clean_doc(doc):
    """Turn one raw review into a list of cleaned tokens.

    Steps: expand contractions, split on whitespace, lower-case, strip every
    character that is not an ASCII letter or '#', drop English stop words,
    lemmatize, and finally drop tokens of length 0 or 1.

    Parameters
    ----------
    doc : str
        Raw review text. Non-string values (e.g. NaN for a missing review)
        are treated as an empty document.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    # A missing review has no tokens; callers already drop empty results.
    if not isinstance(doc, str):
        return []

    # Removing contractions
    doc = expand_contractions(doc)

    # Split on any run of whitespace. Splitting on ' ' alone kept words that
    # were separated by a newline or tab in one token, and they were then
    # glued together ("good\nmovie" -> "goodmovie").
    tokens = [w.lower() for w in doc.split()]

    # remove special characters from each token
    tokens = [re.sub(r"[^a-zA-Z#]", '', w) for w in tokens]

    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop_words]

    # lemmatizing
    lmtzr = nltk.stem.WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(w) for w in tokens]

    # filter out short tokens
    return [word for word in tokens if len(word) > 1]

In [5]:
# Dataset with 8040 reviews
data8000 = pd.read_csv('alreviews_df_8000.csv')

In [6]:
# Metadata columns that are not used further in this notebook.
unused_columns = ['createDate', 'displayImageUrl', 'displayName', 'hasProfanity',
                  'hasSpoilers', 'isSuperReviewer', 'isVerified', 'rating',
                  'timeFromCreation', 'updateDate']
data8000 = data8000.drop(columns=unused_columns)

In [7]:
# Label encoding: 0 when score > 3.0 (treated as a positive review below), 1 otherwise.
is_high_score = data8000['score'] > 3.0
data8000['sentiment'] = np.where(is_high_score, 0, 1)

In [8]:
# Preprocess every review and store the cleaned tokens as one space-joined string.
data8000['modified_review'] = [' '.join(clean_doc(raw_review)) for raw_review in data8000['review']]

In [9]:
# Index labels of the reviews that became empty strings after preprocessing
is_empty_review = data8000['modified_review'].str.len() == 0
nulls = data8000.index[is_empty_review]

In [10]:
# Remove the reviews that are empty after preprocessing.
data8000 = data8000.drop(index=nulls)

In [11]:
# Consecutive row id (0..n-1), used later to tell train rows from held-out rows.
data8000['primary_key'] = np.arange(len(data8000), dtype='int64')

In [12]:
# Separating positive (sentiment == 0) and negative (sentiment == 1) reviews
is_positive = data8000['sentiment'] == 0
is_negative = data8000['sentiment'] == 1
pos_reviews = data8000[is_positive]
neg_reviews = data8000[is_negative]

In [13]:
# Creating positive and negative reviews for training set.
# random_state makes the draw reproducible: random.seed() at the top of the
# notebook only seeds Python's `random` module, not pandas/NumPy sampling.
# pos_reviews / neg_reviews are already filtered by sentiment, so no re-filter is needed.
pos_reviews_train = pos_reviews.sample(n=1600, random_state=123)
neg_reviews_train = neg_reviews.sample(n=1400, random_state=123)

In [14]:
# Keep the reviews whose primary_key does not appear in the training samples
in_pos_train = pos_reviews['primary_key'].isin(pos_reviews_train['primary_key'])
in_neg_train = neg_reviews['primary_key'].isin(neg_reviews_train['primary_key'])
pos_reviews_valid2 = pos_reviews[~in_pos_train]
neg_reviews_valid2 = neg_reviews[~in_neg_train]

In [15]:
# Stack positives on top of negatives: once for the train set, once for the held-out pool
train_parts = [pos_reviews_train, neg_reviews_train]
heldout_parts = [pos_reviews_valid2, neg_reviews_valid2]
data5050 = pd.concat(train_parts, axis=0, ignore_index=True)
valid2_full = pd.concat(heldout_parts, axis=0, ignore_index=True)

In [16]:
print(data5050.sentiment.value_counts())
print(valid2_full.sentiment.value_counts())

0    1600
1    1400
Name: sentiment, dtype: int64
0    4248
1    789 
Name: sentiment, dtype: int64


In [17]:
# Creating positive and negative reviews for validation set and joining them to create validation set.
# random_state makes the draw reproducible (pandas sampling is not seeded by random.seed()).
pos_reviews_valid = valid2_full[valid2_full.sentiment == 0].sample(n=800, random_state=123)
neg_reviews_valid = valid2_full[valid2_full.sentiment == 1].sample(n=400, random_state=123)
valid2 = pd.concat([pos_reviews_valid, neg_reviews_valid], ignore_index=True, axis=0)

In [18]:
print(valid2.sentiment.value_counts())

0    800
1    400
Name: sentiment, dtype: int64


In [20]:
# Making sure that there are no duplicate reviews in train and valid2 set and between them.
print(data5050.duplicated(subset='primary_key').sum())
print(valid2.duplicated(subset='primary_key').sum())
print(pd.concat([valid2,data5050],ignore_index=True,axis=0).duplicated(subset='primary_key').sum())

0
0
0


In [21]:
print(data5050.sentiment.value_counts())
print(valid2.sentiment.value_counts())

0    1600
1    1400
Name: sentiment, dtype: int64
0    800
1    400
Name: sentiment, dtype: int64


In [22]:
# Shuffling the reviews.
# frac=1 returns every row in random order without hard-coding the row counts;
# random_state makes the shuffle reproducible.
data5050 = data5050.sample(frac=1, random_state=123)
valid2 = valid2.sample(frac=1, random_state=123)

In [23]:
#data5050.to_csv('final_allreviews_5050.csv',index=False)
#valid2.to_csv('final_valid2.csv',index=False)

In [24]:
# data1=pd.read_csv('final_allreviews_5050.csv')
# valid2=pd.read_csv('final_valid2.csv')

In [25]:
test = pd.read_csv('test-1566381431512.csv')

In [26]:
test['modified_review'] = test.Review.apply(lambda x: ' '.join(clean_doc(x)))

In [27]:
test.sample()

Unnamed: 0,ReviewID,Review,modified_review
484,93360,Loved the music. Still a Great story. Animation was fantastic.,loved music still great story animation fantastic


In [28]:
X = data5050.modified_review
y = data5050.sentiment

X_train, X_valid1, y_train, y_valid1 = train_test_split(X,y,test_size=0.3, random_state=1234)

In [None]:
X_valid2 = valid2.modified_review
y_valid2 = valid2.sentiment
# Use the preprocessed text for the test set too: the tokenizer and the models
# are fitted on `modified_review`, so feeding raw `Review` here would give the
# test set a different input distribution from train/validation.
X_test = test.modified_review

In [None]:
max_num_words = 10000
seq_len = 50
embedding_size = 100

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=max_num_words)  # Tokenizer is used to tokenize text
tokenizer.fit_on_texts(X_train)  # Fit on the training corpus only

# texts_to_sequences converts each text to a list of word indices;
# pad_sequences makes every sequence a fixed-size list by padding with 0s.
# maxlen uses seq_len so it always agrees with the Embedding layer's input_length.
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=seq_len)
X_valid1 = pad_sequences(tokenizer.texts_to_sequences(X_valid1), maxlen=seq_len)
X_valid2 = pad_sequences(tokenizer.texts_to_sequences(X_valid2), maxlen=seq_len)
X_test = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=seq_len)

X_train.shape, X_valid1.shape, X_valid2.shape, X_test.shape  # Check the dimensions of every split

In [None]:
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout, Conv1D, GlobalMaxPool1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

In [None]:
from keras import backend as K

def recall_m(y_true, y_pred):
    """Recall on backend tensors: rounded true positives / rounded actual positives.

    K.epsilon() is added to the denominator so the ratio stays defined when
    there are no positive labels.
    """
    hit_count = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_count = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hit_count / (actual_count + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision on backend tensors: rounded true positives / rounded predicted positives.

    K.epsilon() is added to the denominator so the ratio stays defined when
    nothing is predicted positive.
    """
    hit_count = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged_count = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hit_count / (flagged_count + K.epsilon())

def F1_score(y_true, y_pred):
    """F1 metric for Keras: harmonic mean of precision_m and recall_m.

    K.epsilon() in the denominator avoids a division by zero when both
    precision and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

In [None]:
# filepath="weights/cnn_weights-improvement-{epoch:02d}-{val_F1_score:.2f}.hdf5"
# checkpoint = ModelCheckpoint(filepath, 
#                              monitor='val_F1_score', 
#                              verbose=1, 
#                              mode='max')

## 1D_CNN

In [None]:
# 1D CNN: embedding -> convolution over 3-token windows -> global max pool -> dense head.
model = Sequential([
    Embedding(input_dim=max_num_words, output_dim=embedding_size, input_length=seq_len),
    Conv1D(64, 3, activation='relu'),
    GlobalMaxPool1D(),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [None]:
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=[F1_score])
model.summary()

In [None]:
model.fit(X_train, y_train,
          batch_size=64,
          epochs=10,
          validation_data=(X_valid1, y_valid1),verbose=2)

In [None]:
# Sequential.predict_classes is not available in newer Keras/TensorFlow releases.
# For a single sigmoid output it thresholded the predicted probability at 0.5,
# which is reproduced explicitly here (same values, shape and dtype).
train_classes = (model.predict(X_train) > 0.5).astype('int32')
valid1_classes = (model.predict(X_valid1) > 0.5).astype('int32')
valid2_classes = (model.predict(X_valid2) > 0.5).astype('int32')

In [None]:
print('Train F1 Score :',round(f1_score(y_train,train_classes),2))
print('Valid1 F1 Score :',round(f1_score(y_valid1,valid1_classes),2))
print('Valid2 F1 Score :',round(f1_score(y_valid2,valid2_classes),2))

    Past Results:
    Train F1 Score : 1.0
    Valid1 F1 Score : 0.79
    Valid2 F1 Score : 0.74

In [None]:
# Threshold the sigmoid output at 0.5 (what the removed predict_classes did) and
# flatten to 1-D so the result can be stored directly as a DataFrame column.
test_classes_cnn = (model.predict(X_test) > 0.5).astype('int32').ravel()

In [None]:
# submission = pd.read_csv('samplesubmission.csv')

# submission.sentiment = test_classes.astype('int64')

# submission.sentiment.dtype

In [None]:
#submission.to_csv('submission8.csv',index=False)
# Train F1 Score : 1.0
# Valid1 F1 Score : 0.79
# Valid2 F1 Score : 0.74
#test_score is 69

## GRU

In [None]:
from keras.layers import GRU, LSTM
from keras.optimizers import Adam

In [None]:
# Two stacked GRU layers on top of a masked embedding, with a sigmoid output.
# NOTE(review): dropout=0.9 is unusually high - confirm it is intended.
model_GRU = Sequential([
    Embedding(max_num_words, 100, mask_zero=True),
    GRU(32, dropout=0.9, return_sequences=True),
    GRU(16, dropout=0.9, return_sequences=False),
    Dense(1, activation='sigmoid'),
])

model_GRU.summary()

In [None]:
model_GRU.compile(loss='binary_crossentropy',optimizer=Adam(lr = 0.001),metrics=[F1_score])
model_GRU.fit(X_train, y_train,batch_size=16,
                            epochs=12,
                            validation_data=(X_valid1, y_valid1),
                            verbose=2)

In [None]:
# Sequential.predict_classes is not available in newer Keras/TensorFlow releases;
# thresholding the sigmoid output at 0.5 is what it did for a single output unit.
train_classes = (model_GRU.predict(X_train) > 0.5).astype('int32')
valid1_classes = (model_GRU.predict(X_valid1) > 0.5).astype('int32')
valid2_classes = (model_GRU.predict(X_valid2) > 0.5).astype('int32')

print('Train F1 Score :',round(f1_score(y_train,train_classes),2))
print('Valid1 F1 Score :',round(f1_score(y_valid1,valid1_classes),2))
print('Valid2 F1 Score :',round(f1_score(y_valid2,valid2_classes),2))

    Past Results:
    Train F1 Score : 0.93
    Valid1 F1 Score : 0.79
    Valid2 F1 Score : 0.75

In [None]:
# Threshold the sigmoid output at 0.5 (what the removed predict_classes did) and
# flatten to 1-D so the result can be stored directly as a DataFrame column.
test_classes_gru = (model_GRU.predict(X_test) > 0.5).astype('int32').ravel()

In [None]:
test1 = pd.read_csv('test-1566381431512.csv')

In [None]:
test1['sentiment_gru'] = test_classes_gru

In [None]:
test1['sentiment_cnn'] = test_classes_cnn

## LSTM

In [None]:
# Two stacked LSTM layers on top of a masked embedding, with a sigmoid output.
# NOTE(review): dropout=0.9 is unusually high - confirm it is intended.
model_LSTM = Sequential([
    Embedding(max_num_words, 100, mask_zero=True),
    LSTM(64, dropout=0.9, return_sequences=True),
    LSTM(64, dropout=0.9, return_sequences=False),
    Dense(1, activation='sigmoid'),
])
model_LSTM.summary()

In [None]:
model_LSTM.compile(loss='binary_crossentropy',optimizer=Adam(lr = 0.01, decay=0.001),metrics=[F1_score])
model_LSTM.fit(X_train, y_train,batch_size=32,
                            epochs=10,
                            validation_data=(X_valid1, y_valid1),
                            verbose=2)

In [None]:
# Sequential.predict_classes is not available in newer Keras/TensorFlow releases;
# thresholding the sigmoid output at 0.5 is what it did for a single output unit.
train_classes = (model_LSTM.predict(X_train) > 0.5).astype('int32')
valid1_classes = (model_LSTM.predict(X_valid1) > 0.5).astype('int32')
valid2_classes = (model_LSTM.predict(X_valid2) > 0.5).astype('int32')

print('Train F1 Score :',round(f1_score(y_train,train_classes),2))
print('Valid1 F1 Score :',round(f1_score(y_valid1,valid1_classes),2))
print('Valid2 F1 Score :',round(f1_score(y_valid2,valid2_classes),2))

    Past Results:
    Train F1 Score : 0.95
    Valid1 F1 Score : 0.81
    Valid2 F1 Score : 0.75

In [None]:
# Threshold the sigmoid output at 0.5 (what the removed predict_classes did) and
# flatten to 1-D so the result can be stored directly as a DataFrame column.
test_classes_lstm = (model_LSTM.predict(X_test) > 0.5).astype('int32').ravel()

In [None]:
test1['sentiment_lstm'] = test_classes_lstm

In [None]:
test1.sentiment_cnn.value_counts()

In [None]:
test1.sentiment_gru.value_counts()

In [None]:
test1.sentiment_lstm.value_counts()

### Looking into the reviews where each model had different predictions

In [None]:
# Rows where the three models are not unanimous (neither all 1 nor all 0).
all_predict_one = (test1.sentiment_cnn == 1) & (test1.sentiment_gru == 1) & (test1.sentiment_lstm == 1)
all_predict_zero = (test1.sentiment_cnn == 0) & (test1.sentiment_gru == 0) & (test1.sentiment_lstm == 0)
test1[~(all_predict_one | all_predict_zero)]

In [None]:
# Keep only the per-model prediction columns.
test1 = test1.drop(columns=['ReviewID', 'Review'])

In [None]:
# Majority vote of the three models. The vote is restricted to the prediction
# columns so that no other column (or 'mode1' itself when the cell is re-run)
# takes part in it. mode(axis=1) returns a DataFrame; column 0 is the first mode.
vote_columns = ['sentiment_cnn', 'sentiment_gru', 'sentiment_lstm']
test1['mode1'] = test1[vote_columns].mode(axis=1)[0].astype('int64')

In [None]:
test1.sample(5)

In [None]:
# submission = pd.read_csv('samplesubmission.csv')

# submission.sentiment = test1.mode1

# submission.sentiment.dtype

# submission.sentiment.value_counts()

# submission.to_csv('submission9.csv',index=False)

# Test score = 71

# ML Techniques

In this section we try classical ML models (on TF-IDF features) using the raw, unprocessed review text.

In [29]:
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.90,max_features=1000,stop_words='english')

In [31]:
X = data5050.review
y = data5050.sentiment

In [32]:
X_train, X_valid1, y_train, y_valid1 = train_test_split(X,y,test_size=0.2, random_state=1234)

In [33]:
X_valid2 = valid2['review']
y_valid2 = valid2['sentiment']
X_test = test['Review']

In [34]:
print(X_train.shape)
print(y_train.shape)
print(X_valid1.shape)
print(y_valid1.shape)
print(X_valid2.shape)
print(y_valid2.shape)

(2400,)
(2400,)
(600,)
(600,)
(1200,)
(1200,)


In [35]:
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_valid1_tfidf = tfidf_vectorizer.transform(X_valid1)
X_valid2_tfidf = tfidf_vectorizer.transform(X_valid2)

X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [36]:
# L2-regularised logistic regression on the TF-IDF features.
logreg = LogisticRegression(C=1.25, penalty='l2')
lr_clf = logreg.fit(X_train_tfidf, y_train)

train_pred = lr_clf.predict(X_train_tfidf)
valid1_pred = lr_clf.predict(X_valid1_tfidf)
valid2_pred = lr_clf.predict(X_valid2_tfidf)

# Report F1 on every split.
for caption, truth, predicted in (('Train F1 Score :', y_train, train_pred),
                                  ('Valid1 F1 Score :', y_valid1, valid1_pred),
                                  ('Valid2 F1 Score :', y_valid2, valid2_pred)):
    print(caption, round(f1_score(truth, predicted), 3))

Train F1 Score : 0.882
Valid1 F1 Score : 0.803
Valid2 F1 Score : 0.775


    Past Results:
    Train F1 Score : 0.882
    Valid1 F1 Score : 0.803
    Valid2 F1 Score : 0.775

In [None]:
# Multinomial naive Bayes on the TF-IDF features.
classifier = MultinomialNB(alpha=0.1)
NB_clf = classifier.fit(X_train_tfidf, y_train)

train_pred = NB_clf.predict(X_train_tfidf)
valid1_pred = NB_clf.predict(X_valid1_tfidf)
valid2_pred = NB_clf.predict(X_valid2_tfidf)

# Report F1 on every split.
for caption, truth, predicted in (('Train F1 Score :', y_train, train_pred),
                                  ('Valid1 F1 Score :', y_valid1, valid1_pred),
                                  ('Valid2 F1 Score :', y_valid2, valid2_pred)):
    print(caption, round(f1_score(truth, predicted), 3))

In [None]:
#param_grid = {'C':[4,6,8],'gamma': [2,2.5,3]}

In [None]:
#svm_model_grid = GridSearchCV(SVC(kernel='rbf'),param_grid,n_jobs=-1,cv=5)

In [None]:
# RBF-kernel SVM on the TF-IDF features.
svm_classifier = SVC(C=25, gamma=2, kernel="rbf")
svm_clf = svm_classifier.fit(X_train_tfidf, y_train)
#svm_clf = svm_model_grid.fit(X_train_tfidf,y_train)

train_pred = svm_clf.predict(X_train_tfidf)
valid1_pred = svm_clf.predict(X_valid1_tfidf)
valid2_pred = svm_clf.predict(X_valid2_tfidf)

# Report F1 on every split.
for caption, truth, predicted in (('Train F1 Score :', y_train, train_pred),
                                  ('Valid1 F1 Score :', y_valid1, valid1_pred),
                                  ('Valid2 F1 Score :', y_valid2, valid2_pred)):
    print(caption, round(f1_score(truth, predicted), 3))

SVM REPORT
<br><br>
{'C': 10, 'gamma': 1}
Train F1 Score : 0.995
Valid1 F1 Score : 0.803
Valid2 F1 Score : 0.813
<br><br>
{'C': 6, 'gamma': 2}
Train F1 Score : 0.995
Valid1 F1 Score : 0.826
Valid2 F1 Score : 0.817
<br><br>
{'C': 4, 'gamma': 2}
Train F1 Score : 0.995
Valid1 F1 Score : 0.824
Valid2 F1 Score : 0.815
<br><br>
{'C': 10, 'gamma': 2}
Train F1 Score : 0.995
Valid1 F1 Score : 0.821
Valid2 F1 Score : 0.825
<br><br>
{'C': 6, 'gamma': 1.8}
Train F1 Score : 0.995
Valid1 F1 Score : 0.827
Valid2 F1 Score : 0.828
<br><br>