# Hackathon Sentiment Analysis
    

In [74]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import string

# NLTK's English stop-word list; used further down to filter tokens out of the
# training reviews before the TF-IDF vectorizer is fitted.
stop_words = stopwords.words('english')

## Cleaning and preprocessing of the reviews


In [75]:
# Read the raw training reviews and discard rows that have no review text.
dfTrain = (
    pd.read_csv('data/train.csv', sep=",", encoding="utf-8", quotechar='"')
    .dropna(subset=['ReviewText'])
)

Wall time: 11 s


In [None]:
# Normalise the review text: lower-case it and delete ASCII punctuation.
# The translation table does not depend on the review, so it is built once
# here instead of once per row.
punctuation_table = str.maketrans('', '', string.punctuation)
dfTrain['ReviewText'] = dfTrain['ReviewText'].apply(
    lambda text: str(text).lower().translate(punctuation_table)
)

In [76]:
# Balance the classes: draw the same number of reviews for every rating value.
SAMPLES_PER_RATING = 50000
SAMPLE_SEED = 15  # fixed seed so the sample is reproducible

# DataFrame.append was removed in pandas 2.0; pd.concat builds the frame in a
# single step and keeps the original row labels, which the split below needs.
df_train = pd.concat(
    [
        dfTrain[dfTrain['Rating'] == rating].sample(
            n=SAMPLES_PER_RATING, random_state=SAMPLE_SEED
        )
        for rating in (1, 2, 3, 4, 5)
    ]
)

# Every row that was not sampled for training becomes the held-out test set.
dftest = dfTrain.drop(labels=df_train.index)

Define the term frequency-inverse document frequency (tf-idf) vectorizer parameters and then convert the review list into a tf-idf matrix.

To get a tf-idf matrix, first count word occurrences by review. This is transformed into a document-term matrix (dtm), which is also called a term frequency matrix.

Then apply the term frequency-inverse document frequency weighting: words that occur frequently within a document but not frequently within the corpus receive a higher weighting as these words are assumed to contain more meaning in relation to the document.

In [77]:
# Load the two validation review sets, using the first CSV column as the index.
# NOTE(review): the same two files are read again, with identical arguments, in
# the "Sentiments on Test results" section further down.
df_validation_set1 = pd.read_csv('test1_generic_reviews.csv', sep=",", encoding="utf-8", quotechar='"', index_col=0)
df_validation_set2 = pd.read_csv('testB_dell_reviews.csv', sep=",", encoding="utf-8", quotechar='"', index_col=0)

In [79]:
import numpy as np 
import pandas as pd 
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import re
from keras.utils import to_categorical
import random
from tensorflow import set_random_seed
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense,Dropout,Embedding,LSTM
from keras.callbacks import EarlyStopping
from keras.losses import categorical_crossentropy
from keras.optimizers import Adam
from keras.models import Sequential
from tqdm import tqdm
import warnings
# Silence UserWarning messages that originate from the bs4 module.
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')
# Single lemmatizer instance shared by clean_sentences().
lemmatizer = WordNetLemmatizer()

In [80]:
# Drop stop words from every training review and re-join the remaining tokens
# with single spaces.
# A set gives constant-time membership tests; scanning the stop-word list for
# every token of every review is needlessly slow on a corpus of this size.
stop_word_set = set(stop_words)
df_train['ReviewText'] = (
    df_train['ReviewText']
    .str.split()
    .apply(lambda tokens: ' '.join(token for token in tokens if token not in stop_word_set))
)

In [81]:
df_train.head()

Unnamed: 0,ReviewText,Rating
239969,eforcity laptop chill pad dual fan notebook co...,1
267703,5 months drive trash connector part flimsy sna...,1
343683,tow cheap things work well enough low quality ...,1
100239,got one wife worked week builtin software woul...,1
575687,13 years experience manager first found produc...,1


In [82]:
# Learn the TF-IDF vocabulary and weights from the balanced training reviews
# only. `dataf` is whatever fit() returns and is the object used for every
# later transform() call.
train_reviews = df_train['ReviewText']
tfidf = TfidfVectorizer()
dataf = tfidf.fit(train_reviews)

Wall time: 16.9 s


In [None]:
def clean_sentences(df):
    """Tokenise and lemmatise every review in ``df['ReviewText']``.

    Each review is stripped of HTML markup, has every non-alphabetic character
    replaced by a space, is lower-cased, split into tokens with
    ``word_tokenize`` and finally lemmatised token by token.

    Parameters
    ----------
    df : object supporting ``df['ReviewText']``
        Container whose ``ReviewText`` entry iterates over review strings.

    Returns
    -------
    list of list of str
        One list of lemmatised tokens per review, in input order.
    """
    reviews = []

    for sent in tqdm(df['ReviewText']):
        # Remove HTML content. The parser is named explicitly so the result
        # does not depend on which optional parsers happen to be installed.
        review_text = BeautifulSoup(sent, "html.parser").get_text()

        # Replace everything that is not an ASCII letter with a space.
        review_text = re.sub("[^a-zA-Z]", " ", review_text)

        # Tokenise the lower-cased text.
        words = word_tokenize(review_text.lower())

        # Reduce each token to its lemma.
        reviews.append([lemmatizer.lemmatize(word) for word in words])

    return reviews

# Clean the reviews of the train and test frames, then print how many reviews
# each cleaned list contains.
# NOTE(review): `df_full` and `df_test` are not defined in any visible cell
# (the frames built above are named `df_train` and `dftest`), so this cell
# would raise a NameError on a fresh kernel — confirm the intended inputs.
train_sentences = clean_sentences(df_full)
test_sentences = clean_sentences(df_test)
print(len(train_sentences))
print(len(test_sentences))

In [83]:
%%time
# Targets: the Rating column of each split.
y_train = df_train['Rating']
y_test = dftest['Rating']

# Features: both splits projected with the vectorizer fitted on the training reviews.
X_traindata = dataf.transform(df_train['ReviewText'])
X_test = dataf.transform(dftest['ReviewText'])

Wall time: 2min 18s


## Train the models

In [84]:
# Multinomial Naive Bayes with default settings, trained on the TF-IDF
# features; the ratings are cast to integers to serve as class labels.
model = MultinomialNB()
model.fit(X_traindata, y_train.astype(int))

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [85]:
# Score the model on the data it was fitted on and on the held-out reviews.
train_predict = model.predict(X_traindata)
test_predict = model.predict(X_test)

print(f"Accuracy for train: {metrics.accuracy_score(y_train, train_predict)}")
# Label corrected from "text" to "test": this line reports the held-out score.
print(f"Accuracy for test: {metrics.accuracy_score(y_test, test_predict)}")

Accuracy for train: 0.595888
Accuracy for test: 0.5425340442628187


## Sentiments on Test results

In [88]:
%%time
# Both validation files share the same CSV dialect; the first column is the index.
csv_options = dict(sep=",", encoding="utf-8", quotechar='"', index_col=0)
df_validation_set1 = pd.read_csv('test1_generic_reviews.csv', **csv_options)
df_validation_set2 = pd.read_csv('testB_dell_reviews.csv', **csv_options)

Wall time: 82 ms


Replace null values with empty strings to guarantee that the test set is consistent with our model.

In [89]:
# Replace missing reviews with empty strings so the vectorizer only ever sees
# text. The result is assigned back to the column: calling
# fillna(..., inplace=True) on a column selection acts on an intermediate
# object and does not update the frame under pandas copy-on-write.
df_validation_set1['ReviewText'] = df_validation_set1['ReviewText'].fillna('')
df_validation_set2['ReviewText'] = df_validation_set2['ReviewText'].fillna('')

Apply the fitted tf-idf vectorizer to both datasets.

In [90]:
# The vectorizer was fitted on reviews that had been lower-cased and stripped
# of punctuation, so the validation text gets the same normalisation before it
# is transformed; otherwise training and inference text are preprocessed
# differently.
_punctuation_table = str.maketrans('', '', string.punctuation)


def _normalize_reviews(reviews):
    """Return ``reviews`` lower-cased with ASCII punctuation removed.

    Missing values are treated as empty strings.
    """
    return reviews.fillna('').astype(str).str.lower().str.translate(_punctuation_table)


X_validation1 = dataf.transform(_normalize_reviews(df_validation_set1['ReviewText']))
X_validation2 = dataf.transform(_normalize_reviews(df_validation_set2['ReviewText']))

Predict the ratings for both datasets.

In [91]:
# Predict a rating for every review in each validation feature matrix.
predicted1, predicted2 = (
    model.predict(features) for features in (X_validation1, X_validation2)
)

Write our results to CSV files.

In [92]:
# Pair each review with its predicted rating (the validation frame's index is kept).
pd_validation1 = pd.DataFrame({'ReviewText': df_validation_set1['ReviewText'], 'PredictedRating': predicted1})
pd_validation2 = pd.DataFrame({'ReviewText': df_validation_set2['ReviewText'], 'PredictedRating': predicted2})

# Persist both result frames using the same CSV dialect.
for result_frame, output_path in (
    (pd_validation1, 'validation7.csv'),
    (pd_validation2, 'validation8.csv'),
):
    result_frame.to_csv(output_path, sep=",", encoding="utf-8", quotechar='"')