In [2]:
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
# nltk.download('punkt')

In [3]:
train_df = pd.read_json('train.json')
print(train_df.shape)
train_df.head()

(7401, 2)


Unnamed: 0,reviews,sentiments
0,I bought this belt for my daughter in-law for ...,1
1,The size was perfect and so was the color. It...,1
2,"Fits and feels good, esp. for doing a swim rac...",1
3,These socks are absolutely the best. I take pi...,1
4,Thank you so much for the speedy delivery they...,1


In [4]:
train_df['sentiments'].value_counts()


1    6319
0    1082
Name: sentiments, dtype: int64

In [5]:
# clean reviews
# Normalization - lower cased, remove stopwords, punctuations, special chars, numbers 
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

stop = stopwords.words('english')
train_df['cleaned_reviews'] = train_df['reviews'].str.lower().apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)])).apply(remove_punctuations).map(lambda x: re.sub(r'\W+\d+', '', x))

In [6]:
# train_df['cleaned_reviews'].to_csv('cleaned_reviews', encoding='utf-8', index=False)

In [7]:
# stemming
# ps = PorterStemmer()
# def stem_words(text):
#     return ' '.join([ps.stem(word) for word in text.split()])
# train_df['stemmed_reviews'] = train_df['cleaned_reviews'].apply(stem_words)

In [8]:
# lemmatizer - eg. loved => love, loving => love, lovely => love
lemmatizer = WordNetLemmatizer()
def lemmatize_review(text_review):
    empty = []
    #loop for every tokenized word & associated tag in list
    for word, tag in pos_tag(word_tokenize(text_review)):
        #get first letter of tag in lower-case
        word_tag = tag[0].lower()
        word_tag = word_tag if word_tag in ['a', 'r', 'n', 'v'] else None
        if not word_tag:
            empty.append(word)
        else:
            # lemmatize word if word is an adjective(a), adverb(r), noun(n), or verb(v)
            lemma = lemmatizer.lemmatize(word, word_tag)
            empty.append(lemma)
    return ' '.join(empty)
    

In [9]:
train_df['lemmatized_reviews'] = train_df['cleaned_reviews'].apply(lemmatize_review)

In [10]:
from sklearn.model_selection import train_test_split
# 80% training, 20% testing
X_train, X_test, Y_train, Y_test = train_test_split(train_df['lemmatized_reviews'], train_df['sentiments'], test_size=0.20, random_state=50, stratify=train_df['sentiments'])

In [11]:
# warnings may appear as custom tokenizer is used
# max_df=0.9 - ignore words that appear in more than 90% of the documents
# min_df=5 - ignore words that appear in 5 or less documents, also to filter misspelled words
# max_features=2000 - get top 2000 words by freq
vectorizer = TfidfVectorizer(tokenizer=word_tokenize, max_df=0.9, min_df=5, max_features=2000)
vectorizer = vectorizer.fit(X_train)
X_train_vectorized = vectorizer.transform(X_train)
X_train_vectorized.shape



(5920, 2000)

In [12]:
# key tuning parameters: C, penalty, solver
# C minimises model to overfit training data, inverse of regularization strength
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
# how C is determined using GridSearchCV (cross-validation method)
parameters = {'C':[0.001,0.01,0.1,1,10,100,1000]}
log_reg_model = LogisticRegression(max_iter=50000)
cv = GridSearchCV(log_reg_model, parameters)
cv.fit(X_train_vectorized, Y_train)
cv.best_params_

{'C': 10}

In [13]:
from sklearn.linear_model import LogisticRegression
# A higher value of C tells the model to give more weight to the training data
# When we increase value of C the importance of finding proper decision boundary is increased
# and will give less importance weight values.
# C=1 by default
model = LogisticRegression(C=10)
model.fit(X_train_vectorized, Y_train)

LogisticRegression(C=10)

In [14]:
X_test_vectorized = vectorizer.transform(X_test)
model_predictions = model.predict(X_test_vectorized)

In [15]:
#A model whose predictions are 100% wrong has an AUC of 0.0
# one whose predictions are 100% correct has an AUC of 1.0
from sklearn.metrics import roc_auc_score
print('AUC score:', roc_auc_score(Y_test, model_predictions))

AUC score: 0.7433591699235841


In [16]:
#accuracy = correct_predictions / total_predictions
from sklearn.metrics import accuracy_score
print('Accuracy score:', accuracy_score(Y_test, model_predictions))

Accuracy score: 0.900742741390952


In [21]:
# calcualate logistic loss
from sklearn.metrics import log_loss
log_loss(Y_test, model.predict_proba(X_test_vectorized))


0.24553587430757468

In [None]:
feature_names = np.array(vectorizer.get_feature_names_out())
sorted_coefficient_index = model.coef_[0].argsort()

In [None]:
print("Smallest coefficient", feature_names[sorted_coefficient_index[:10]])

Smallest coefficient ['poor' 'disappointed' 'return' 'bother' 'cheap' 'worst' 'terrible'
 'disappoint' 'elsewhere' 'portion']


In [None]:
print("Largest coefficient", feature_names[sorted_coefficient_index[:-11:-1]])

Largest coefficient ['great' 'love' 'comfortable' 'perfect' 'excellent' 'happy' 'easy' 'best'
 'soft' 'pleased']


In [None]:
test_df = pd.read_json('test.json')
test_df.head()
print(test_df.shape)

(1851, 1)


In [None]:
X_test = vectorizer.transform(test_df['reviews'])
print(X_test.shape)

(1851, 2000)


In [None]:
predictions = model.predict(X_test)
print(predictions)

[0 1 1 ... 1 1 1]


In [None]:
submission = pd.DataFrame({
     "Review": list(test_df['reviews']),
     "Sentiments":list(predictions),
})
submission.to_csv("submission.csv", index=False)

In [None]:
# .str.replace('[{}]'.format(string.punctuation), '', regex=True)
#train_df['tokenized_reviews'].to_csv('tokenized_reviews', encoding='utf-8', index=False)