Importing Libraries

In [3]:
import pandas as pd
import numpy as np
import sqlite3 as sq
from bs4 import BeautifulSoup
from nltk.stem import SnowballStemmer
from tqdm import tqdm    # for progress bar
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score , confusion_matrix
import seaborn as sns
from sklearn.model_selection import train_test_split
import gensim , gensim.downloader as api      #for downloading google w2v dataset
from gensim.models import word2vec
import warnings
warnings.filterwarnings('ignore')
import re
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE, SMOTENC
from sklearn.preprocessing import Normalizer, StandardScaler
from collections import Counter


ModuleNotFoundError: No module named 'imblearn'

Importing Dataset

In [None]:
# conn = sq.connect("/content/database.sqlite")

# # to identify object names present in SQLite DB
# cursor = conn.cursor()
# query  = "SELECT name from sqlite_master"
# cursor.execute(query)
# cursor.fetchall()

In [None]:
# Load the reviews from a CSV copy of the dataset; the original SQLite read is kept
# below (commented out) for reference.
# NOTE(review): the path is an absolute Colab / Google Drive location — presumably the
# drive must be mounted first; confirm before re-running in another environment.
# data = pd.read_sql_query('SELECT * FROM REVIEWS', conn)
data = pd.read_csv('/content/drive/MyDrive/Amazon-Fine-Food-Reviews.csv')

# Quick sanity check: first three rows and the (rows, columns) shape.
print(data.head(3))
print(data.shape)

In [None]:
# conn.close()            #closing SQlite connection

In [None]:
#saving dataset in CSV format
# data.to_csv('Amazon-Fine-Food-Reviews.csv')

#568454 rows are present in dataset.
# checking count and no of distinct target outputs present in target column
data['Score'].value_counts()

In [None]:
count = data['Score'].value_counts()
type(count)

In [None]:
#Function for bar plot
def bar_plot(x, y, width, xlabel, ylabel, label, color):
    """Draw one labelled bar chart on the current pyplot figure and show it.

    Args:
        x: bar positions / categories, forwarded to ``plt.bar``.
        y: bar heights, forwarded to ``plt.bar``.
        width: width of each bar.
        xlabel: text for the x axis.
        ylabel: text for the y axis.
        label: legend entry for the bar series.
        color: bar colour.

    Returns:
        None. The figure is rendered via ``plt.show()``.
    """
    bar_style = {"width": width, "color": color, "label": label}
    plt.bar(x, y, **bar_style)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend()  # picks up the `label` given to plt.bar above
    plt.show()

In [None]:
bar_plot(count.keys(),count.values,0.25,'Score','Frequency','Frequency of each rating score','blue')

In [None]:
# considering score > 3 as positive sentiment , <3 as negative sentiment and =3 as neutral.

df = data[data['Score']!=3]  #Removing neutral reviews from the dataset
print(df['Score'].value_counts())
print(df.shape)


In [None]:
# % of data retained
print((df.shape[0]*100)/data.shape[0],"%")
val = df['Score'].value_counts()
bar_plot(val.keys(),val.values,0.25,'Score','Frequency','Frequency of each rating score','green')

In [None]:
df.head()

In [None]:
len(df['UserId'].unique())  # no of unique userids

In [None]:
# removing duplicates reviews from same user for a particular product
df_new = df.drop_duplicates(subset=['ProductId','UserId','Text','ProfileName'], keep='first')
df_new.shape

In [None]:
print('No of duplicate reviews = ', df.shape[0]-df_new.shape[0])

In [None]:
# HelpfulnessNumerator   : number of people who found the review helpful.
# HelpfulnessDenominator : number of people who indicated whether or not they found the review helpful.
# The numerator can never legitimately exceed the denominator, so such records are removed.
helpful_votes = df_new['HelpfulnessNumerator']
total_votes = df_new['HelpfulnessDenominator']
print(len(df_new[helpful_votes > total_votes]))

#only 2 such records are present
df_new = df_new[helpful_votes <= total_votes]
print(df_new.shape)

In [None]:
#checking for null values
print(df_new.isnull().any())

In [None]:
# mapping score>3 as positive review [1] and score <3 as negative review [0]
def re_score(x):
    """Collapse a star rating into a binary sentiment label.

    Args:
        x: rating value; anything comparable with ``3``.

    Returns:
        0 when ``x`` is below 3 (negative review), otherwise 1.
    """
    return 0 if x < 3 else 1

# Overwrite the star ratings in df_new with binary labels (0 = negative, 1 = positive).
# NOTE: this cell is not idempotent. After the first run the column only holds 0/1, and
# re_score returns 0 for every value below 3, so running the cell a second time turns
# every label into 0. Re-create df_new from the cells above before re-running.
df_new['Score'] = df_new['Score'].map(re_score)
print(df_new['Score'].value_counts())

In [None]:
# Plot the class balance of the relabelled data. The counts must come from df_new
# (deduplicated, Score mapped to 0/1); df still holds the raw 1/2/4/5 star ratings,
# so plotting it here would just repeat the earlier rating histogram.
count = df_new['Score'].value_counts()
bar_plot(count.keys(),count.values,0.1,'Score','Frequency','Frequency of each rating score','blue')

In [None]:
xwee = df_new['Text'].values


Positive reviews = 442867
Negative reviews = 81718

Our dataset is an imbalanced dataset.
There are many methods for handling an imbalanced dataset; we will be using SMOTE (an oversampling method that creates artificial instances of the minority class by taking a minority sample, selecting one of its nearest neighbours at random using KNN, and generating a new point between the two).

Another option is to use tree-based models, as they handle imbalanced data better than non-tree-based models.
For metrics, we should use the F1 score, which depends on both precision and recall, i.e. the F1 score improves only when the model both finds more of the true positives (recall) and makes fewer false positive predictions (precision).

Resampling

In [None]:
#For oversampling of dataset as we have an imbalance dataset

def resampling(train_data,test_data):
    """Oversample with SMOTE and return the resampled pair.

    Despite the parameter names, the two arguments are handed to
    ``SMOTE.fit_resample`` positionally, i.e. ``train_data`` is used as the
    feature matrix and ``test_data`` as the target labels.

    Args:
        train_data: first argument to ``fit_resample`` (features).
        test_data: second argument to ``fit_resample`` (labels).

    Returns:
        Tuple of the two resampled objects, in the same order as the inputs.
    """
    oversampler = SMOTE(sampling_strategy='auto', random_state=42)
    features_balanced, labels_balanced = oversampler.fit_resample(train_data, test_data)
    return features_balanced, labels_balanced

Text Preprocessing

In [None]:
# Removing URLs from data
def preprocessing_url(text):
    """Delete URL-like tokens from ``text``.

    Any run of non-whitespace characters beginning with ``http`` or ``www``
    is removed; surrounding whitespace is left untouched.

    Args:
        text: input string.

    Returns:
        The string with the matched runs removed.
    """
    for url_pattern in (r'http\S+', r'www\S+'):
        text = re.sub(url_pattern, '', text)
    return text


In [None]:
# Removing HTML content from data
def preprocessing_html(text):
    """Strip HTML markup from ``text`` and return only the visible text.

    The parser is named explicitly. Without it BeautifulSoup picks whichever
    parser happens to be installed (so results can differ between machines)
    and emits a "no parser was explicitly specified" warning. ``html.parser``
    ships with Python, so no extra dependency is required.

    Args:
        text: string that may contain HTML tags/entities.

    Returns:
        The text content with markup removed.
    """
    return BeautifulSoup(text, "html.parser").get_text()

In [None]:
# Performing decontractions on data
def decontraction(text):
    """Expand common English contractions in ``text``.

    Every replacement that stands for a separate word carries a leading
    space, so the expansion does not fuse with the preceding word
    ("don't" -> "do not", "they're" -> "they are"). Matching is
    case-insensitive so capitalised forms such as "Won't" are expanded too;
    the inserted words are always lower-case.

    Args:
        text: input string.

    Returns:
        The string with contractions expanded.
    """
    # Order matters: the irregular forms must be handled before the generic
    # "n't" rule, and "n't" before the bare "'t" rule.
    rules = (
        (r"won't", "will not"),
        (r"can't", "can not"),
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )
    for pattern, expansion in rules:
        text = re.sub(pattern, expansion, text, flags=re.IGNORECASE)
    return text


In [None]:
import nltk
nltk.download('stopwords')

In [None]:
stop_words = stopwords.words('english')

In [None]:
# defining stopwords vocabulary
# stopwords = stopwords.words('english')
# print(type(stopwords))

# removing stopwords and lowering cases of words from dataset
def remove_stopwords(text, stopword_list=None):
    """Lower-case ``text`` and drop stop words.

    Tokens are lower-cased *before* the stop-word check, so capitalised
    stop words ("The", "IS") are removed as well; previously the check ran
    on the original casing and let them through.

    Args:
        text: whitespace-separated input string.
        stopword_list: optional iterable of lower-case stop words. When
            omitted, the module-level ``stop_words`` is used.

    Returns:
        The remaining lower-cased tokens joined by single spaces.
    """
    if stopword_list is None:
        stopword_list = stop_words
    blocked = set(stopword_list)  # constant-time membership instead of a list scan per token
    lowered_tokens = (token.lower() for token in text.split())
    return " ".join(token for token in lowered_tokens if token not in blocked)

In [None]:
# stemming words
stemmer = SnowballStemmer('english')

def stemming(text):
    """Stem every whitespace-separated token of ``text``.

    Uses the module-level ``stemmer`` and re-joins the stemmed tokens with
    single spaces.

    Args:
        text: input string.

    Returns:
        String of stemmed tokens separated by single spaces.
    """
    stemmed_tokens = []
    for token in text.split():
        stemmed_tokens.append(stemmer.stem(token))
    return " ".join(stemmed_tokens)

In [None]:
#Single Function for text preprocessing
def text_preprocessing(df):
    """Run the full cleaning pipeline over an iterable of raw review strings.

    Steps applied to each text, in order: URL removal, HTML stripping,
    contraction expansion, stop-word removal (with lower-casing), stemming,
    removal of tokens containing digits, and replacement of every run of
    non-alphanumeric characters with a single space.

    Args:
        df: iterable of strings (despite the name, it is iterated directly,
            e.g. a 1-D array of texts).

    Returns:
        List of cleaned strings, one per input text, in the same order.
    """
    arr = []
    for text in tqdm(df):
        text = preprocessing_url(text)
        text = preprocessing_html(text)
        text = decontraction(text)
        text = remove_stopwords(text)
        text = stemming(text)
        # Raw strings: '\S' and '\d' in a normal string literal are invalid escape
        # sequences and raise a warning on recent Python versions.
        text = re.sub(r'\S*\d\S*', "", text)       # drop every whitespace-delimited token that contains a digit
        text = re.sub(r'[^A-Za-z0-9]+', " ", text) # collapse each run of non-alphanumeric characters into one space
        arr.append(text)

    return arr

In [None]:
corpus = text_preprocessing(df_new['Text'].values)

In [None]:
abcd = corpus
dataset = df_new.copy()
dataset['Text'] = corpus
dataset.head(3)

In [None]:
df_new.shape

In [None]:
train_data , test_data = train_test_split(dataset,test_size=0.25, random_state=21)
print("size of training data:", len(train_data))
print("size of test data:", len(test_data))

In [None]:
# corpus = []

# for text in tqdm(df_new['Text'].values):
#     text = preprocessing_url(text)
#     text = preprocessing_html(text)
#     text = decontraction(text)
#     text = remove_stopwords(text)
#     text = stemming(text)
#     text = re.sub('\S*\d\S*',"",text)    # removes any substrings from the string text that contain at least one digit surrounded by non-digit characters.
#     text = re.sub('[^A-Za-z0-9]+'," ",text) # replaces any sequence of characters in the string s that is not a letter or a digit with a single space character " ".
#     corpus.append(text)



Vectorization

1. Bag of Words (BOW)

In [None]:
def bag_of_words(data):
    """Fit a default ``CountVectorizer`` on ``data``.

    Args:
        data: iterable of text documents used to learn the vocabulary.

    Returns:
        The fitted vectorizer (call ``.transform`` on it to encode text).
    """
    count_vectorizer = CountVectorizer()
    fitted_vectorizer = count_vectorizer.fit(data)
    return fitted_vectorizer


2. TF-IDF

In [None]:
def tf_idf(data):
    """Fit a default ``TfidfVectorizer`` on ``data``.

    Args:
        data: iterable of text documents used to learn vocabulary and weights.

    Returns:
        The fitted vectorizer (call ``.transform`` on it to encode text).
    """
    tfidf_vectorizer = TfidfVectorizer()
    fitted_vectorizer = tfidf_vectorizer.fit(data)
    return fitted_vectorizer


3. Average Word2Vec

In [None]:
def avg_w2v(arr, vector_size=150, window=25, min_count=2):
    """Train Word2Vec on ``arr`` and return one averaged word vector per document.

    Fixes over the previous version:
      * the averaging loop now iterates over ``arr``; it used to iterate over
        the global ``data`` DataFrame, which yields column names rather than
        the documents passed in;
      * out-of-vocabulary tokens are skipped with an explicit membership
        test instead of a bare ``except`` that hid every error;
      * the embedding size is a parameter instead of a repeated literal, and
        the dimension printout no longer indexes ``avg_list[1]`` (which
        failed when fewer than two documents were given).

    Args:
        arr: iterable of whitespace-tokenisable strings (one per document).
        vector_size: dimensionality of the word vectors.
        window: Word2Vec context window.
        min_count: words seen fewer than this many times are ignored by Word2Vec.

    Returns:
        ``np.ndarray`` with one row per document; documents with no
        in-vocabulary token get an all-zero row.
    """
    # Word2Vec needs the corpus as a list of token lists.
    tokenised_docs = [document.split() for document in arr]

    model = Word2Vec(tokenised_docs, vector_size=vector_size, window=window, min_count=min_count)
    word_vectors = model.wv

    avg_list = []
    for tokens in tqdm(tokenised_docs):
        vec = np.zeros(vector_size)
        hits = 0
        for token in tokens:
            # Tokens below min_count are absent from the trained vocabulary.
            if token in word_vectors:
                vec += word_vectors[token]
                hits += 1
        if hits != 0:
            vec = vec / hits
        avg_list.append(vec)

    print("Total no of vectors:", len(avg_list))     #length of total no of vector
    print("Dimension of vector:", vector_size)       #length of avg vector

    return np.array(avg_list)


# for word2vec, the corpus must be a list of token lists (one list of words per document)  <--------------------------- IMP
# the more data, the better the performance
# http://kavita-ganesan.com/gensim-word2vec-tutorial-starter-code/#.XR0cft9fiXJ
# vector_size is the number of dimensions of each word vector that you want to build
# window is the maximum distance between the current word and a predicted context word within a sentence
# min_count is the minimum number of times a word must appear in the corpus to be kept in the vocabulary
# workers is the number of worker threads (CPU cores) that you want to use for training

In [None]:
# w = "hello my name is Abhishek"
# print(w.split())

In [None]:
def normalisation(data):
    """Apply a freshly created ``Normalizer`` (default settings) to ``data``.

    Args:
        data: feature matrix accepted by ``Normalizer.fit_transform``.

    Returns:
        The transformed matrix.
    """
    row_normalizer = Normalizer()
    normalised = row_normalizer.fit_transform(data)
    return normalised

def standardisation(data):
    """Apply a freshly created ``StandardScaler`` (default settings) to ``data``.

    A new scaler is fitted on every call, so statistics are learned from
    whatever ``data`` is passed in.
    NOTE(review): if this is called separately on train and test sets, each
    gets its own scaling — confirm that is intended.

    Args:
        data: feature matrix accepted by ``StandardScaler.fit_transform``.

    Returns:
        The transformed matrix.
    """
    feature_scaler = StandardScaler()
    standardised = feature_scaler.fit_transform(data)
    return standardised

Building our Logistic Regression model

In [None]:
model_lr = LogisticRegression(max_iter=1000)

Training on BOW model

In [None]:
bow = bag_of_words(train_data['Text'])
train_data_bow = bow.transform(train_data['Text'])

In [None]:
print("Classification distribution before SMOTE", Counter(train_data['Score']))

In [None]:
# text data needs to be vectorized before USING smote
#SMOTENC takes categorical data

smote = SMOTE(sampling_strategy='auto',random_state=42)
X_train_resampled , y_train_resampled = smote.fit_resample(train_data_bow,train_data['Score'])

In [None]:
print('Classification distribtuion after SME', Counter(y_train_resampled))

In [None]:
model_lr.fit(X_train_resampled,y_train_resampled)

In [None]:
test_data_bow = bow.transform(test_data['Text'])

In [None]:
# Predict on the BOW-encoded test split and report accuracy as a percentage.
y_pred = model_lr.predict(test_data_bow)
# NOTE(review): the arguments are passed as (predictions, labels); sklearn documents the
# order as (y_true, y_pred). Accuracy is symmetric, so the value is unaffected, but the
# order would matter if this were swapped for precision/recall/F1.
# NOTE(review): the notes above recommend F1 for this imbalanced data; only accuracy is
# reported here.
score = accuracy_score(y_pred,test_data['Score']) *100

print("Accuracy acheived by Logistic Regression using bow =",score,"%")


In [None]:
# Repeat the BOW experiment on normalised features.
# Normalisation is applied to the SMOTE-resampled training matrix and, separately, to the
# test matrix (each call to normalisation() builds its own Normalizer).
X_train_resampled_n = normalisation(X_train_resampled)
test_data_bow_n = normalisation(test_data_bow)
# The same model_lr object is fitted again here, so from this point on it holds the
# model trained on the normalised features.
model_lr.fit(X_train_resampled_n,y_train_resampled)
y_pred = model_lr.predict(test_data_bow_n)
# Accuracy in percent; y_pred and score from the previous cell are overwritten.
score = accuracy_score(y_pred,test_data['Score']) *100
print("Accuracy acheived by Logistic Regression using bow after normalisation =",score,"%")


Training on TF-IDF model

In [None]:
# Fit the TF-IDF vectorizer on the training texts only, then encode them.
tfidf=tf_idf(train_data['Text'])
train_data_tfidf = tfidf.transform(train_data['Text'])

# Oversample the encoded training set. `smote` is the SMOTE instance created in the BOW
# section, so that cell must have been run first. X_train_resampled / y_train_resampled
# from the BOW experiment are overwritten here, and model_lr is fitted again.
X_train_resampled, y_train_resampled = smote.fit_resample(train_data_tfidf, train_data['Score'])
model_lr.fit(X_train_resampled, y_train_resampled)

# Encode the test texts with the vectorizer fitted on the training data and predict.
test_data_tfidf = tfidf.transform(test_data['Text'])
y_pred = model_lr.predict(test_data_tfidf)

# Accuracy in percent (arguments are passed as predictions first, labels second).
score = accuracy_score(y_pred, test_data['Score']) *100
print("Accuracy acheived by Logistic Regression using TF-IDF =",score,"%")


Training on w2v model