## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import nltk
import re
import string

In [2]:
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')



[nltk_data] Downloading package punkt to /Users/nnerella/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nnerella/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nnerella/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [4]:
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.metrics import precision_score, recall_score, f1_score

## Import Dataset

In [5]:
reviews_csv = pd.read_csv('../../data/raw/reviews.csv')

In [6]:
reviews_csv.head()

Unnamed: 0,Sentiment,Time,Text
0,positive,18/6/21,This is a very healthy dog food. Good for thei...
1,positive,7/7/21,I've been very pleased with the Natural Balanc...
2,positive,18/6/21,"Before I was educated about feline nutrition, ..."
3,positive,7/7/21,"My holistic vet recommended this, along with a..."
4,positive,1/7/21,I bought this coffee because its much cheaper ...


In [7]:
reviews_csv.isnull().sum()

Sentiment    0
Time         0
Text         0
dtype: int64

In [8]:
reviews_csv.shape

(5444, 3)

In [9]:
reviews_csv['Sentiment'].value_counts()

positive    4030
negative    1414
Name: Sentiment, dtype: int64

In [10]:
reviews_csv['length'] = reviews_csv['Text'].apply(lambda x: len(x))
reviews_csv.head()

Unnamed: 0,Sentiment,Time,Text,length
0,positive,18/6/21,This is a very healthy dog food. Good for thei...,137
1,positive,7/7/21,I've been very pleased with the Natural Balanc...,350
2,positive,18/6/21,"Before I was educated about feline nutrition, ...",733
3,positive,7/7/21,"My holistic vet recommended this, along with a...",493
4,positive,1/7/21,I bought this coffee because its much cheaper ...,413


## Pre-processing

In [11]:
# Function to convert text to lowercase
def convert_to_lower(text):
    return text.lower()

# Removes html notation, such as <br/>
def remove_html(text):
    regex = r"<[^>]+>"
    text_new = re.sub(regex, " ", text)
    return text_new

# Function to remove numbers and other numeric values
def remove_numbers(text):
    number_pattern = r'\d+'
    without_number = re.sub(pattern=number_pattern, repl=" ", string=text)
    return without_number

# Function to remove punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

# Function to remove stopwords
def remove_stopwords(text):
    removed = []
    stop_words = list(stopwords.words("english"))
    tokens = word_tokenize(text)
    for i in range(len(tokens)):
        if tokens[i] not in stop_words:
            removed.append(tokens[i])
    return " ".join(removed)

# Function to remove extra white spaces
def remove_extra_white_spaces(text):
    single_char_pattern = r'\s+[a-zA-Z]\s+'
    without_sc = re.sub(pattern=single_char_pattern, repl=" ", string=text)
    return without_sc

# Function to Lemmatize
def lemmatizing(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    for i in range(len(tokens)):
        lemma_word = lemmatizer.lemmatize(tokens[i])
        tokens[i] = lemma_word
    return " ".join(tokens)


In [12]:
# pp_reviews is the new processed reviews 
pp_reviews = pd.DataFrame()
pp_reviews['Text'] = reviews_csv['Text'].apply(lambda x: convert_to_lower(x))
pp_reviews['Text'] = pp_reviews['Text'].apply(lambda x: remove_html(x))
pp_reviews['Text'] = pp_reviews['Text'].apply(lambda x: remove_numbers(x))
pp_reviews['Text'] = pp_reviews['Text'].apply(lambda x: remove_punctuation(x))
pp_reviews['Text'] = pp_reviews['Text'].apply(lambda x: remove_stopwords(x))
pp_reviews['Text'] = pp_reviews['Text'].apply(lambda x: remove_extra_white_spaces(x))
pp_reviews['Text'] = pp_reviews['Text'].apply(lambda x: lemmatizing(x))

In [13]:
pp_reviews['after_length'] = pp_reviews['Text'].apply(lambda x: len(x))
pp_reviews['before_length'] = reviews_csv['length']
pp_reviews

Unnamed: 0,Text,after_length,before_length
0,healthy dog food good digestion also good smal...,92,137
1,ive pleased natural balance dog food dog issue...,223,350
2,educated feline nutrition allowed cat become a...,496,733
3,holistic vet recommended along brand tried cat...,276,493
4,bought coffee much cheaper ganocafe organic re...,223,413
...,...,...,...
5439,okay gift box like mediocre cheese summer saus...,154,275
5440,look llike walked raw deal item intolerably st...,975,1545
5441,thank god tasted metal swallowed dont even get...,72,139
5442,product good began buying lately terrible tast...,87,163


In [14]:
# Convert positive label -> 1 and negative label -> 0
label_map = {
    'positive': 1,
    'negative': 0,
}
pp_reviews['sentiment_label'] = reviews_csv['Sentiment'].map(label_map)
pp_reviews.head()

Unnamed: 0,Text,after_length,before_length,sentiment_label
0,healthy dog food good digestion also good smal...,92,137,1
1,ive pleased natural balance dog food dog issue...,223,350,1
2,educated feline nutrition allowed cat become a...,496,733,1
3,holistic vet recommended along brand tried cat...,276,493,1
4,bought coffee much cheaper ganocafe organic re...,223,413,1


## Text Vectorization using TF-IDF

In [15]:
# Use TfidfVectorizer to create a sparse matrix - numerical representation of the text strings 
tf_idf = TfidfVectorizer()
tf_idf_matrix = tf_idf.fit_transform(pp_reviews['Text'])

In [16]:
# Convert matrix to array
tf_idf_array = tf_idf_matrix.toarray()
print(tf_idf_array)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [17]:
tf_idf_array.shape

(5444, 12603)

## Train-Test Split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(tf_idf_array, pp_reviews['sentiment_label'].values, test_size=0.3)

## Model 1: Naive Baye's Classifier

In [19]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

NB = GaussianNB()
NB.fit(X_train, y_train)
NB_pred= NB.predict(X_test)
#print(NB_pred)

print(accuracy_score(y_test, NB_pred))

0.5624235006119951


In [20]:
f1_score(y_test,  NB_pred)

0.6483030004918838

In [21]:
pp_reviews['Text'][279]

'recently fell love indian food papads quickly toasted hot oven healthy alternative potatoe chip excellent alone dipped chip dip thin bread cooked brittle crisp like potatoe chip however baked fried made chickpea flour little healthy potatoe chip'

In [22]:
def remove_html2(text):
    regex = r"<[^>]+>"
    text_new = re.sub(regex, "", text)
    return text_new

remove_html2(pp_reviews['Text'][279])

'recently fell love indian food papads quickly toasted hot oven healthy alternative potatoe chip excellent alone dipped chip dip thin bread cooked brittle crisp like potatoe chip however baked fried made chickpea flour little healthy potatoe chip'

In [23]:
pp_reviews['Text'][327]

'length min expiration date month day bought product tuna tomato combination delicious one many item reorder amazon every month edit added short video see combination look like wearing one headlamp didnt good job focusing tuna film doesnt look like many people view product anyway'

In [24]:
text = "This is some <example> text <with> angular <brackets>."
remove_html2(text)

'This is some  text  angular .'

In [25]:
reviews_csv['Text'][327]

'<span class="tiny"> Length:: 0:26 Mins<br /><br /></span>The expiration date is 21 months from the day I bought this product. The tuna, tomato combination is delicious. This is one of the many items I re-order on Amazon every month or so.<br />edit: added a short video, so you see what this combination looks like.<br /><br />I was wearing one of those headlamps, but I didn\'t do a good job focusing on the tuna.<br />I should film it over, but it doesn\'t look like many people view this product anyway.'

In [26]:
#X_train, X_test, y_train, y_test = train_test_split(pp_reviews['text'], df['label'].values, test_size=0.30)
#vectorizer = TfidfVectorizer()
#vectorizer.fit(X_train)
#X_train_tf = vectorizer.transform(X_train)
#X_train_tf = X_train_tf.toarray()
ROS = RandomOverSampler(sampling_strategy=1)
X_train_ros, y_train_ros = ROS.fit_resample(X_train, y_train)
nb = GaussianNB()
nb.fit(X_train_ros, y_train_ros)
y_preds = nb.predict(X_test)
print(accuracy_score(y_test, y_preds))

0.565483476132191


In [27]:
f1_score(y_test, y_preds)

0.6516192345436702

In [30]:
from collections import Counter
Counter(y_train_ros)

Counter({1: 2818, 0: 2818})

In [31]:
Counter(y_train)

Counter({1: 2818, 0: 992})