In [6]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amazon-fine-food-reviews/hashes.txt
/kaggle/input/amazon-fine-food-reviews/Reviews.csv
/kaggle/input/amazon-fine-food-reviews/database.sqlite


### Importing Libraries

In [7]:
import warnings
warnings.filterwarnings('ignore')

# for data visualisation and manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# for nlp tasks
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
stop_words=set(nltk.corpus.stopwords.words('english'))
# for part-of-speech tagging
from nltk import pos_tag

# for named entity recognition
from nltk import ne_chunk

# vectorizers for creating the document-term-matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# BeautifulSoup Library
from bs4 import BeautifulSoup

# regex
import re

# model_selection
from sklearn.model_selection import train_test_split, cross_validate, KFold, GridSearchCV

# model_evaluation
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

# for data preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder

# classification algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB

# tensorflow.keras
from tensorflow import keras
from keras.preprocessing.text import one_hot, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Dense, Flatten, Embedding, Input, CuDNNLSTM, LSTM
from keras.preprocessing.text import text_to_word_sequence

import gensim

In [8]:
# Load the raw reviews table (Reviews.csv) from the read-only Kaggle input directory.
org_df = pd.read_csv('/kaggle/input/amazon-fine-food-reviews/Reviews.csv')

In [9]:
org_df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


### Data Cleaning and Pre-Processing

In [10]:
# Keep only the review text and its score, as an independent copy so the
# raw frame loaded above is never modified by later cells.
df = org_df[['Text', 'Score']].copy()

In [11]:
df.head()

Unnamed: 0,Text,Score
0,I have bought several of the Vitality canned d...,5
1,Product arrived labeled as Jumbo Salted Peanut...,1
2,This is a confection that has been around a fe...,4
3,If you are looking for the secret ingredient i...,2
4,Great taffy at a great price. There was a wid...,5


In [12]:
df.shape

(568454, 2)

In [13]:
# Rename 'Text' -> 'review' and 'Score' -> 'rating' in a single non-mutating step.
# `rename` keeps the cell idempotent: re-running it is a no-op instead of a
# KeyError on the already-removed 'Text'/'Score' columns.
df = df.rename(columns={'Text': 'review', 'Score': 'rating'})

In [14]:
df.head()

Unnamed: 0,review,rating
0,I have bought several of the Vitality canned d...,5
1,Product arrived labeled as Jumbo Salted Peanut...,1
2,This is a confection that has been around a fe...,4
3,If you are looking for the secret ingredient i...,2
4,Great taffy at a great price. There was a wid...,5


In [15]:
# Report the number of missing values in each column, one count per line.
for column in ('review', 'rating'):
    print(df[column].isnull().sum())

0
0


In [16]:
df.duplicated().sum()

174779

In [17]:
# Remove rows whose (rating, review) pair repeats, keeping the first occurrence; mutates df in place.
df.drop_duplicates(['rating','review'],keep='first',inplace=True)

In [18]:
print(df.shape)

(393675, 2)


In [19]:
# Print the first two raw reviews, each followed by a blank line.
for text in df['review'].iloc[:2]:
    print(text, end='\n\n')

I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.

Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo".



In [20]:
# Binarise the score into a sentiment label: a rating of 3 or below becomes 0,
# a rating above 3 becomes 1.
def mark_sentiment(rating):
    """Return 0 when ``rating`` is at most 3, otherwise 1."""
    return 0 if rating <= 3 else 1

In [21]:
# Derive the binary sentiment label from each row's rating.
df['sentiment'] = df['rating'].apply(mark_sentiment)

In [22]:
# Drop the rating column in place; only the review text and sentiment label remain.
df.drop(['rating'], axis=1, inplace=True)

In [23]:
df.head()

Unnamed: 0,review,sentiment
0,I have bought several of the Vitality canned d...,1
1,Product arrived labeled as Jumbo Salted Peanut...,0
2,This is a confection that has been around a fe...,1
3,If you are looking for the secret ingredient i...,0
4,Great taffy at a great price. There was a wid...,1


In [24]:
df['sentiment'].value_counts()

1    306819
0     86856
Name: sentiment, dtype: int64

### Clean and Preprocess the text

In [25]:
# to clean and preprocess the text
# Lemmatizer and stop-word set shared by every clean_reviews call (filled lazily).
_CLEANING_RESOURCES = {}


def _cleaning_resources():
    """Return the shared ``(lemmatizer, stop_words)`` pair, building it on first use."""
    if not _CLEANING_RESOURCES:
        _CLEANING_RESOURCES["lemmatizer"] = WordNetLemmatizer()
        _CLEANING_RESOURCES["stop_words"] = frozenset(stopwords.words("english"))
    return _CLEANING_RESOURCES["lemmatizer"], _CLEANING_RESOURCES["stop_words"]


def clean_reviews(review):
    """Normalise one review (or one sentence) into a space-separated string of lemmas.

    Steps, in order: strip HTML markup, replace every non-letter character
    with a space, lower-case, split on whitespace, drop English stop words,
    and lemmatize the remaining tokens.

    Parameters
    ----------
    review : str
        Raw text, possibly containing HTML tags such as ``<br />``.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    # Building these once instead of on every call matters: this function is
    # applied to every sentence and every review in the notebook.
    lemmatizer, stop_words = _cleaning_resources()

    # Name the parser explicitly so the result does not depend on which
    # optional parser happens to be installed in the environment.
    review_text = BeautifulSoup(review, "html.parser").get_text()

    review_text = re.sub("[^a-zA-Z]", " ", review_text)

    word_tokens = review_text.lower().split()

    return " ".join(lemmatizer.lemmatize(w) for w in word_tokens if w not in stop_words)

#### The data is imbalanced. Oversampling, undersampling, or SMOTE could be used to balance it. Here I keep only 100K reviews and balance the classes by taking an equal number of reviews (50K) for each sentiment.

In [26]:
# Balance the classes by keeping the first 50,000 rows of each sentiment
# (rows are taken in file order, not sampled at random).
pos_df = df[df['sentiment'] == 1].head(50000)
neg_df = df[df['sentiment'] == 0].head(50000)

In [27]:
pos_df.head()

Unnamed: 0,review,sentiment
0,I have bought several of the Vitality canned d...,1
2,This is a confection that has been around a fe...,1
4,Great taffy at a great price. There was a wid...,1
5,I got a wild hair for taffy and ordered this f...,1
6,This saltwater taffy had great flavors and was...,1


In [28]:
neg_df.head()

Unnamed: 0,review,sentiment
1,Product arrived labeled as Jumbo Salted Peanut...,0
3,If you are looking for the secret ingredient i...,0
12,My cats have been happily eating Felidae Plati...,0
16,I love eating them and they are good for watch...,0
26,"The candy is just red , No flavor . Just plan...",0


In [29]:
# Stack the positive and negative subsets into one balanced frame (positives first, original index kept).
df = pd.concat([pos_df, neg_df])

In [30]:
df.sample(5)

Unnamed: 0,review,sentiment
66786,Nature's Path Love Crunch Premium Organic Gran...,1
266411,This stuff is a horror show for those concerne...,0
12431,"It tastes perfect. I like it.<br />You would ""...",1
73069,The taste reminds me of when you accidentally ...,0
26921,I'm new to this Emergen-C and I just love them...,1


In [31]:
df.shape

(100000, 2)

In [32]:
# Shuffle the rows so the two classes are interleaved, then renumber the index from 0.
# The fixed seed makes the row order, and every result derived from it, reproducible
# across kernel restarts (the same seed is used for the train/test split later on).
df = df.sample(frac=1, random_state=21).reset_index(drop=True)
df.head()

Unnamed: 0,review,sentiment
0,"This wine doesn't taste like anything, except ...",0
1,I get the best deals on Amazon and this tea is...,1
2,We have really enjoyed the old time taste of t...,1
3,"My super-picky, unbelievably finicky cat who i...",1
4,Well I was excited to order this product becau...,0


In [33]:
# Load NLTK's English Punkt model; it is used below to split reviews into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [34]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [35]:
# Split every review into sentences and clean each sentence into a list of tokens.
# The resulting list of token lists is the corpus handed to Word2Vec.
sentences = []
# Counted independently of `sentences` as a sanity check: both totals must match.
# (Named so that it does not shadow the builtin `sum` for the rest of the notebook.)
n_raw_sentences = 0
for review in df['review']:
    sents = tokenizer.tokenize(review.strip())
    n_raw_sentences += len(sents)
    sentences.extend(clean_reviews(sent).split() for sent in sents)

print(n_raw_sentences)
print(len(sentences))

511928
511928


In [36]:
# Show the first four cleaned sentences as token lists.
for tokens in sentences[:4]:
    print(tokens, '\n')

['wine', 'taste', 'like', 'anything', 'except', 'maybe', 'water'] 

['believe', 'spent'] 

['think', 'possible', 'create', 'decent', 'pinot', 'grigio', 'every', 'one', 'ever', 'tried', 'disappointment'] 

['get', 'best', 'deal', 'amazon', 'tea', 'delicious', 'price', 'great', 'cannot', 'find', 'product', 'store'] 



### Word2Vec Model

In [37]:
# Word2Vec model from gensim, built on the cleaned sentence corpus.
# vector_size=300 must stay in sync with the Keras embedding dimension used later.
# NOTE(review): min_count=1 presumably keeps even single-occurrence words, which the
# embedding-matrix lookup further down appears to rely on — confirm against the gensim docs.
w2v_model = gensim.models.Word2Vec(sentences=sentences, vector_size=300,window=10, min_count=1)

In [38]:
# Run 10 training epochs over the same corpus.
# NOTE(review): the constructor was already given `sentences`, which in gensim presumably
# triggers an initial training pass — confirm that this additional training is intended.
w2v_model.train(sentences, epochs=10, total_examples=len(sentences))

(38407391, 41193730)

In [39]:
# vector of a particular word
w2v_model.wv.get_vector('amazing')

array([ 0.38218686,  0.47354338, -0.16858932,  0.33351246, -1.1543357 ,
        0.1228959 ,  0.80239844,  0.43674588, -0.69335645, -1.1564218 ,
        0.62056434,  0.29812953, -1.3120209 , -0.25840887, -1.5153406 ,
        0.3239301 ,  0.95363253,  0.21843493, -0.97583115,  0.02715134,
        0.49432042, -0.22190171, -0.4117768 , -0.27011132,  0.28166473,
        0.46466547,  0.6948115 , -0.29815346,  0.39281246, -1.4002353 ,
        0.98036027, -0.5365702 ,  0.4921902 ,  0.70148903,  0.41003242,
       -0.87501884,  0.6063093 , -0.8798579 , -1.0687596 ,  0.48598105,
        0.02553193, -0.53637075, -0.45075858, -0.14837632,  0.58999693,
        0.23911832,  1.9493111 , -0.07826629,  0.13944367, -0.9267551 ,
        0.14117686,  0.0776301 ,  0.8138881 ,  0.4307642 , -0.12756644,
        1.0568159 , -0.33732596,  0.6809215 , -0.44757965,  1.2338984 ,
        0.5375441 , -0.02519338,  0.7793989 ,  0.06448328, -0.9446712 ,
       -0.6540778 , -0.91699743, -0.8642268 , -0.35724518, -0.52

In [40]:
# Despite its name, `vocab` holds a single number: the 'count' attribute stored for 'amazing'.
vocab = w2v_model.wv.get_vecattr('amazing', 'count')
vocab

1733

In [41]:
w2v_model.wv.most_similar('amazing')

[('wonderful', 0.5592583417892456),
 ('awesome', 0.5366454720497131),
 ('fantastic', 0.5348058342933655),
 ('incredible', 0.5273038148880005),
 ('fabulous', 0.5091261267662048),
 ('terrific', 0.5010251402854919),
 ('great', 0.4999437928199768),
 ('divine', 0.4644218683242798),
 ('delicious', 0.4577743411064148),
 ('unbeatable', 0.436774879693985)]

In [42]:
# Total number of distinct words in the trained Word2Vec vocabulary
print("Total number of words are : ", len(w2v_model.wv))

Total number of words are :  56379


In [43]:
# Similarity between the vectors of two words
w2v_model.wv.similarity('great', 'amazing')

0.49994376

### Preparing data to fit into keras embedding layer

In [44]:
# Clean each full review (not sentence by sentence) into one string for the Keras tokenizer.
df['clean_review'] = df['review'].apply(clean_reviews)

In [45]:
# Token count of the longest cleaned review (-1 when there are no reviews at all).
mx = max((len(rv.split()) for rv in df['clean_review']), default=-1)
print(mx)

1564


In [46]:
# Fit a Keras tokenizer on the cleaned reviews and encode every review as a list of word ids.
tok = Tokenizer()
tok.fit_on_texts(df['clean_review'])
vocab_size = len(tok.word_index) + 1  # number of unique words, plus one slot for id 0 (the padding value used below)
encd_rv = tok.texts_to_sequences(df['clean_review']) # each review as a list of integer word ids (not padded yet)

In [47]:
mx_rv_len = mx # maximum review length in tokens; every sequence is padded to this length
embed_dim = 300 # embedding dimension; must equal the vector_size chosen for Word2Vec

### Padding: reviews differ in length, but the model needs every input to have the same length, so shorter reviews are padded with zeros.
#### Example
        if word = [1, 2, 3] and max_len = 5
        then after padding word = [1, 2, 3, 0, 0]

In [48]:
# Pad every encoded review to mx_rv_len so all rows have the same length.
pad_rev = pad_sequences(encd_rv, maxlen=mx_rv_len, padding='post') # 'post' appends the zeros at the end of each sequence
pad_rev.shape

(100000, 1564)

In [49]:
# NOTE(review): neither `word_vec_dict` nor `ind` is referenced by any later cell
# visible here — these look like leftovers from experimentation; confirm and remove.
word_vec_dict = {}
ind = w2v_model.wv.key_to_index['great']


In [50]:
# Build the (vocab_size, embed_dim) weight matrix for the Keras Embedding layer:
# row i holds the Word2Vec vector of the word whose tokenizer id is i.
# Row 0 (no word has id 0; it is the padding value) and the row of any word that
# is missing from the Word2Vec vocabulary stay all-zero.
embed_matrix = np.zeros(shape=(vocab_size, embed_dim))
for word, i in tok.word_index.items():
    # `wv[word]` raises KeyError for an unknown word rather than returning None,
    # so membership has to be tested before the lookup.
    if word in w2v_model.wv.key_to_index:
        embed_matrix[i] = w2v_model.wv[word]

### Preparing Train and Validation sets

In [51]:
# One-hot encode the 0/1 sentiment labels (two columns) and hold out 20% of the
# padded reviews for evaluation; random_state fixes the split.
y = keras.utils.to_categorical(df['sentiment'])
X_train, X_test, y_train, y_test = train_test_split(pad_rev, y, test_size=0.2, random_state=21)

In [52]:
X_train.shape

(80000, 1564)

In [53]:
X_test.shape

(20000, 1564)

### Building model and performing Text Classification

In [54]:
from keras.initializers import Constant
from keras.layers import ReLU
from keras.layers import Dropout

# Baseline classifier: Word2Vec-initialised embedding -> flatten -> small dense head.
model = Sequential()
# The embedding weights start from the Word2Vec matrix and are fine-tuned during training.
model.add(Embedding(input_dim=vocab_size, output_dim=embed_dim, input_length=mx_rv_len,
                    embeddings_initializer=Constant(embed_matrix)))
model.add(Flatten())
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.5))
# The labels are one-hot over two mutually exclusive classes, so the output should be
# a probability distribution over them: softmax rather than two independent sigmoids.
model.add(Dense(2, activation='softmax'))

2022-08-30 10:45:52.120743: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-30 10:45:52.234948: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-30 10:45:52.235704: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-30 10:45:52.237181: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [55]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1564, 300)         16914000  
_________________________________________________________________
flatten (Flatten)            (None, 469200)            0         
_________________________________________________________________
dense (Dense)                (None, 16)                7507216   
_________________________________________________________________
dropout (Dropout)            (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 34        
Total params: 24,421,250
Trainable params: 24,421,250
Non-trainable params: 0
_________________________________________________________________


In [56]:
# `learning_rate` is the supported keyword for the optimizer; the `lr` alias is
# deprecated and rejected by newer Keras releases.
model.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(learning_rate=1e-3), metrics=['accuracy'])

In [57]:
# Training hyper-parameters passed to model.fit below.
epochs = 5
batch_size = 64

In [58]:
# Train the model; the held-out 20% split (X_test, y_test) is used as validation data after each epoch.
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test))

2022-08-30 10:45:55.637929: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7ff63ef4d050>