In [1]:
import numpy as np
import pandas as pd
import re
import nltk
import pickle
from nltk.corpus import stopwords
import tensorflow as tf
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
df = pd.read_csv('./sets\dataset\dataset.csv',header=None)

In [4]:
df.columns=['rating','title','reviews']

In [5]:
df['reviews'] = df['reviews']+df['title']

In [6]:
df = df.drop('title',axis='columns')

In [9]:
df.to_csv('amazon.csv')

In [8]:
df = df.fillna(" ")

In [78]:
lemmatizer = WordNetLemmatizer()
def lemmatize(content):
    content = str(content)
    content = re.sub('[^A-Za-z]',' ',content)
    content = re.sub('[\"\'\|\?\=\.\@\#\*\,]',' ',content)
    lemmed_content = content.lower()
    lemmed_content = lemmed_content.split()
    lemmed_content = [lemmatizer.lemmatize(word) for word in lemmed_content if not word in stopwords.words('english')]
    lemmed_content = '  '.join(lemmed_content)
    return lemmed_content

In [13]:
df

Unnamed: 0,rating,reviews,reviews_new
0,2,My lovely Pat has one of the GREAT voices of h...,lovely pat one great voice generation li...
1,2,Despite the fact that I have only played a sma...,despite fact played small portion game m...
2,1,I bought this charger in Jul 2003 and it worke...,bought charger jul worked ok design nice...
3,2,Check out Maha Energy's website. Their Powerex...,check maha energy website powerex mh c ...
4,2,Reviewed quite a bit of the combo players and ...,reviewed quite bit combo player hesitant ...
...,...,...,...
399995,1,We bought this Thomas for our son who is a hug...,bought thomas son huge thomas fan huge ...
399996,1,My son recieved this as a birthday gift 2 mont...,son recieved birthday gift month ago lov...
399997,1,"I bought this toy for my son who loves the ""Th...",bought toy son love thomas toy need one...
399998,2,This is a compilation of a wide range of Mitfo...,compilation wide range mitford article be...


In [11]:
df['reviews_new'] = df['reviews'].apply(lemmatize)
df.to_csv('lemma_amazon.csv')

In [100]:
df = pd.read_csv('lemma_amazon.csv')
df = df.drop([df.columns[0],'reviews'],axis=1)

In [101]:
df = df.dropna()

In [102]:
negative = df[df['rating']==1]
positive = df[df['rating']==2]

In [103]:
negative_train = negative.sample(frac=0.50,random_state=12)
negative_test = negative.drop(negative_train.index)
positive_train = positive.sample(frac=0.50,random_state=12)
positive_test = negative.drop(negative_train.index)

In [104]:
train_set = pd.concat([positive_train,negative_train])
test_set = pd.concat([positive_test,negative_test])

In [105]:
def reducer(content):
    content = content.split()
    content = set(content)
    content = list(content)
    content = '  '.join(content)
    return content

In [106]:
train_set['reviews_new'] = train_set['reviews_new'].apply(reducer)
test_set['reviews_new'] = test_set['reviews_new'].apply(reducer)
train_set.to_csv('train_amazon.csv',index=False)
test_set.to_csv('test_amazon.csv',index=False)

In [57]:
train_data = pd.read_csv('train_amazon.csv')

In [61]:
# separating the data and label
X = train_data['reviews_new'].values
Y = train_data['rating'].values

In [62]:
vectorizer = TfidfVectorizer()
vectorizer.fit(X)
X = vectorizer.transform(X)

In [110]:
X.shape[0]

199994

In [5]:
model = GaussianNB()

In [None]:
step = 5000
for i in range(0,X.shape[0],step):
   train_input_X = X[i:i+step].toarray()
   train_input_Y = Y[i:i+step]
   model.partial_fit(train_input_X,train_input_Y,classes=[1,2])

In [9]:
amazonmodel = open('amazonmodel_original.pkl','wb')
pickle.dump(model,amazonmodel)

In [70]:
amazonmodel = open('amazonmodel_original.pkl','rb')
model = pickle.load(amazonmodel)

In [71]:
model

In [51]:
test_data = pd.read_csv('test_amazon.csv')
test_data

Unnamed: 0,rating,reviews_new
0,1,giving disc p player recorder stopped go...
1,1,topic done format book enjoyed firstly e...
2,1,access want listen duke gimmick orchestra...
3,1,open package smell dont worn able havent...
4,1,install brain everything beautifully purch...
...,...,...
199989,1,stuff going dont cheap knew never person...
199990,1,maybe book lucky casino others began tho...
199991,1,upset package son else retire better cou...
199992,1,meant kitchen officially train fall birth...


In [52]:
X_test = test_data['reviews_new'].values
Y_test = test_data['rating'].values

In [13]:
X_test = vectorizer.transform(X_test)

In [14]:
step = 5000
score = []
for i in range(0,X_test.shape[0],step):
   test_input_X = X_test[i:i+step].toarray()
   test_input_Y = Y_test[i:i+step]
   score.append(model.predict(test_input_X))

In [38]:
predicted = []
for i in score:
    predicted = predicted+list(i)

In [41]:
predicted = np.asarray(predicted)

In [44]:
predicted_data_file = open('score_amazon.pkl','wb')
pickle.dump(predicted,predicted_data_file)

In [46]:
predicted_data_file = open('score_amazon.pkl','rb')
predicted_data = pickle.load(predicted_data_file)

In [54]:
test_data_accuracy = accuracy_score(predicted_data,Y_test)

In [55]:
test_data_accuracy

0.8724761742852286

In [79]:
text = input("Enter your text to be predicted ")
text = lemmatize(text)
X_to_be_predicted = vectorizer.transform([text])
prediction = model.predict(X_to_be_predicted.toarray())
if prediction[0] == 1:
    print('It is a negative review')
elif prediction[0]==2:
    print('It is a positive review')

Enter your text to be predicted This is a great product. Rreally liked it
It is a positive review
