In [1]:
import pandas as pd
import numpy as np
import nltk
# Cleaning the texts
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from imblearn.over_sampling import RandomOverSampler
from joblib import dump, load
import pickle

In [2]:
refurbished = pd.read_excel('is_refurbished.xlsx')

In [3]:
refurbished.head()

Unnamed: 0,Region,Keywords,Predicted
0,www.amazon.co.uk,refurbished,Y
1,www.amazon.co.uk,open box,Y
2,www.amazon.co.uk,nearly new,Y
3,www.amazon.co.uk,reconditioned,Y
4,www.amazon.co.uk,remanufactured,Y


In [4]:
refurbished.columns

Index(['Region', 'Keywords', 'Predicted'], dtype='object')

In [5]:
refurbished.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 943 entries, 0 to 942
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Region     943 non-null    object
 1   Keywords   943 non-null    object
 2   Predicted  943 non-null    object
dtypes: object(3)
memory usage: 22.2+ KB


In [6]:
refurbished.value_counts()

Region            Keywords                      Predicted
www.amazon.it     SLOGGI BASIC+                 N            20
                  032C X SLOGGI                 N            12
www.amazon.co.uk  SLOGGI BASIC+                 N             9
www.amazon.it     SLOGGI ZERO FEEL              N             9
www.amazon.co.uk  SLOGGI BODY ADAPT             N             8
                                                             ..
www.amazon.fr     Benchmark Single Bang Blue    N             1
                  Benchmark Single Arctic Aqua  N             1
                  Benchmark Single Active Aqua  N             1
                  Bench Casual Vision Violet    N             1
www.amazon.co.uk  Ex Demo                       Y             1
Length: 498, dtype: int64

In [7]:
refurbished['Predicted'].value_counts()

N    479
Y    464
Name: Predicted, dtype: int64

In [8]:
refurbished['Predicted'].unique()

array(['Y', 'N'], dtype=object)

In [9]:
refurbished.reset_index(inplace = True) 

In [10]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
lem = WordNetLemmatizer()
corpus_stem = []
for i in range(0, len(refurbished)):
    review = re.sub('[^a-zA-Z]', ' ', refurbished['Keywords'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus_stem.append(review)

In [11]:
corpus_stem

['refurbish',
 'open box',
 'nearli new',
 'recondit',
 'remanufactur',
 'renew',
 'second hand',
 'pre own',
 'like new',
 'b stock',
 'b ware',
 'ex demo',
 'fast neu',
 'certifi refurbish',
 'sell',
 'open box',
 'condit',
 'manufactur',
 'second hand',
 'like new',
 'ex demo',
 'refurbish',
 'open box',
 'nearli new',
 'recondit',
 'remanufactur',
 'renew',
 'second hand',
 'pre own',
 'like new',
 'b stock',
 'b ware',
 'ex demo',
 'fast neu',
 'certifi refurbish',
 'sell',
 'open box',
 'condit',
 'manufactur',
 'second hand',
 'like new',
 'ex demo',
 'b ware',
 'refurbish',
 'remanufactur',
 'gener berholt',
 'neuwertig',
 'recondit',
 'like new',
 'berholt',
 'erneuert',
 'pre own',
 'tr bon tat',
 'nearli new',
 'fast neu',
 'pre own',
 'etat correct',
 'preown',
 'comm neuf',
 'b stock',
 'ex demo',
 'rigenerato',
 'renewd',
 'runderneuert',
 'like new',
 'likenew',
 'manufactur',
 'manufactur',
 'wiederaufgearbeitet',
 'vorf hrware',
 'wie neu',
 'packung ge ffnet',
 'gebra

In [12]:
corpus_stem[0]

'refurbish'

In [13]:
type(corpus_stem)

list

In [14]:
refurbished['Predicted'].value_counts()

N    479
Y    464
Name: Predicted, dtype: int64

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(corpus_stem)

Y=refurbished['Predicted']


In [16]:
pickle.dump(cv,open('transform_new.pkl','wb'))

In [17]:
from sklearn.feature_extraction.text import TfidfTransformer

In [18]:
tfidf=TfidfTransformer()

X=tfidf.fit_transform(X)

In [19]:
print(X.shape,Y.shape)

(943, 436) (943,)


In [20]:
type(X)

scipy.sparse.csr.csr_matrix

In [21]:
pickle.dump(tfidf,open('transform.pkl','wb'))

In [22]:
ros = RandomOverSampler(random_state=0)
X_res,Y_res=ros.fit_resample(X, Y)

#pickle.dump(ros,open('randsamp.pkl','wb'))

In [23]:
Y_res.value_counts(normalize=True) * 100

N    50.0
Y    50.0
Name: Predicted, dtype: float64

In [24]:
Y_res=Y_res.replace(to_replace = ['N','Y'], value = [0,1])

In [25]:
Y_res.value_counts(normalize=True) * 100

0    50.0
1    50.0
Name: Predicted, dtype: float64

In [26]:
from sklearn.model_selection import train_test_split


x_train, x_test, y_train, y_test = train_test_split(X_res,Y_res, test_size = 0.25, random_state = 43)


In [27]:

from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(x_train,y_train)

MultinomialNB()

In [28]:
y_pred=clf.predict(x_test)

In [29]:
pickle.dump(clf,open('model.pkl','wb'))

In [30]:
y_pred

array([1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0],
      dtype=int64)

In [31]:
expected = y_test

from sklearn import metrics
print(metrics.classification_report(expected, y_pred))
print(metrics.confusion_matrix(expected, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.96      0.98       123
           1       0.96      1.00      0.98       117

    accuracy                           0.98       240
   macro avg       0.98      0.98      0.98       240
weighted avg       0.98      0.98      0.98       240

[[118   5]
 [  0 117]]
