In [87]:
#1. Tokenize
#2. Remove the stopwords
#3. Lemmatization/Stemming
#4. Document-Term Matrix -- TfIdf
#5. Model

In [88]:
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize,RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer,PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB
from sklearn.metrics import confusion_matrix,precision_score,recall_score,f1_score,roc_auc_score,accuracy_score,roc_curve
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

#word_tokenize('An Apple a day keeps the doctor away.So, will I get sick soon?')
#tokenizer.tokenize('An Apple a day keeps the doctor away.So, will I get sick soon?')

In [89]:
reviews_df=pd.read_csv('Amazon_Reviews.csv')

reviews_df

Unnamed: 0,Review,Label
0,Stuning even for the non-gamer: This sound tr...,1
1,The best soundtrack ever to anything.: I'm re...,1
2,Amazing!: This soundtrack is my favorite musi...,1
3,Excellent Soundtrack: I truly like this sound...,1
4,"Remember, Pull Your Jaw Off The Floor After H...",1
...,...,...
194,A Book That Is Worth a Second Look: This book...,1
195,Best game ever: This games makes even amazing...,1
196,Guitar in Absentia: With all due respect to a...,0
197,Stiff and Smells like drying paint: You get w...,0


In [90]:
y=reviews_df['Label']

reviews_df.drop(columns='Label',inplace=True)

In [91]:
X_train,X_test,y_train,y_test=train_test_split(reviews_df,y,test_size=0.2,random_state=42)

In [92]:
tokenizer=RegexpTokenizer(r'\w+')
en_sw=stopwords.words('english')

In [93]:
stemmer=PorterStemmer()
lemmatizer=WordNetLemmatizer()
tfidf=TfidfVectorizer()

#stemmer.stem('cacti')
#lemmatizer.lemmatize('playing',pos='v')

In [94]:
def text_preprocessing(text):
    tokens=tokenizer.tokenize(text)
    pure_tokens=[token.lower() for token in tokens if token.lower() not in en_sw]
    lemmatized_tokens=[lemmatizer.lemmatize(token,pos='v') for token in pure_tokens]
    
    return lemmatized_tokens


In [95]:
X_train['Review']=X_train['Review'].apply(text_preprocessing)
X_test['Review']=X_test['Review'].apply(text_preprocessing)


X_train['Review']

183    [handful, track, hear, far, complete, though, ...
38     [work, mac, clearly, say, line, work, mac, os,...
24     [like, album, think, would, hear, song, two, t...
142    [pattern, detail, sketch, although, excite, pu...
141    [contemporary, fairytale, sure, delight, book,...
                             ...                        
106    [authentic, first, encounter, yoruba, say, cds...
14     [awful, beyond, belief, feel, write, keep, oth...
92     [omg, soulwax, own, wow, like, amaze, album, e...
179    [yet, another, unsubstantiated, case, believe,...
102    [yes, get, book, expect, much, man, wrong, lov...
Name: Review, Length: 159, dtype: object

In [96]:
train_tfidf=tfidf.fit_transform(X_train['Review'])
test_tfidf=tfidf.transform(X_test['Review'])

AttributeError: 'list' object has no attribute 'lower'

In [None]:
mnb=MultinomialNB()
mnb.fit(train_tfidf,y_train)



In [None]:
mnb_pred=mnb.predict(test_tfidf)
pos_probabs=mnb.predict_proba(test_tfidf)[::,1]

In [None]:
confusion_matrix(y_test,mnb_pred)
recall_score(y_test,mnb_pred)
precision_score(y_test,mnb_pred)


In [None]:
fpr,tpr,thresholds=roc_curve(y_test,pos_probabs)

plt.plot(fpr,tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

roc_auc_score(y_test,pos_probabs)

accuracy_score(y_test,mnb_pred)

In [None]:
!pip install gensim

# word2vec

In [97]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

In [106]:
model=Word2Vec(window=4,min_count=9,workers=4,vector_size=32)

model.build_vocab(X_train['Review'],progress_per=1000)

model.train(X_train['Review'],total_examples=model.corpus_count,epochs=model.epochs)

(6021, 33260)

In [103]:
X_train['Review']

183    [handful, track, hear, far, complete, though, ...
38     [work, mac, clearly, say, line, work, mac, os,...
24     [like, album, think, would, hear, song, two, t...
142    [pattern, detail, sketch, although, excite, pu...
141    [contemporary, fairytale, sure, delight, book,...
                             ...                        
106    [authentic, first, encounter, yoruba, say, cds...
14     [awful, beyond, belief, feel, write, keep, oth...
92     [omg, soulwax, own, wow, like, amaze, album, e...
179    [yet, another, unsubstantiated, case, believe,...
102    [yes, get, book, expect, much, man, wrong, lov...
Name: Review, Length: 159, dtype: object

In [108]:
model.wv.get_vector('soundtrack')

array([-0.02859885, -0.02536807, -0.01462234, -0.00569853,  0.000606  ,
       -0.018159  ,  0.00353976,  0.00629209, -0.01277917,  0.00869157,
        0.04178849, -0.00488488,  0.0197595 , -0.03911533,  0.03152749,
       -0.00958504,  0.0395254 ,  0.0123668 ,  0.00836818, -0.02245821,
       -0.01476104,  0.05614202,  0.01510634,  0.01341541,  0.02258139,
        0.00502274,  0.01135106,  0.00538107, -0.02550888, -0.03846185,
       -0.01234535, -0.02359255], dtype=float32)

In [109]:
X_train['Review']

183    [handful, track, hear, far, complete, though, ...
38     [work, mac, clearly, say, line, work, mac, os,...
24     [like, album, think, would, hear, song, two, t...
142    [pattern, detail, sketch, although, excite, pu...
141    [contemporary, fairytale, sure, delight, book,...
                             ...                        
106    [authentic, first, encounter, yoruba, say, cds...
14     [awful, beyond, belief, feel, write, keep, oth...
92     [omg, soulwax, own, wow, like, amaze, album, e...
179    [yet, another, unsubstantiated, case, believe,...
102    [yes, get, book, expect, much, man, wrong, lov...
Name: Review, Length: 159, dtype: object