### --------Import the libraries---------

In [20]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import string
import numpy as np
import re
#nltk
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import tokenize

#sci-kit learn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


### --------EDA-----------

In [2]:
nltk.download("movie_reviews")

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [3]:
from nltk.corpus import movie_reviews

In [4]:
len(movie_reviews.fileids())

2000

In [5]:
movie_reviews.fileids()[:5]

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt']

In [6]:
movie_reviews.fileids()[-5:]

['pos/cv995_21821.txt',
 'pos/cv996_11592.txt',
 'pos/cv997_5046.txt',
 'pos/cv998_14111.txt',
 'pos/cv999_13106.txt']

In [7]:
negative_fileids = movie_reviews.fileids('neg')
positive_fileids = movie_reviews.fileids('pos')

In [8]:
negative_fileids

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt',
 'neg/cv005_29357.txt',
 'neg/cv006_17022.txt',
 'neg/cv007_4992.txt',
 'neg/cv008_29326.txt',
 'neg/cv009_29417.txt',
 'neg/cv010_29063.txt',
 'neg/cv011_13044.txt',
 'neg/cv012_29411.txt',
 'neg/cv013_10494.txt',
 'neg/cv014_15600.txt',
 'neg/cv015_29356.txt',
 'neg/cv016_4348.txt',
 'neg/cv017_23487.txt',
 'neg/cv018_21672.txt',
 'neg/cv019_16117.txt',
 'neg/cv020_9234.txt',
 'neg/cv021_17313.txt',
 'neg/cv022_14227.txt',
 'neg/cv023_13847.txt',
 'neg/cv024_7033.txt',
 'neg/cv025_29825.txt',
 'neg/cv026_29229.txt',
 'neg/cv027_26270.txt',
 'neg/cv028_26964.txt',
 'neg/cv029_19943.txt',
 'neg/cv030_22893.txt',
 'neg/cv031_19540.txt',
 'neg/cv032_23718.txt',
 'neg/cv033_25680.txt',
 'neg/cv034_29446.txt',
 'neg/cv035_3343.txt',
 'neg/cv036_18385.txt',
 'neg/cv037_19798.txt',
 'neg/cv038_9781.txt',
 'neg/cv039_5963.txt',
 'neg/cv040_8829.txt',
 'neg/cv041_22364.txt',


In [9]:
len(negative_fileids), len(positive_fileids)

(1000, 1000)

In [10]:
print(movie_reviews.raw(fileids=positive_fileids[20]))

by phil curtolo mel gibson ( braveheart ) gave a gripping performance as the father of a young kidnapped boy in ron howard's ransom . 
gibson plays tom mullen , a wealthy business tycoon whose past actions are coming back to haunt him as a deranged psychopath , played by gary sinise ( forrest gump ) , and his band of low-life thugs kidnap his only son for $2 million . 
tom and his wife , kate , played by rene russo ( tin cup ) were instructed not to inform the police , but they contacted the fbi . 
minutes later , an elite team of agents led by delroy lindo ( broken arrow ) are in tom's house and wiring every phone . 
the plot sounds average , just like most other kidnap movies that you've already seen , and it was nothing more than that . 
that is until about half-way through the movie . 
suddenly , tom goes to the fox 5 news room and makes a live broadcast saying , " this is your ransom . 
but this is as close as you will ever get to it . 
instead , i am offering this money as a rewa

In [11]:
negative_fileids[2]

'neg/cv002_17424.txt'

In [12]:
negative_features = pd.DataFrame(
    {'review':movie_reviews.raw(fileids=[f]),'label': 'neg'} for f in negative_fileids
)

In [13]:
positive_features = pd.DataFrame(
    {'review':movie_reviews.raw(fileids=[f]),'label': 'pos'} for f in positive_fileids
)

In [14]:
data = pd.concat([positive_features, negative_features], ignore_index=True)


In [15]:
data

Unnamed: 0,review,label
0,films adapted from comic books have had plenty...,pos
1,every now and then a movie comes along from a ...,pos
2,you've got mail works alot better than it dese...,pos
3,""" jaws "" is a rare film that grabs your atten...",pos
4,moviemaking is a lot like being the general ma...,pos
...,...,...
1995,"if anything , "" stigmata "" should be taken as ...",neg
1996,"john boorman's "" zardoz "" is a goofy cinematic...",neg
1997,the kids in the hall are an acquired taste . \...,neg
1998,there was a time when john carpenter was a gre...,neg


### --------data Preprocessing--------

In [16]:
data['review'] = data['review'].str.lower()

In [17]:
data.review[2]

"you've got mail works alot better than it deserves to . \nin order to make the film a success , all they had to do was cast two extremely popular and attractive stars , have them share the screen for about two hours and then collect the profits . \nno real acting was involved and there is not an original or inventive bone in it's body ( it's basically a complete re-shoot of the shop around the corner , only adding a few modern twists ) . \nessentially , it goes against and defies all concepts of good contemporary filmmaking . \nit's overly sentimental and at times terribly mushy , not to mention very manipulative . \nbut oh , how enjoyable that manipulation is . \nbut there must be something other than the casting and manipulation that makes the movie work as well as it does , because i absolutely hated the previous ryan/hanks teaming , sleepless in seattle . \nit couldn't have been the directing , because both films were helmed by the same woman . \ni haven't quite yet figured out wh

In [18]:
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [22]:
data['review']=data['review'].apply(lambda x : decontracted(x))

In [23]:
punctuations = list(string.punctuation)
print(punctuations)

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']


In [24]:
import re
data['review'] = data['review'].apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '' , x))

In [25]:
data['review'] = data['review'].apply(lambda x: re.sub(r'[0-9]', '',x))

In [26]:
#verification
data.review[2]

'you have got mail works alot better than it deserves to  \nin order to make the film a success  all they had to do was cast two extremely popular and attractive stars  have them share the screen for about two hours and then collect the profits  \nno real acting was involved and there is not an original or inventive bone in it is body  it is basically a complete reshoot of the shop around the corner  only adding a few modern twists   \nessentially  it goes against and defies all concepts of good contemporary filmmaking  \nit is overly sentimental and at times terribly mushy  not to mention very manipulative  \nbut oh  how enjoyable that manipulation is  \nbut there must be something other than the casting and manipulation that makes the movie work as well as it does  because i absolutely hated the previous ryanhanks teaming  sleepless in seattle  \nit could not have been the directing  because both films were helmed by the same woman  \ni have not quite yet figured out what i liked so 

In [27]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [28]:
stop = stopwords.words('english')

In [29]:
print('Total stop words:',len(stop))

Total stop words: 179


In [30]:
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in stop])

data['review'] = data['review'].apply(lambda x: remove_stopwords(x))

In [31]:
data["review"] = data["review"].apply(lambda x: re.sub(' +', ' ', x))

In [32]:
data["review"] = data["review"].apply(lambda x: re.sub(r'aa+', 'a', x))

In [33]:
#verification
data.review[2]

'got mail works alot better deserves order make film success cast two extremely popular attractive stars share screen two hours collect profits real acting involved original inventive bone body basically complete reshoot shop around corner adding modern twists essentially goes defies concepts good contemporary filmmaking overly sentimental times terribly mushy mention manipulative oh enjoyable manipulation must something casting manipulation makes movie work well absolutely hated previous ryanhanks teaming sleepless seattle could directing films helmed woman quite yet figured liked much got mail really important like something much even question storyline cliched come tom hanks plays joe fox insanely likeable owner discount book chain meg ryan plays kathleen kelley even insanely likeable proprietor familyrun children book shop called nice homage shop around corner fox kelley soon become bitter rivals new fox books store opening right across block small business little know already love

In [34]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [35]:
data['review'] = data['review'].apply(lambda x: word_tokenize(x))

In [36]:
data['review']

0       [films, adapted, comic, books, plenty, success...
1       [every, movie, comes, along, suspect, studio, ...
2       [got, mail, works, alot, better, deserves, ord...
3       [jaws, rare, film, grabs, attention, shows, si...
4       [moviemaking, lot, like, general, manager, nfl...
                              ...                        
1996    [john, boorman, zardoz, goofy, cinematic, deba...
1997    [kids, hall, acquired, taste, took, least, sea...
1998    [time, john, carpenter, great, horror, directo...
1999    [two, party, guys, bob, heads, haddaway, dance...
Name: review, Length: 2000, dtype: object

In [37]:
stemmer = PorterStemmer()
def stem_words(list):
    return ([stemmer.stem(word) for word in list])

data["review"] = data["review"].apply(lambda x: stem_words(x))

In [38]:
data['review']

0       [film, adapt, comic, book, plenti, success, wh...
1       [everi, movi, come, along, suspect, studio, ev...
2       [got, mail, work, alot, better, deserv, order,...
3       [jaw, rare, film, grab, attent, show, singl, i...
4       [moviemak, lot, like, gener, manag, nfl, team,...
                              ...                        
1995    [anyth, stigmata, taken, warn, releas, similar...
1996    [john, boorman, zardoz, goofi, cinemat, debacl...
1997    [kid, hall, acquir, tast, took, least, season,...
1998    [time, john, carpent, great, horror, director,...
1999    [two, parti, guy, bob, head, haddaway, danc, h...
Name: review, Length: 2000, dtype: object

In [39]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [40]:
lemmatizer = WordNetLemmatizer()
def lemmatize_words(list):
    return ([lemmatizer.lemmatize(word) for word in list])
data["review"] = data["review"].apply(lambda text: lemmatize_words(text))

In [41]:
data['review'] = data["review"].apply(lambda x : " ".join(x))

##### Split data

In [42]:
from sklearn.model_selection import train_test_split
dataTrain, dataTest = train_test_split(data,train_size=0.8,random_state=1,stratify=data['label'])

In [43]:
dataTest.groupby(['label']).size()

label
neg    200
pos    200
dtype: int64

In [44]:
dataTrain.groupby(['label']).size()

label
neg    800
pos    800
dtype: int64

__Bag of Words using CountVectorizer:__


In [45]:
count_vector = CountVectorizer(stop_words='english',min_df = 5) 

In [46]:
X = count_vector.fit_transform(dataTrain['review'])

In [47]:
bow_vectorizer_features = X.toarray()
bow_frequency_matrix = pd.DataFrame(bow_vectorizer_features,columns=count_vector.get_feature_names())
bow_frequency_matrix

Unnamed: 0,abandon,abbi,abduct,abil,abl,abli,aboard,abod,abort,abound,...,zest,zetajon,zinger,zipper,zombi,zone,zoo,zoom,zucker,zwick
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1596,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1597,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1598,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
print(len(count_vector.get_feature_names()))

8400


### ---------Building Classifier----------

#### KNN Classifier

In [49]:
#import the class KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier

#instatiate the object
knn_classifier = KNeighborsClassifier(n_neighbors=2)

#perform the training process
knn_classifier.fit(bow_frequency_matrix,dataTrain['label'])



KNeighborsClassifier(n_neighbors=2)

In [50]:
#generate the document term matrix for the test set
#using the object learned from the train set
#import the metrics class for the performance measurement
from sklearn import metrics

#create the document term matrix
bow_test = count_vector.transform(dataTest['review'])

#prediction for the test set
bow_pred = knn_classifier.predict(bow_test)

#confusion matrix
print('***Confusion matrix')
print(metrics.confusion_matrix(dataTest['label'],bow_pred))

***Confusion matrix
[[187  13]
 [166  34]]


In [51]:
#recall
print('Recall')
print(metrics.recall_score(dataTest['label'],bow_pred,pos_label='pos'))

#precision
print('precision')
print(metrics.precision_score(dataTest['label'],bow_pred,pos_label='pos'))

#F1-Score
print('F1-Score')
print(metrics.f1_score(dataTest['label'],bow_pred,pos_label='pos'))

#accuracy rate
print('accuracy rate -->')
print(metrics.accuracy_score(dataTest['label'],bow_pred))

Recall
0.17
precision
0.723404255319149
F1-Score
0.27530364372469635
accuracy rate -->
0.5525


#### RandomForest Classifier

In [52]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()

#perform the training process
rf.fit(bow_frequency_matrix,dataTrain['label'])


tf_test = count_vector.transform(dataTest['review'])

#prediction for the test set
tf_pred = rf.predict(tf_test)

#confusion matrix
print('***Confusion matrix')
mcTestBis = metrics.confusion_matrix(dataTest['label'],tf_pred)
print(mcTestBis)

#recall
print('Recall')
print(metrics.recall_score(dataTest['label'],tf_pred,pos_label='pos'))

#precision
print('precision')
print(metrics.precision_score(dataTest['label'],tf_pred,pos_label='pos'))

#F1-Score
print('F1-Score')
print(metrics.f1_score(dataTest['label'],tf_pred,pos_label='pos'))

#accuracy rate
print('accuracy rate -->')
print(metrics.accuracy_score(dataTest['label'],tf_pred))

***Confusion matrix
[[166  34]
 [ 36 164]]
Recall
0.82
precision
0.8282828282828283
F1-Score
0.8241206030150754
accuracy rate -->
0.825


#### Gradient Boost Classifier

In [53]:
from sklearn.ensemble import GradientBoostingClassifier
gbc=GradientBoostingClassifier()
#perform the training process
gbc.fit(bow_frequency_matrix,dataTrain['label'])


tf_test = count_vector.transform(dataTest['review'])

#prediction for the test set
tf_pred = gbc.predict(tf_test)

#confusion matrix
print('***Confusion matrix')
mcTestBis = metrics.confusion_matrix(dataTest['label'],tf_pred)
print(mcTestBis)

#recall
print('Recall')
print(metrics.recall_score(dataTest['label'],tf_pred,pos_label='pos'))

#precision
print('precision')
print(metrics.precision_score(dataTest['label'],tf_pred,pos_label='pos'))

#F1-Score
print('F1-Score')
print(metrics.f1_score(dataTest['label'],tf_pred,pos_label='pos'))

#accuracy rate
print('accuracy rate -->')
print(metrics.accuracy_score(dataTest['label'],tf_pred))

***Confusion matrix
[[161  39]
 [ 44 156]]
Recall
0.78
precision
0.8
F1-Score
0.7898734177215191
accuracy rate -->
0.7925


#### Logistic Regression

In [54]:
from sklearn.linear_model import LogisticRegression

#instatiate the object
LR = LogisticRegression()

#perform the training process
LR.fit(bow_frequency_matrix,dataTrain['label'])


tf_test = count_vector.transform(dataTest['review'])

#prediction for the test set
tf_pred = LR.predict(tf_test)

#confusion matrix
print('***Confusion matrix')
mcTestBis = metrics.confusion_matrix(dataTest['label'],tf_pred)
print(mcTestBis)

#recall
print('Recall')
print(metrics.recall_score(dataTest['label'],tf_pred,pos_label='pos'))

#precision
print('precision')
print(metrics.precision_score(dataTest['label'],tf_pred,pos_label='pos'))

#F1-Score
print('F1-Score')
print(metrics.f1_score(dataTest['label'],tf_pred,pos_label='pos'))

#accuracy rate
print('accuracy rate -->')
print(metrics.accuracy_score(dataTest['label'],tf_pred))

***Confusion matrix
[[173  27]
 [ 38 162]]
Recall
0.81
precision
0.8571428571428571
F1-Score
0.832904884318766
accuracy rate -->
0.8375


### --------Dimensionality Reduction---------

In [55]:
coef_abs = np.abs(LR.coef_[0,:])

In [56]:
thresholds = np.percentile(coef_abs,[0,25,50,75,90,100])
print(thresholds)

[4.43532876e-06 1.76406495e-02 3.94395699e-02 7.80641686e-02
 1.35725669e-01 8.16433792e-01]


In [57]:
indices = np.where(coef_abs > thresholds[2])
print(len(indices[0]))

4200


In [58]:
mdtTrainTer = bow_vectorizer_features[:,indices[0]]#toutes les lignes et uniquement les colones 
mdtTestTer = bow_test[:,indices[0]]

#checking
print(mdtTrainTer.shape)
print(mdtTestTer.shape)

(1600, 4200)
(400, 4200)


In [59]:
#instatiate the object
modelTer = LogisticRegression()

#train a new classifier with selected terms
modelTer.fit(mdtTrainTer,dataTrain['label'])

#prediction on the test set
predTestTer = modelTer.predict(mdtTestTer)

#confusion matrix
mcTestTer = metrics.confusion_matrix(dataTest['label'],predTestTer)
print(mcTestTer)

[[173  27]
 [ 37 163]]


### -------Deployement---------

In [60]:
doc = ["This was so bad I couldn't finish it. The actresses are so bad at acting it feels like a bad comedy from minute one. The high rated reviews is obviously from friend/family and is pure BS."]
#document preprocessing
doc = decontracted(str(doc))
doc = str(doc).lower()
doc = re.sub('[%s]' % re.escape(string.punctuation), '' , str(doc))
doc = re.sub(r'[0-9]', '',str(doc))
doc  = re.sub(' +', ' ', str(doc))
doc = remove_stopwords(str(doc))

#get its description
desc = count_vector.transform([doc])
print(desc)

  (0, 497)	3
  (0, 2736)	1
  (0, 3445)	1
  (0, 4291)	1
  (0, 5828)	1


In [61]:
#dense representation
dense_desc = desc.toarray()

#apply var. selection
dense_sel = dense_desc[:,indices[0]]

In [62]:
dense_desc 

array([[0, 0, 0, ..., 0, 0, 0]])

In [63]:
pred_doc = modelTer.predict(dense_sel)
print(pred_doc)

['neg']


In [64]:
pred_proba = modelTer.predict_proba(dense_sel)
print(pred_proba)

[[0.97513163 0.02486837]]
