In [1]:
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive/My Drive/Colab Notebooks/kaggle/Imdb movie reviews

Mounted at /gdrive
/gdrive/My Drive/Colab Notebooks/kaggle/Imdb movie reviews


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import random
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [3]:
data=pd.read_csv("./data/Train.csv")
print(data.info())
print("x"*50)
print(data.head())
print("x"*50)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    40000 non-null  object
 1   label   40000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 625.1+ KB
None
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
                                                text  label
0  I grew up (b. 1965) watching and loving the Th...      0
1  When I put this movie in my DVD player, and sa...      0
2  Why do people who do not know what a particula...      0
3  Even though I have great interest in Biblical ...      0
4  Im a die hard Dads Army fan and nothing will e...      1
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx


In [4]:
data.label.value_counts()

0    20019
1    19981
Name: label, dtype: int64

In [5]:
def preprocess(document):
  # lower
  document= document.lower()
  # tokenize
  words=nltk.tokenize.word_tokenize(document)
  # stopwords
  words=[word for word in words if word not in nltk.corpus.stopwords.words('english')]
  # lemmatize
  lemmatizer=nltk.stem.WordNetLemmatizer()
  lemmatized_words=[lemmatizer.lemmatize(word) for word in words]
  # reconstruct the document
  refined_doc=" ".join(lemmatized_words)
  return refined_doc


In [6]:
data["text_clean"]=data.text.apply(preprocess)

In [7]:
data.head()

Unnamed: 0,text,label,text_clean
0,I grew up (b. 1965) watching and loving the Th...,0,grew ( b . 1965 ) watching loving thunderbird ...
1,"When I put this movie in my DVD player, and sa...",0,"put movie dvd player , sat coke chip , expecta..."
2,Why do people who do not know what a particula...,0,people know particular time past like feel nee...
3,Even though I have great interest in Biblical ...,0,"even though great interest biblical movie , bo..."
4,Im a die hard Dads Army fan and nothing will e...,1,im die hard dad army fan nothing ever change ....


In [8]:
X_train, X_test, y_train, y_test = train_test_split(data['text_clean'],data['label'], test_size = 0.2, random_state = 1,shuffle=True)

In [9]:
X_train.head()

21721    come surprise larisa shepitko married elem kli...
15576    dragged movie four year ago french actress fri...
28716    movie never going list top 50 film time , 're ...
204      though watched salò , know excrement taste tar...
36677    think almost need say . feel obliged explain a...
Name: text_clean, dtype: object

In [10]:
vectorizer = TfidfVectorizer()
vectorizer.fit(data['text_clean'])
X_train = vectorizer.transform(X_train)

In [None]:
# print(X_train)

In [11]:
from sklearn import svm
svm = svm.SVC(C=10)
svm.fit(X_train, y_train)

SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [14]:
from sklearn.metrics import confusion_matrix,classification_report
X_test = vectorizer.transform(X_test)
y_pred = svm.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.88      0.89      4035
           1       0.88      0.91      0.90      3965

    accuracy                           0.90      8000
   macro avg       0.90      0.90      0.90      8000
weighted avg       0.90      0.90      0.90      8000



In [15]:
# test

In [16]:
test=pd.read_csv("./data/Test.csv")
test["text_clean"]=test.text.apply(preprocess)
X_test_vectorized = vectorizer.transform(test["text_clean"])
y_pred = svm.predict(X_test_vectorized)
print(classification_report(test['label'], y_pred))

              precision    recall  f1-score   support

           0       0.91      0.89      0.90      2495
           1       0.90      0.91      0.90      2505

    accuracy                           0.90      5000
   macro avg       0.90      0.90      0.90      5000
weighted avg       0.90      0.90      0.90      5000



## Words that are statistically significant

In [21]:
from sklearn.feature_selection import chi2
tfidf=vectorizer.fit_transform(data['text_clean'])
chi2_scores=chi2(tfidf,data['label'])[0]

In [57]:
scores=list(zip(vectorizer.get_feature_names(),chi2_scores))
scores=sorted(scores, key=lambda x:x[1])
scores[:-20:-1]
# scores[-20:]

[('bad', 245.74946533135835),
 ('worst', 209.17129851156676),
 ('waste', 146.55048664791832),
 ('awful', 131.87434406704767),
 ('great', 122.35571406739234),
 ('terrible', 114.51830667691625),
 ('boring', 90.94733996234807),
 ('horrible', 89.72861212206871),
 ('stupid', 87.28801263982967),
 ('excellent', 86.71489451816946),
 ('worse', 84.21741907987365),
 ('wonderful', 76.86919408853947),
 ('poor', 70.21250456369941),
 ('nothing', 62.82832456649747),
 ('crap', 61.900811093641906),
 ('minute', 60.84110826886252),
 ('poorly', 58.50883873043769),
 ('loved', 56.73853585046163),
 ('love', 56.62541966396037)]

In [44]:
# print(tfidf)

In [59]:
# ptest
tfidf=vectorizer.fit_transform(data['text_clean'])
p_scores=chi2(tfidf,data['label'])[1]
values=list(zip(vectorizer.get_feature_names(),p_scores))
values=sorted(values, key=lambda x:x[1])
values[:20]

[('bad', 2.1934402634347138e-55),
 ('worst', 2.0828914829937113e-47),
 ('waste', 9.840067484297108e-34),
 ('awful', 1.593958127794119e-30),
 ('great', 1.9294899703938536e-28),
 ('terrible', 1.0033566652635884e-26),
 ('boring', 1.4754705268822413e-21),
 ('horrible', 2.731756069409688e-21),
 ('stupid', 9.381478133471248e-21),
 ('excellent', 1.2534921060350062e-20),
 ('worse', 4.432480980657416e-20),
 ('wonderful', 1.826642834828042e-18),
 ('poor', 5.324783024126789e-17),
 ('nothing', 2.2553201220791546e-15),
 ('crap', 3.612007371412505e-15),
 ('minute', 6.187233367359084e-15),
 ('poorly', 2.0237970556811188e-14),
 ('loved', 4.977844228635051e-14),
 ('love', 5.2725829676609273e-14),
 ('money', 8.07286950254579e-14)]

In [43]:
test_sen=preprocess("It's one time watch")
# test_sen_vectorized = vectorizer.transform(pd.Series(test_sen))
test_sen_vectorized = vectorizer.transform(np.array([test_sen]))
svm.predict(test_sen_vectorized)

array([1])