In [1]:
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive/My Drive/Colab Notebooks/kaggle/Imdb movie reviews

Mounted at /gdrive
/gdrive/My Drive/Colab Notebooks/kaggle/Imdb movie reviews


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.linear_model import LogisticRegression
import os
import random
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.keyedvectors import KeyedVectors
from gensim.models import word2vec
from gensim.models.word2vec import Word2Vec



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [3]:
data=pd.read_csv("./data/Train.csv")
test=pd.read_csv("./data/Test.csv")
print(data.info())
print("x"*50)
print(data.head())
print("x"*50)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    40000 non-null  object
 1   label   40000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 625.1+ KB
None
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
                                                text  label
0  I grew up (b. 1965) watching and loving the Th...      0
1  When I put this movie in my DVD player, and sa...      0
2  Why do people who do not know what a particula...      0
3  Even though I have great interest in Biblical ...      0
4  Im a die hard Dads Army fan and nothing will e...      1
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx


In [4]:
data.label.value_counts()

0    20019
1    19981
Name: label, dtype: int64

In [5]:
def preprocess(document):
  # lower
  document= document.lower()
  # tokenize
  words=nltk.tokenize.word_tokenize(document)
  # stopwords
  words=[word for word in words if word not in nltk.corpus.stopwords.words('english')]
  # lemmatize
  lemmatizer=nltk.stem.WordNetLemmatizer()
  lemmatized_words=[lemmatizer.lemmatize(word) for word in words]
  # reconstruct the document
  refined_doc=" ".join(lemmatized_words)
  return refined_doc


In [6]:
data["text_clean"]=data.text.apply(preprocess)

In [7]:
data.head()

Unnamed: 0,text,label,text_clean
0,I grew up (b. 1965) watching and loving the Th...,0,grew ( b . 1965 ) watching loving thunderbird ...
1,"When I put this movie in my DVD player, and sa...",0,"put movie dvd player , sat coke chip , expecta..."
2,Why do people who do not know what a particula...,0,people know particular time past like feel nee...
3,Even though I have great interest in Biblical ...,0,"even though great interest biblical movie , bo..."
4,Im a die hard Dads Army fan and nothing will e...,1,im die hard dad army fan nothing ever change ....


## Apply predefined Glove word embeddings

In [None]:
glove_input_file = 'glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.w2vformat.txt'
glove2word2vec(glove_input_file, word2vec_output_file)

(400000, 100)

In [None]:
glove_model = KeyedVectors.load_word2vec_format("glove.6B.100d.w2vformat.txt", binary=False)

In [14]:
def sent_vec(sent):
    sent=nltk.tokenize.word_tokenize(sent)
    wv_res = np.zeros(glove_model.vector_size)
    ctr = 1
    for w in sent:
        if w in glove_model:
            ctr += 1
            wv_res += glove_model[w]
    wv_res = wv_res/ctr
    return wv_res


# def sent_vec(sent):
#     sent=nltk.tokenize.word_tokenize(sent)
#     wv_res=[]
#     for w in sent:
#         if w in glove_model:
#          wv_res.extend(glove_model[w])
#         else:
#           wv_res.extend(np.zeros(glove_model.vector_size))
#     return wv_res

In [None]:
X=[sent_vec(text) for text in data.text]
y=data.label

In [None]:
logreg = LogisticRegression(max_iter=500, random_state=42)
logreg.fit(X,y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
y_pred = logreg.predict(X)
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.81      0.81     20019
           1       0.81      0.80      0.80     19981

    accuracy                           0.81     40000
   macro avg       0.81      0.81      0.81     40000
weighted avg       0.81      0.81      0.81     40000



In [None]:
test=pd.read_csv("./data/Test.csv")
test["text_clean"]=test.text.apply(preprocess)
X_test_vectorized = [sent_vec(text) for text in test.text]
y_pred = logreg.predict(X_test_vectorized)
print(classification_report(test['label'], y_pred))

              precision    recall  f1-score   support

           0       0.80      0.81      0.81      2495
           1       0.81      0.80      0.81      2505

    accuracy                           0.81      5000
   macro avg       0.81      0.81      0.81      5000
weighted avg       0.81      0.81      0.81      5000



## Pretrained Word2vec

In [9]:
sentences=word2vec.Text8Corpus('text8')
model = Word2Vec(sentences)

In [19]:
def sent_vec(sent):
    sent=nltk.tokenize.word_tokenize(sent)
    wv_res = np.zeros(model.vector_size)
    ctr = 1
    for w in sent:
        if w in model:
            ctr += 1
            wv_res += model[w]
    wv_res = wv_res/ctr
    return wv_res

In [20]:
X=[sent_vec(text) for text in data.text]
y=data.label

  
  


In [21]:
logreg = LogisticRegression(max_iter=500, random_state=42)
logreg.fit(X,y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [22]:
y_pred = logreg.predict(X)
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.75      0.75     20019
           1       0.75      0.75      0.75     19981

    accuracy                           0.75     40000
   macro avg       0.75      0.75      0.75     40000
weighted avg       0.75      0.75      0.75     40000



In [24]:
test=pd.read_csv("./data/Test.csv")
test["text_clean"]=test.text.apply(preprocess)
X_test_vectorized = [sent_vec(text) for text in test.text]
y_pred = logreg.predict(X_test_vectorized)
print(classification_report(test['label'], y_pred))

  
  


              precision    recall  f1-score   support

           0       0.75      0.77      0.76      2495
           1       0.76      0.75      0.76      2505

    accuracy                           0.76      5000
   macro avg       0.76      0.76      0.76      5000
weighted avg       0.76      0.76      0.76      5000

