In [2]:
import pandas as pd

In [3]:
data = pd.read_csv("IMDB Dataset.csv")
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [6]:
data["sentiment"].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

### Text Preprocessing

#### lowercasing

In [9]:
data1 = data.copy()
data1 = data1["review"].str.lower()
data1 = pd.concat([data1, data["sentiment"]], axis = "columns")
data1.head(10)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
5,"probably my all-time favorite movie, a story o...",positive
6,i sure would like to see a resurrection of a u...,positive
7,"this show was an amazing, fresh & innovative i...",negative
8,encouraged by the positive comments about this...,negative
9,if you like original gut wrenching laughter yo...,positive


#### removing html tags

In [11]:
import re
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)

In [12]:
remove_html_tags("<html><body><p> Movie 1</p><p> Actor - Aamir Khan</p><p> Click here to <a href='http://google.com'>download</a></p></body></html>")

' Movie 1 Actor - Aamir Khan Click here to download'

In [13]:
data1["review"] = data1["review"].apply(remove_html_tags)
data1.head(10)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
5,"probably my all-time favorite movie, a story o...",positive
6,i sure would like to see a resurrection of a u...,positive
7,"this show was an amazing, fresh & innovative i...",negative
8,encouraged by the positive comments about this...,negative
9,if you like original gut wrenching laughter yo...,positive


#### Removing URLs

In [15]:
i = 1
for elem in data1["review"]:
    if "www." in elem:
        if ".com" in elem:
            print(elem)
            i += 1
    if i ==3:
        break

i really think i should make my case and have every(horror and or cult)movie-buff go and see this movie...i did!it-is-excellent: very atmospheric and unsettling and scary...incridible how they could make such a gem of a film with the very low(read-"no"!)-budget they had....synopsis taken from website: "one morning, an old man wanders out into the woods in search of his runaway cat. he finds instead a child without parents and a murder with no corpse..."on this website(imdb) there is no trailer, but i will leave a link here to the site of the movie itself where there is a trailer which is quite unsettling so please go and check it out...www.softfordigging.com
this show has to be my favorite out of all the 80's horror tv shows. like tales from the darkside, also from the same creators, this show is a rare gem. if you agree with me, please sign this petition i started, to get the word out for monsters and get it out on dvd. here is the petition address: www.petitiononline.com/19784444/pet

In [16]:
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

In [17]:
remove_url("nisarg_web www.mysite.com presents ")

'nisarg_web  presents '

In [18]:
data1["review"] = data1["review"].apply(remove_url)

In [19]:
i = 1
for elem in data1["review"]:
    if "www." in elem:
        if ".com" in elem:
            print(elem)
            i += 1
    if i ==3:
        break

#### Removing punctuations

In [21]:
import string,time
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [22]:
exclude = list(string.punctuation)

In [23]:
def remove_punc(text):
    for elem in exclude:
        text = text.replace(elem,'')
    return text

In [24]:
start = time.time()
print(remove_punc("[,sdno ,smdfsl,.sdf"))
time1 = time.time() - start
print(time1*50000)

sdno smdfslsdf
0.0


In [25]:
data1["review"] = data1["review"].apply(remove_punc)
data1.head(10)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive
5,probably my alltime favorite movie a story of ...,positive
6,i sure would like to see a resurrection of a u...,positive
7,this show was an amazing fresh innovative ide...,negative
8,encouraged by the positive comments about this...,negative
9,if you like original gut wrenching laughter yo...,positive


#### Chat word treatment

In [27]:
# for elem in data1.review:
#     if "afaik" in elem:
#         temp = elem.replace("afaik", "AAAFFFAAAIIIIKKKK")
#         print(temp)
#     if "lmao" in elem:
#         temp = elem.replace("lmao", "LLLLMMMMAAAAOOOO")
#         print(temp)

In [28]:
def chat_conversion(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

#### Spellcheck

In [30]:
from textblob import TextBlob

In [31]:
TextBlob("the movvie wass exxtremely verryy good").correct().string

'the movie was extremely very good'

In [32]:
i = 0
for elem in data1.review:
    if "goodd" in elem:
        # print("AAAAA")
        i += 1
print(i)

16


In [33]:
# data1['review'] = data1['review'].apply(lambda x: str(TextBlob(x).correct()))

In [34]:
i = 0
for elem in data1.review:
    if "goodd" in elem:
        # print("AAAAA")
        i += 1
print(i)

16


#### Stopword Removal(Note - created a separate copy if POS to be applied)

In [82]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [84]:
data2 = data1.copy()

In [86]:
def remove_stopwords(sentence):
    # Split the sentence into words
    words = sentence.split()
    
    # Filter out stopwords
    filtered_words = [word for word in words if word not in stop_words]
    
    # Join the words back into a sentence
    return ' '.join(filtered_words)

In [88]:
start = time.time()
remove_stopwords("A and the are some of the stopwords in english language")
time1 = time.time() - start
print(time1*50000)

0.0


In [92]:
data2["review"] = data2["review"].apply(remove_stopwords)

In [142]:
data2.head(10)

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically theres family little boy jake thinks...,negative
4,petter matteis love time money visually stunni...,positive
5,probably alltime favorite movie story selfless...,positive
6,sure would like see resurrection dated seahunt...,positive
7,show amazing fresh innovative idea 70s first a...,negative
8,encouraged positive comments film looking forw...,negative
9,like original gut wrenching laughter like movi...,positive


#### Tokenization

In [100]:
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [102]:
sent1 = 'I am going to visit delhi!'
word_tokenize(sent1)

['I', 'am', 'going', 'to', 'visit', 'delhi', '!']

In [104]:
data3 = data2.copy()

In [108]:
data3["review"] = data3["review"].apply(word_tokenize)

In [110]:
data3.head()

Unnamed: 0,review,sentiment
0,"[one, reviewers, mentioned, watching, 1, oz, e...",positive
1,"[wonderful, little, production, filming, techn...",positive
2,"[thought, wonderful, way, spend, time, hot, su...",positive
3,"[basically, theres, family, little, boy, jake,...",negative
4,"[petter, matteis, love, time, money, visually,...",positive


#### lemmatization

In [124]:
from nltk import WordNetLemmatizer
wnl = WordNetLemmatizer()
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...


True

In [132]:
def lemma_func(word_list):
    for i in range(len(word_list)):
        word_list[i] = wnl.lemmatize(word_list[i])
    return word_list

In [140]:
lemma_func(["watching","churches","kings"])

['watching', 'church', 'king']

In [136]:
data3["review"] = data3["review"].apply(lemma_func)

In [144]:
data3.head(10)

Unnamed: 0,review,sentiment
0,"[one, reviewer, mentioned, watching, 1, oz, ep...",positive
1,"[wonderful, little, production, filming, techn...",positive
2,"[thought, wonderful, way, spend, time, hot, su...",positive
3,"[basically, there, family, little, boy, jake, ...",negative
4,"[petter, matteis, love, time, money, visually,...",positive
5,"[probably, alltime, favorite, movie, story, se...",positive
6,"[sure, would, like, see, resurrection, dated, ...",positive
7,"[show, amazing, fresh, innovative, idea, 70, f...",negative
8,"[encouraged, positive, comment, film, looking,...",negative
9,"[like, original, gut, wrenching, laughter, lik...",positive


### Text Representation


#### 1) BOW

In [338]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [330]:
from sklearn.feature_extraction.text import CountVectorizer

In [320]:
data4 = data3.copy()

In [328]:
X_train, X_test, y_train, y_test = train_test_split(data4['review'], data4['sentiment'], test_size = 0.2)

In [332]:
#Count vectorizer for bag of words
cv=CountVectorizer()
#transformed train reviews
cv_train_reviews=cv.fit_transform(X_train)
#transformed test reviews
cv_test_reviews=cv.transform(X_test)

print('BOW_cv_train:',cv_train_reviews.shape)
print('BOW_cv_test:',cv_test_reviews.shape)
#vocab=cv.get_feature_names()-toget feature names

BOW_cv_train: (40000, 181274)
BOW_cv_test: (10000, 181274)


In [342]:
#training the model
lr=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)
#Fitting the model for Bag of words
lr_bow=lr.fit(cv_train_reviews,y_train)
print(lr_bow)

LogisticRegression(C=1, max_iter=500, random_state=42)


In [346]:
lr_bow.score(cv_test_reviews, y_test)

0.8858

In [354]:
confusion_matrix(y_test, lr.predict(cv_test_reviews))

array([[4469,  573],
       [ 569, 4389]], dtype=int64)

#### 2) tf-idf

In [357]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [359]:
tv=TfidfVectorizer()

tv_train_reviews=tv.fit_transform(X_train)

tv_test_reviews=tv.transform(X_test)
print('Tfidf_train:',tv_train_reviews.shape)
print('Tfidf_test:',tv_test_reviews.shape)

Tfidf_train: (40000, 181274)
Tfidf_test: (10000, 181274)


In [365]:
lr_tfidf = lr.fit(tv_train_reviews,y_train)

In [367]:
lr_tfidf.score(tv_test_reviews, y_test)

0.8944

In [369]:
confusion_matrix(y_test, lr_tfidf.predict(cv_test_reviews))

array([[4232,  810],
       [ 439, 4519]], dtype=int64)

#### 3) n-grams

In [384]:
cv=CountVectorizer(binary=False,ngram_range=(1,3))
cv_train_reviews=cv.fit_transform(X_train)
cv_test_reviews=cv.transform(X_test)

In [385]:
#training the model
lr=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)

lr_bow=lr.fit(cv_train_reviews,y_train)

In [386]:
lr_bow.score(cv_test_reviews, y_test)

0.9011

In [390]:
tv=TfidfVectorizer(use_idf=True,ngram_range=(1,3))

tv_train_reviews=tv.fit_transform(X_train)

tv_test_reviews=tv.transform(X_test)

In [391]:
lr_tfidf = lr.fit(tv_train_reviews,y_train)

In [392]:
lr_tfidf.score(tv_test_reviews, y_test)

0.8843