In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer

In [2]:
train_df = pd.read_csv('/Users/manishatakale/Downloads/SentimentAnalysis/Proj1_IMDB_Data/Data/IMDB/train_data.csv',
                       sep='\t')

In [3]:
train_df.head()

Unnamed: 0,reviews,sentiments
0,For a movie that gets no respect there sure ar...,positive
1,Bizarre horror movie filled with famous faces ...,positive
2,"A solid, if unremarkable film. Matthau, as Ein...",positive
3,It's a strange feeling to sit alone in a theat...,positive
4,"You probably all already know this by now, but...",positive


In [4]:
#Shuffle datafram

train_df = train_df.sample(frac=1).reset_index(drop=True)

In [5]:
train_df

Unnamed: 0,reviews,sentiments
0,I hired out Hybrid on the weekend. What a disa...,negative
1,This could be a cute movie for kids My grandso...,negative
2,This is the fifth von Trier film I have seen. ...,positive
3,"When ""Good Times"" premiered in 1974, it was on...",positive
4,I took a flyer in renting this movie but I got...,positive
...,...,...
24995,"Yes, you guessed it. Another movie where ident...",negative
24996,This film takes you on one family's impossible...,positive
24997,It seems a shame that Greta Garbo ended her il...,negative
24998,"This movie was made in 1948, but it still ring...",positive


In [6]:
test_df = pd.read_csv('/Users/manishatakale/Downloads/SentimentAnalysis/Proj1_IMDB_Data/Data/IMDB/test_data.csv', 
                        sep='\t')

In [7]:
#Shuffle datafram

test_df  = test_df .sample(frac=1).reset_index(drop=True)

In [8]:
test_df

Unnamed: 0,reviews,sentiments
0,Although it doesn't seem very promising for a ...,positive
1,Watched Uzumaki last night and right away was ...,positive
2,"For me, the best & most memorable movies are o...",positive
3,This is the best movie I've ever seen. And I'v...,positive
4,"At one point in this waste of celluloid, Charl...",negative
...,...,...
24995,"Okay wait let me get this street, there are ac...",negative
24996,"This game requires stealth, smart, and a stead...",positive
24997,"This show was a really good one in many ways, ...",positive
24998,On the cusp of being insufferable. Somehow I s...,negative


In [9]:
all_data = pd.concat([train_df,test_df], axis=0)

In [10]:
all_data.isna().sum()

reviews       0
sentiments    0
dtype: int64

In [11]:
all_data.sentiments.value_counts()

negative    25000
positive    25000
Name: sentiments, dtype: int64

### Cleaning Reviews (text) Data

In [12]:
## function to remove html tags from text

def remove_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

In [13]:
## function to remove special charaters from text

def clean_text(text):
    regex = re.compile('[^a-zA-Z]')
    return regex.sub(' ', text)

In [14]:
## function to remove noise from text
def denoise_text(text):
    text = remove_html_tags(text)
    text = clean_text(text)
    text = text.lower()
    return text

In [15]:
all_data['reviews']=all_data['reviews'].apply(denoise_text)
#test_df['reviews']=test_df['reviews'].apply(denoise_text)

### Stemming

In [16]:
def simple_stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text

In [17]:
all_data['reviews']=all_data['reviews'].apply(simple_stemmer)

In [18]:
all_data.head()

Unnamed: 0,reviews,sentiments
0,i hire out hybrid on the weekend what a disapp...,negative
1,thi could be a cute movi for kid my grandson w...,negative
2,thi is the fifth von trier film i have seen i ...,positive
3,when good time premier in it wa one the first ...,positive
4,i took a flyer in rent thi movi but i gotta sa...,positive


### Remove Stopwords

In [19]:
stop=set(stopwords.words('english'))
stopword_list=nltk.corpus.stopwords.words('english')

In [20]:
#removing the stopwords
def remove_stopwords(text):
    
    tokenizer=ToktokTokenizer()
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

In [21]:
all_data['reviews']=all_data['reviews'].apply(remove_stopwords)

In [22]:
all_data.shape

(50000, 2)

In [23]:
lables = all_data.sentiments[0:]

In [24]:
lables.shape

(50000,)

In [25]:
train_data = all_data.reviews[:40000]
test_data = all_data.reviews[40000:]

In [30]:
cv=CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3))
cv_train_reviews=cv.fit_transform(train_data)
cv_test_reviews=cv.transform(test_data)
print(cv_train_reviews.shape)
print(cv_test_reviews.shape)

(40000, 5859968)
(10000, 5859968)


In [31]:
tv=TfidfVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(1,3))
tv_train_reviews=tv.fit_transform(train_data)
tv_test_reviews=tv.transform(test_data)
print(tv_train_reviews.shape)
print(tv_test_reviews.shape)

(40000, 5859968)
(10000, 5859968)


In [32]:
lb=LabelBinarizer()

sentiment_labels=lb.fit_transform(all_data['sentiments'])
print(sentiment_labels.shape)

(50000, 1)


In [34]:
train_lb = sentiment_labels[:40000]
test_lb = sentiment_labels[40000:]

### Building a Classifier  - Logistic Regression

In [35]:
lr=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)

In [36]:
lr_bow=lr.fit(cv_train_reviews,np.ravel(train_lb))

In [37]:
lr_tfidf=lr.fit(tv_train_reviews,np.ravel(train_lb))

### Predictions

In [38]:
lr_bow_pred = lr_bow.predict(cv_test_reviews)

In [39]:
lr_tfidf_pred = lr_tfidf.predict(tv_test_reviews)

In [40]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

### Accuracy Score for Bag of words features

In [46]:
lr_bow_score=accuracy_score(test_lb,lr_bow_pred)
print("lr_bow_score :",lr_bow_score)

lr_bow_score : 0.7455


### Accuracy Score for tfidf features

In [47]:
lr_tfidf_score=accuracy_score(test_lb,lr_tfidf_pred)
print("lr_tfidf_score :",lr_tfidf_score)

lr_tfidf_score : 0.7431


### Classification Report for Bag of words features

In [49]:
lr_bow_report=classification_report(test_lb,lr_bow_pred,target_names=['Positive','Negative'])
print(lr_bow_report)

              precision    recall  f1-score   support

    Positive       0.74      0.76      0.75      4963
    Negative       0.75      0.73      0.74      5037

    accuracy                           0.75     10000
   macro avg       0.75      0.75      0.75     10000
weighted avg       0.75      0.75      0.75     10000



### Classification Report for  tfidf features

In [50]:
lr_tfidf_report=classification_report(test_lb,lr_tfidf_pred,target_names=['Positive','Negative'])
print(lr_tfidf_report)

              precision    recall  f1-score   support

    Positive       0.72      0.79      0.75      4963
    Negative       0.77      0.70      0.73      5037

    accuracy                           0.74     10000
   macro avg       0.75      0.74      0.74     10000
weighted avg       0.75      0.74      0.74     10000

