<a href="https://colab.research.google.com/github/nsy2nv/model/blob/master/sentiment_analysis_IMDBdataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize

In [2]:
import spacy
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from bs4 import BeautifulSoup

In [3]:
import pandas as pd
import numpy as np


In [4]:
data = pd.read_csv('/content/IMDB Dataset.csv')


In [5]:
data.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [6]:
data.columns

Index(['review', 'sentiment'], dtype='object')

In [7]:
data.isnull().any()

review       False
sentiment    False
dtype: bool

In [8]:
data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [9]:
data.groupby('sentiment').count()

Unnamed: 0_level_0,review
sentiment,Unnamed: 1_level_1
negative,25000
positive,25000


**Text Normalization**

In [10]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [11]:
#Tokenization of text
tokenizers=ToktokTokenizer()
#Setting English stopwords
stopwords=nltk.corpus.stopwords.words('english')

In [12]:
#Cleaning off the dirty text and characters
def clean_text(text):
  soup = BeautifulSoup(text, "html.parser")
  text = soup.get_text() # Extract only the text content of the html without the tags
  text = re.sub('\[[^]]*\]', '', text) #Removes the square brackets and the texts within the brackets
  return text


In [13]:
#Apply function on review column
data['review']=data['review'].apply(clean_text)

  soup = BeautifulSoup(text, "html.parser")


In [14]:
data.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


**Stemming the Text**

In [15]:
#Stemming of the text data
def my_stem(text):
    portStem=nltk.porter.PorterStemmer()
    text= ' '.join([portStem.stem(word) for word in text.split()])
    return text

In [16]:
data['review'] = data['review'].apply(my_stem)

In [17]:
data.head(10)

Unnamed: 0,review,sentiment
0,one of the other review ha mention that after ...,positive
1,a wonder littl production. the film techniqu i...,positive
2,i thought thi wa a wonder way to spend time on...,positive
3,basic there' a famili where a littl boy (jake)...,negative
4,"petter mattei' ""love in the time of money"" is ...",positive
5,"probabl my all-tim favorit movie, a stori of s...",positive
6,i sure would like to see a resurrect of a up d...,positive
7,"thi show wa an amazing, fresh & innov idea in ...",negative
8,encourag by the posit comment about thi film o...,negative
9,if you like origin gut wrench laughter you wil...,positive


**Removal of Stopwords**

In [18]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [19]:
# set stopwords to english

stop_words = set(stopwords.words('english'))

print(stop_words)

{'ve', "wasn't", 'or', 'isn', 'to', 'will', 'were', 'having', 'then', 'can', 'by', 'over', 'an', "shan't", 'out', 'while', 'doesn', 'them', 'won', 're', 'that', 'into', "haven't", 'nor', "couldn't", 'we', 'during', 'both', 'its', 'under', 'once', 'ma', 'so', "that'll", 'again', 'am', 'in', 'being', 'herself', 'been', 'he', 'weren', 'of', 'these', "hadn't", 'as', 'themselves', 'any', "hasn't", 'below', 'own', 'his', "you're", 'our', 'your', 'each', 'and', 'for', "you've", 'until', 'was', "isn't", 'through', 'which', "wouldn't", 'it', 'who', 'him', 'there', 'here', 'some', "didn't", 'whom', 'further', 'ourselves', 'm', 'why', 'i', 'ours', 'aren', 'above', 'are', 'couldn', 'with', 'same', 'did', 'is', 'does', 'a', 'only', 'haven', 'itself', 'needn', 'had', 'her', "weren't", 'off', 'because', 'shan', 'y', 'just', 'll', 'other', 'what', 'hadn', "mightn't", 'few', 'if', 'the', 'before', 'doing', 'at', 'this', 'such', 'theirs', 'yourselves', 'when', 'no', "mustn't", 'myself', 'mustn', 'all', 

In [20]:
#removing the stopwords
def removing_stopwords(text, is_lower_case=False):
    #Tokenization of text
    tokenizers=ToktokTokenizer()
    #Setting English stopwords
    tokens = tokenizers.tokenize(text) #text is broken into tokens
    tokens = [token.strip() for token in tokens] #list of tokens is stripped off of any leading or trailing whitespace
    if is_lower_case:
        filter_tokens = [token for token in tokens if token not in stop_words]
    else:
        filter_tokens = [token for token in tokens if token.lower() not in stop_words]
    filtered_text = ' '.join(filter_tokens)
    return filtered_text

In [21]:
# Applying removing_stopwords function on review column

data['review'] = data['review'].apply(removing_stopwords)

In [22]:
data.head(10)

Unnamed: 0,review,sentiment
0,one review ha mention watch 1 oz episod ' hook...,positive
1,wonder littl production. film techniqu veri un...,positive
2,thought thi wa wonder way spend time hot summe...,positive
3,basic ' famili littl boy ( jake ) think ' zomb...,negative
4,"petter mattei ' "" love time money "" visual stu...",positive
5,"probabl all-tim favorit movie , stori selfless...",positive
6,sure would like see resurrect date seahunt ser...,positive
7,"thi show wa amazing , fresh &amp; innov idea 7...",negative
8,encourag posit comment thi film wa look forwar...,negative
9,like origin gut wrench laughter like thi movie...,positive


**Train Test Split**

In [None]:
pip install scikit-learn

In [24]:
from sklearn.model_selection import train_test_split

X,y = data['review'], data['sentiment']

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [26]:

#train dataset

#train_reviews_data=data.review[:30000]


In [27]:
#test dataset

#test_reviews_data=data.review[30000:]

**Bag of words**

In [28]:
#Count vectorizer for bag of words
cv=CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3))
#transformed train reviews
cv_train=cv.fit_transform(X_train)
#transformed test reviews
cv_test=cv.transform(X_test)

print('BOW_cv_train:',cv_train.shape)
print('BOW_cv_test:',cv_test.shape)
#vocab=cv.get_feature_names()-toget feature names

BOW_cv_train: (35000, 5662131)
BOW_cv_test: (15000, 5662131)


**TF-IDF**

In [29]:
#Tfidf vectorizer
tf=TfidfVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(1,3))
#transformed train reviews
tf_train=tf.fit_transform(X_train)
#transformed test reviews
tf_test=tf.transform(X_test)
print('Tfidf_train:',tf_train.shape)
print('Tfidf_test:',tf_test.shape)

Tfidf_train: (35000, 5662131)
Tfidf_test: (15000, 5662131)


**Label Encoding**

In [30]:
#labeling the sentient data
label=LabelBinarizer()
#transformed sentiment data
sentiment_data=label.fit_transform(data['sentiment'])
print(sentiment_data.shape)

(50000, 1)


In [31]:
#train_data=data.sentiment[:30000]
#test_data=data.sentiment[30000:]

In [32]:
#training the model
logistic=LogisticRegression(penalty='l2',max_iter=100,C=1,random_state=42)
#Fitting the model for Bag of words
lr_bow=logistic.fit(cv_train,y_train)
print(lr_bow)
#Fitting the model for tfidf features
lr_tfidf=logistic.fit(tf_train,y_train)
print(lr_tfidf)

LogisticRegression(C=1, random_state=42)
LogisticRegression(C=1, random_state=42)


In [33]:
#Predicting the model for bag of words
lr_bow_predict=logistic.predict(cv_test)
print(lr_bow_predict)

['positive' 'negative' 'negative' ... 'positive' 'negative' 'positive']


In [34]:
##Predicting the model for tfidf features
lr_tfidf_predict=logistic.predict(tf_test)
print(lr_tfidf_predict)

['positive' 'negative' 'negative' ... 'positive' 'negative' 'negative']


In [35]:
#Accuracy score for bag of words
lr_bow_score=accuracy_score(y_test,lr_bow_predict)
print("lr_bow_score :",lr_bow_score)

lr_bow_score : 0.7416666666666667


In [36]:
#Accuracy score for tfidf features
lr_tfidf_score=accuracy_score(y_test,lr_tfidf_predict)
print("lr_tfidf_score :",lr_tfidf_score)

lr_tfidf_score : 0.7402
