In [1]:
# Import Required libraries
import pandas as pd

In [2]:
df_review = pd.read_csv('imdb_labelled.txt', sep='\t', names = ['Review', 'Label'])

In [3]:
df_review.head(10)

Unnamed: 0,Review,Label
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1
5,"The rest of the movie lacks art, charm, meanin...",0
6,Wasted two hours.,0
7,Saw the movie today and thought it was a good ...,1
8,A bit predictable.,0
9,Loved the casting of Jimmy Buffet as the scien...,1


In [4]:
# view more information about the setiment data using describe method
df_review.describe()

Unnamed: 0,Label
count,748.0
mean,0.516043
std,0.500077
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [5]:
df_review.shape  # It has 748 reviews

(748, 2)

In [6]:
df_review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  748 non-null    object
 1   Label   748 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 11.8+ KB


In [7]:
# view data using group by and describe method
df_review.groupby('Label').describe()

Unnamed: 0_level_0,Review,Review,Review,Review
Unnamed: 0_level_1,count,unique,top,freq
Label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,362,361,Not recommended.,2
1,386,384,Definitely worth checking out.,2


In [8]:
df_review.groupby('Label').size()

Label
0    362
1    386
dtype: int64

In [9]:
#view first 
df_review['Review'].iloc[0]

'A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  '

In [10]:
# Start text processing
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
vect = CountVectorizer()

In [12]:
# define a function to get rid of stopwords present in the messages
def message_text_process(mess):
    # Check characters to see if there are punctuations
    no_punctuation = [char for char in mess if char not in string.punctuation]
    # now form the sentence.
    no_punctuation = ''.join(no_punctuation)
    # Now eliminate any stopwords
    return [word for word in no_punctuation.split() if word.lower() not in stopwords.words('english')]

In [13]:
# bag of words by applying the function and fit the data (comment) into it
import string
from nltk.corpus import stopwords
bag_of_words = CountVectorizer(analyzer=message_text_process).fit(df_review['Review'])

In [14]:
bag_of_words

CountVectorizer(analyzer=<function message_text_process at 0x0000000FF6C29948>,
                binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [15]:
# Apply Transform method
review_bagOfWords = bag_of_words.transform(df_review['Review'])

In [16]:
print(review_bagOfWords)

  (0, 808)	1
  (0, 1350)	1
  (0, 1377)	1
  (0, 2066)	1
  (0, 2153)	1
  (0, 2723)	1
  (0, 3250)	1
  (1, 889)	1
  (1, 1068)	1
  (1, 1608)	1
  (1, 1741)	1
  (1, 2037)	1
  (1, 2183)	1
  (1, 2862)	1
  (1, 3143)	1
  (2, 87)	1
  (2, 774)	1
  (2, 817)	1
  (2, 837)	1
  (2, 866)	1
  (2, 931)	1
  (2, 956)	1
  (2, 1021)	1
  (2, 1111)	1
  (2, 1334)	1
  :	:
  (743, 389)	1
  (743, 421)	1
  (743, 982)	1
  (743, 1119)	1
  (743, 1712)	1
  (743, 2883)	1
  (743, 3164)	1
  (744, 702)	1
  (744, 1587)	1
  (744, 2037)	1
  (744, 2413)	1
  (744, 2495)	1
  (744, 2610)	1
  (744, 3125)	1
  (744, 3215)	1
  (745, 1418)	1
  (745, 3212)	1
  (746, 254)	1
  (746, 911)	1
  (747, 1822)	1
  (747, 1891)	1
  (747, 1895)	1
  (747, 2135)	1
  (747, 2247)	1
  (747, 3158)	1


In [17]:
# apply tfidf transformer and fit the bag of words into it (transformed version)
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(review_bagOfWords)

In [18]:
# print shape of the tfidf 
review_tfidf = tfidf_transformer.transform(review_bagOfWords)
review_tfidf.shape

(748, 3259)

In [19]:
#choose naive Bayes model to detect and fit the tfidf data into it
from sklearn.naive_bayes import MultinomialNB
sentiment_detection_model = MultinomialNB().fit(review_tfidf,df_review['Label'])

In [20]:
# check model for the predicted  and expected value say for comment# 1 and comment#5
review = df_review['Review'][4]
bag_of_words_for_review = bag_of_words.transform([review])
tfidf = tfidf_transformer.transform(bag_of_words_for_review)

print ('predicted sentiment label ', sentiment_detection_model.predict(tfidf)[0])
print ('expected sentiment label', df_review.Label[4])

predicted sentiment label  1
expected sentiment label 1


# Thank You