In [1]:
# import the libraries
import pandas as pd
import string
from nltk.corpus import stopwords

In [2]:
#read data from csv file 
df_sentiment = pd.read_csv('imdb_labelled.txt', sep = '\t', names = ['comment', 'label'])

In [3]:
df_sentiment.head()

Unnamed: 0,comment,label
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [4]:
# df_sentiment statistics describe method
df_sentiment.describe()

Unnamed: 0,label
count,748.0
mean,0.516043
std,0.500077
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [5]:
# info about dataframe using .info method
df_sentiment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  748 non-null    object
 1   label    748 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 11.8+ KB


In [6]:
df_sentiment.groupby('label').describe()

Unnamed: 0_level_0,comment,comment,comment,comment
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,362,361,Not recommended.,2
1,386,384,Definitely worth checking out.,2


In [7]:
#view data statistics using describe()
df_sentiment.describe()

Unnamed: 0,label
count,748.0
mean,0.516043
std,0.500077
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [8]:
#view columns of the dataset
df_sentiment.columns

Index(['comment', 'label'], dtype='object')

In [9]:
df_sentiment.tail()

Unnamed: 0,comment,label
743,I just got bored watching Jessice Lange take h...,0
744,"Unfortunately, any virtue in this film's produ...",0
745,"In a word, it is embarrassing.",0
746,Exceptionally bad!,0
747,All in all its an insult to one's intelligence...,0


In [10]:
#Count number of records
df_sentiment.shape

(748, 2)

In [11]:
#view datatypes
type(df_sentiment)

pandas.core.frame.DataFrame

In [12]:
# length of comments
df_sentiment ['comment_length'] = df_sentiment['comment'].apply(len)

In [13]:
# view df_sentiment with length of comment
df_sentiment.head()

Unnamed: 0,comment,label,comment_length
0,"A very, very, very slow-moving, aimless movie ...",0,87
1,Not sure who was more lost - the flat characte...,0,99
2,Attempting artiness with black & white and cle...,0,188
3,Very little music or anything to speak of.,0,44
4,The best scene in the movie was when Gerardo i...,1,108


In [14]:
# view length > 50 .iloc method
df_sentiment[df_sentiment['comment_length'] > 50]['comment'].iloc[0]

'A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  '

In [15]:
# process data using vectorization
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

In [16]:
# def function for remove punctuation, stopwords in our dataset
def comment_process(cmt):
    
    # check char for punctuation
    no_punctuation = [char for char in cmt if char not in string.punctuation]
    
    # join the sentence
    no_punctuation = ''.join(no_punctuation)
    
    # remove stopwords
    no_stopwords = [word for word in no_punctuation.split() if word.lower() not in stopwords.words('english')]
    
    # return no_punctuation, no_stopwords
    
    return no_stopwords

In [17]:
# check function is working or not.
comment_process("Hi! My Project is ongoing and I will done very well. ")

['Hi', 'Project', 'ongoing', 'done', 'well']

In [18]:
# check function is working or not.
comment_process("Hi! My Project is ongoing and I will done very well. ")

['Hi', 'Project', 'ongoing', 'done', 'well']

In [19]:
# bag of words and apply function on comment feature
bag_of_words = CountVectorizer(analyzer=comment_process)

In [20]:
# let's bag of words fit and transform on comment feature
bag_of_words.fit(df_sentiment['comment'])
comment_bag_of_words = bag_of_words.transform(df_sentiment['comment'])

In [21]:
# create tfidf transformer 
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()

In [22]:
#apply tfidf transformer and fit and transform the bag of words into it (transformed version)
tfidf.fit(comment_bag_of_words)
comment_tfidf = tfidf.transform(comment_bag_of_words)

In [23]:
# print tfidt comment shape
comment_tfidf.shape

(748, 3259)

In [24]:
#choose naive Bayes model to detect the spam and fit the tfidf data into it
from sklearn.naive_bayes import MultinomialNB
sentiment_detect_model = MultinomialNB()
sentiment_detect_model.fit(comment_tfidf, df_sentiment['label'])

MultinomialNB()

In [25]:
#check model for the predicted and expected value say for comment#1 and comment#5
comment = df_sentiment['comment'][9]
bag_of_words_comments = bag_of_words.transform([comment])
tfidf_comment = tfidf.transform(bag_of_words_comments)

In [26]:
print('Actual or Expected Comment', df_sentiment.label[9])
print('Pridicted Comment', sentiment_detect_model.predict(tfidf_comment)[0])

Actual or Expected Comment 1
Pridicted Comment 1


In [27]:
df_sentiment.head(10)

Unnamed: 0,comment,label,comment_length
0,"A very, very, very slow-moving, aimless movie ...",0,87
1,Not sure who was more lost - the flat characte...,0,99
2,Attempting artiness with black & white and cle...,0,188
3,Very little music or anything to speak of.,0,44
4,The best scene in the movie was when Gerardo i...,1,108
5,"The rest of the movie lacks art, charm, meanin...",0,114
6,Wasted two hours.,0,19
7,Saw the movie today and thought it was a good ...,1,79
8,A bit predictable.,0,20
9,Loved the casting of Jimmy Buffet as the scien...,1,59
