In [1]:
!pip install wordcloud



In [2]:
import numpy as np
import pandas as pd

import nltk
from nltk import corpus, tokenize
from nltk.corpus import stopwords

import re

from nltk.stem import PorterStemmer, WordNetLemmatizer, porter

from wordcloud import WordCloud, STOPWORDS

import string

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load Review Data

In [3]:
hs_reviewData = pd.read_csv("hotstar_reviews.csv")
hs_reviewData.head()

Unnamed: 0,ID,UserName,Created_Date,Reviews,Lower_Case_Reviews,Sentiment_Manual_BP,Sentiment_Manual,Review_Length,DataSource,Year,Month,Date,Sentiment_Polarity
0,1,,08-10-2017,Hh,hh,Negative,Negative,2,Google_PlayStore,2017,8,10,Neutral
1,2,,08-11-2017,No,no,Negative,Negative,2,Google_PlayStore,2017,8,11,Neutral
2,3,asadynwa,08-12-2017,@hotstar_helps during paymnt for premium subsc...,@hotstar_helps during paymnt for premium subsc...,Help,Negative,140,Twitter,2017,8,12,Negative
3,4,jineshroxx,08-11-2017,@hotstartweets I am currently on Jio network a...,@hotstartweets i am currently on jio network a...,Help,Negative,140,Twitter,2017,8,11,Positive
4,5,YaminiSachar,08-05-2017,@hotstartweets the episodes of Sarabhai vs Sar...,@hotstartweets the episodes of sarabhai vs sar...,Help,Negative,140,Twitter,2017,8,5,Neutral


In [4]:
hs_reviewData["Lower_Case_Reviews"]

0                                                      hh
1                                                      no
2       @hotstar_helps during paymnt for premium subsc...
3       @hotstartweets i am currently on jio network a...
4       @hotstartweets the episodes of sarabhai vs sar...
                              ...                        
5048    i loathe this application in so many levels fo...
5049    1 cannot keep track of progress if you watch v...
5050    1 miles below experience like amazon prime 2 e...
5051    you pay for premium your internet speed is at ...
5052    very very frustrating navigation this applicat...
Name: Lower_Case_Reviews, Length: 5053, dtype: object

In [5]:
hs_reviewData.shape

(5053, 13)

In [6]:
hs_reviewData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5053 entries, 0 to 5052
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   5053 non-null   int64 
 1   UserName             4331 non-null   object
 2   Created_Date         5053 non-null   object
 3   Reviews              5053 non-null   object
 4   Lower_Case_Reviews   5053 non-null   object
 5   Sentiment_Manual_BP  5053 non-null   object
 6   Sentiment_Manual     5053 non-null   object
 7   Review_Length        5053 non-null   int64 
 8   DataSource           5053 non-null   object
 9   Year                 5053 non-null   int64 
 10  Month                5053 non-null   int64 
 11  Date                 5053 non-null   int64 
 12  Sentiment_Polarity   5053 non-null   object
dtypes: int64(5), object(8)
memory usage: 513.3+ KB


In [7]:
hs_reviewData.Sentiment_Manual.value_counts()

Neutral     1738
Positive    1733
Negative    1582
Name: Sentiment_Manual, dtype: int64

In [8]:
hs_reviewData.Sentiment_Manual.value_counts()/hs_reviewData.Sentiment_Manual.size

Neutral     0.343954
Positive    0.342965
Negative    0.313081
Name: Sentiment_Manual, dtype: float64

In [9]:
hs_reviewData.DataSource.value_counts()/hs_reviewData.DataSource.size

Twitter             0.559272
Google_PlayStore    0.440728
Name: DataSource, dtype: float64

In [10]:
hs_reviewData.DataSource.size

5053

In [11]:
rdPivot = pd.pivot_table(hs_reviewData,
    index='Sentiment_Manual',
    columns='DataSource',
    values='ID',aggfunc='count')/hs_reviewData.DataSource.size

In [12]:
rdPivot

DataSource,Google_PlayStore,Twitter
Sentiment_Manual,Unnamed: 1_level_1,Unnamed: 2_level_1
Negative,0.12923,0.183851
Neutral,0.051257,0.292697
Positive,0.260241,0.082723


In [13]:
rdPivot['Total'] = rdPivot['Google_PlayStore']+rdPivot['Twitter']

In [14]:
rdPivot

DataSource,Google_PlayStore,Twitter,Total
Sentiment_Manual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Negative,0.12923,0.183851,0.313081
Neutral,0.051257,0.292697,0.343954
Positive,0.260241,0.082723,0.342965


In [15]:
hs_reviewData.head()

Unnamed: 0,ID,UserName,Created_Date,Reviews,Lower_Case_Reviews,Sentiment_Manual_BP,Sentiment_Manual,Review_Length,DataSource,Year,Month,Date,Sentiment_Polarity
0,1,,08-10-2017,Hh,hh,Negative,Negative,2,Google_PlayStore,2017,8,10,Neutral
1,2,,08-11-2017,No,no,Negative,Negative,2,Google_PlayStore,2017,8,11,Neutral
2,3,asadynwa,08-12-2017,@hotstar_helps during paymnt for premium subsc...,@hotstar_helps during paymnt for premium subsc...,Help,Negative,140,Twitter,2017,8,12,Negative
3,4,jineshroxx,08-11-2017,@hotstartweets I am currently on Jio network a...,@hotstartweets i am currently on jio network a...,Help,Negative,140,Twitter,2017,8,11,Positive
4,5,YaminiSachar,08-05-2017,@hotstartweets the episodes of Sarabhai vs Sar...,@hotstartweets the episodes of sarabhai vs sar...,Help,Negative,140,Twitter,2017,8,5,Neutral


# Data Cleansing

In [16]:
reviewData_clean1 = hs_reviewData[['Lower_Case_Reviews','DataSource','Sentiment_Manual']]

In [17]:
reviewData_clean1.columns = ['Reviews','Source','Sentiment']

In [18]:
reviewData_clean1

Unnamed: 0,Reviews,Source,Sentiment
0,hh,Google_PlayStore,Negative
1,no,Google_PlayStore,Negative
2,@hotstar_helps during paymnt for premium subsc...,Twitter,Negative
3,@hotstartweets i am currently on jio network a...,Twitter,Negative
4,@hotstartweets the episodes of sarabhai vs sar...,Twitter,Negative
...,...,...,...
5048,i loathe this application in so many levels fo...,Google_PlayStore,Negative
5049,1 cannot keep track of progress if you watch v...,Google_PlayStore,Negative
5050,1 miles below experience like amazon prime 2 e...,Google_PlayStore,Negative
5051,you pay for premium your internet speed is at ...,Google_PlayStore,Negative


In [19]:
reviewData_clean1 = pd.get_dummies(reviewData_clean1,columns=['Source'],drop_first=True)

In [20]:
reviewData_clean1

Unnamed: 0,Reviews,Sentiment,Source_Twitter
0,hh,Negative,0
1,no,Negative,0
2,@hotstar_helps during paymnt for premium subsc...,Negative,1
3,@hotstartweets i am currently on jio network a...,Negative,1
4,@hotstartweets the episodes of sarabhai vs sar...,Negative,1
...,...,...,...
5048,i loathe this application in so many levels fo...,Negative,0
5049,1 cannot keep track of progress if you watch v...,Negative,0
5050,1 miles below experience like amazon prime 2 e...,Negative,0
5051,you pay for premium your internet speed is at ...,Negative,0


In [21]:
nltk.download('stopwords')
stop_words = stopwords.words('english')
stop_words

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [22]:
punctuation = list(string.punctuation)
punctuation

['!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~']

In [23]:
#reviewData_clean2 = [re.sub(pattern='""[^a-zA-Z0-9]+[\s*]""', repl='', string=text) for text in reviewData_clean1.Reviews.map(str).values]  

In [24]:
#reviewData_clean2

In [25]:
re_pattern = r'@[a-zA-Z0-9_:]+|b[\'"]rt|[\d]+[a-zA-Z_+=\'?]+[\d]+|[a-zA-Z_*+-=]+[\d]+[a-zA-Z_*+-=]+[\d]+|https:+[a-zA-Z0-9/._+-=]+[&amp;]|rt|https:+[a-zA-Z0-9/._+-=]+[&amp;]|rt'

In [26]:
re_pattern

'@[a-zA-Z0-9_:]+|b[\\\'"]rt|[\\d]+[a-zA-Z_+=\\\'?]+[\\d]+|[a-zA-Z_*+-=]+[\\d]+[a-zA-Z_*+-=]+[\\d]+|https:+[a-zA-Z0-9/._+-=]+[&amp;]|rt|https:+[a-zA-Z0-9/._+-=]+[&amp;]|rt'

In [27]:
reviewData_clean2 = [re.sub(pattern=re_pattern, repl='', string=text) for text in reviewData_clean1.Reviews.map(str).values]  

In [28]:
reviewData_clean2[3]

' i am currently on jio network and would like to know whether i will be able to watch epl telecasted on star spos select hd1'

In [29]:
reviewData_clean2

['hh',
 'no',
 ' during paymnt for premium subscription the transaction failed twice but i have not received refund for one of the transaction',
 ' i am currently on jio network and would like to know whether i will be able to watch epl telecasted on star spos select hd1',
 ' the episodes of sarabhai vs sarabhai season 1 are not downloadable m not able to watch them offline please do smthng about it',
 ' not able 2 watch the latest episode of got on the app doesn t allow to take the screenshot of the error help to resolve asap',
 'please allow rupay or maestro payment gateways for premium membership i mean paytm works but thru debit cards would be great ',
 ' why today s epi of #lovekahaiintezaar nt available on available now it was in d morning now showing nt available due expiry',
 ' #hotstarfraud i paid for the subscription before 30 july havent received any cashback which was 100 specified for hdfc card',
 ' i have a premium accnt at hotstar but now it is showing tht i m not a prem

In [30]:
reviewData_clean3 = []

for review in reviewData_clean2:
    nltkStWords_free = " ".join([txt for txt in review.lower().split() if txt not in stop_words])
    wordcloudStWords_free = " ".join([txt for txt in nltkStWords_free.lower().split() if txt not in STOPWORDS])
    punc_free = " ".join([txt for txt in wordcloudStWords_free.lower().split() if txt not in punctuation])
    
    hashFree = punc_free.replace("#", "")
    
    reviewData_clean3.append(hashFree)

In [31]:
reviewData_clean3

['hh',
 '',
 'paymnt premium subscription transaction failed twice received refund one transaction',
 'currently jio network know whether able watch epl telecasted star spos select hd1',
 'episodes sarabhai vs sarabhai season 1 downloadable able watch offline please smthng',
 'able 2 watch latest episode got app allow take screenshot error help resolve asap',
 'please allow rupay maestro payment gateways premium membership mean paytm works thru debit cards great',
 'today epi lovekahaiintezaar nt available available morning showing nt available due expiry',
 'hotstarfraud paid subscription 30 july havent received cashback 100 specified hdfc card',
 'premium accnt hotstar showing tht premium member u pls chk',
 'seeing blank page terms amp conditions hdfc bank 100 cashback offer hotstar premium membership please help',
 'sir please allow us download videos ur app present option allow us dwnld mre videos due ltd space',
 'hi pl tab spos homepage isl bundesliga search team name stream pls

In [32]:
#remove unnecessary words and lemmatise

!pip install spacy
!python -m spacy download en_core_web_sm

nltk.download('wordnet')
nltk.download('omw-1.4')
import spacy

nlpSpcy = spacy.load("en_core_web_sm")  # Load English language model

# Create list of unimportant POS tags
unimportant_pos = ["DET", "CCONJ", "ADP", "PUNCT", "SPACE", "INTJ", "AUX"]

wd = WordNetLemmatizer()
reviewData_clean4 = []

for rev in reviewData_clean3:
    # Filter out unimportant words
    filtered_words = [token.text for token in nlpSpcy(rev) if token.pos_ not in unimportant_pos]

    # Join filtered words into a new string
    filtered_text = " ".join(filtered_words)

    cleanData = " ".join(wd.lemmatize(word) for word in filtered_text.split())
    reviewData_clean4.append(cleanData)

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [33]:
reviewData_clean4

['hh',
 '',
 'paymnt premium subscription transaction failed twice received refund one transaction',
 'currently jio network know whether able watch epl telecasted star spos select hd1',
 'episode sarabhai sarabhai season 1 downloadable able watch offline smthng',
 'able 2 watch latest episode got app allow take screenshot error help resolve asap',
 'allow rupay maestro payment gateway premium membership mean paytm work debit card great',
 'today epi lovekahaiintezaar nt available available morning showing nt available due expiry',
 'hotstarfraud paid subscription 30 july nt received cashback 100 specified hdfc card',
 'premium accnt hotstar showing tht premium member u pls chk',
 'seeing blank page term amp condition hdfc bank 100 cashback offer hotstar premium membership help',
 'sir allow u download video ur app present option allow u dwnld mre video due ltd space',
 'pl tab spos homepage isl bundesliga search team name stream look',
 'unable watch star spos select hd1 live hotstar 

In [34]:
reviewData_clean4[11:13]

['sir allow u download video ur app present option allow u dwnld mre video due ltd space',
 'pl tab spos homepage isl bundesliga search team name stream look']

In [35]:
reviewData_clean1['CleanReview'] = reviewData_clean4
reviewData_clean1

Unnamed: 0,Reviews,Sentiment,Source_Twitter,CleanReview
0,hh,Negative,0,hh
1,no,Negative,0,
2,@hotstar_helps during paymnt for premium subsc...,Negative,1,paymnt premium subscription transaction failed...
3,@hotstartweets i am currently on jio network a...,Negative,1,currently jio network know whether able watch ...
4,@hotstartweets the episodes of sarabhai vs sar...,Negative,1,episode sarabhai sarabhai season 1 downloadabl...
...,...,...,...,...
5048,i loathe this application in so many levels fo...,Negative,0,loathe application many level forget clunky wo...
5049,1 cannot keep track of progress if you watch v...,Negative,0,1 keep track progress watch video different de...
5050,1 miles below experience like amazon prime 2 e...,Negative,0,1 mile experience amazon prime 2 even premium ...
5051,you pay for premium your internet speed is at ...,Negative,0,pay premium internet speed 20mbps hotstar app ...


# Split the data into train (80%) and test(20%) from sample data

In [36]:
x_train, x_test,y_train,y_test = train_test_split(reviewData_clean1.CleanReview,reviewData_clean1.Sentiment,test_size=0.2,
    random_state=101)

# Vectorize the text data using Count Vectorizer

In [38]:
vectorizer = CountVectorizer(min_df = 5).fit(x_train)
x_train_vector = vectorizer.transform(x_train)
x_test_vector = vectorizer.transform(x_test)

In [40]:
print(vectorizer.get_feature_names_out())

['05' '07' '08' '09' '10' '100' '11' '12' '15' '17' '199' '1st' '20' '24'
 '25' '26' '27' '28' '29' '2nd' '30' '3rd' '40' '4th' '5th' '720p' '999'
 'aap' 'aarambh' 'ab' 'able' 'absolutely' 'abusive' 'access' 'account'
 'action' 'actor' 'actual' 'actually' 'ad' 'add' 'added' 'adni'
 'advaysinghraizada' 'adveisement' 'agree' 'ahead' 'ahora' 'air' 'aired'
 'airing' 'allow' 'almost' 'already' 'always' 'amazing' 'amazon' 'america'
 'amla' 'amp' 'android' 'annoying' 'anymore' 'anyone' 'anything' 'anytime'
 'anywhere' 'ap' 'app' 'application' 'apps' 'arnav' 'arnavsays' 'arshi'
 'arslei' 'asap' 'ask' 'asks' 'atleast' 'aug' 'august' 'aur' 'available'
 'award' 'awesome' 'awsm' 'b4' 'back' 'bad' 'bakwas' 'bandhan' 'barun'
 'barunsobti' 'bb' 'bc' 'bcoz' 'beautiful' 'become' 'becomes' 'best'
 'better' 'bhai' 'bhi' 'big' 'bigg' 'biggboss' 'biggbosstamil'
 'biggbosstelugu' 'bit' 'blaming' 'blank' 'block' 'bol' 'bollywood' 'bolt'
 'bos' 'bought' 'box' 'boxtv' 'boycott' 'boycottcopypastegulkhan'
 'boyc

In [41]:
print(vectorizer.get_feature_names_out().size)

872


In [42]:
pd.DataFrame(x_train_vector.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,862,863,864,865,866,867,868,869,870,871
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4037,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4038,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4039,0,0,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4040,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
x_test_vector.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

# Multinomial Naive Bayes Theorem

In [44]:
multNB = MultinomialNB().fit(x_train_vector,y_train)
multNB

MultinomialNB()

In [45]:
multNB.score(x_train_vector,y_train),multNB.score(x_test_vector,y_test)

(0.7835230084116774, 0.7230464886251237)

In [46]:
multNB.classes_

array(['Negative', 'Neutral', 'Positive'], dtype='<U8')

In [47]:
predict_train = multNB.predict(x_train_vector)
predict_test = multNB.predict(x_test_vector)

In [50]:
def getNum(data):
    dataNew = pd.DataFrame(data, columns = ['Sentiment'])
    dataNew['y_label'] = np.where(dataNew.Sentiment == "Neutral",0,np.where(dataNew.Sentiment == "Positive", 1, -1))
    return dataNew

In [51]:
y_train_new = getNum(y_train)
y_test_new = getNum(y_test)
predict_train_new = getNum(predict_train)
predict_test_new = getNum(predict_test)

In [52]:
y_train_new.shape,predict_train_new.shape,y_test_new.shape,predict_test_new.shape

((4042, 2), (4042, 2), (1011, 2), (1011, 2))

In [53]:
print(classification_report(y_train_new['y_label'],predict_train_new['y_label']))

              precision    recall  f1-score   support

          -1       0.77      0.80      0.78      1259
           0       0.83      0.67      0.74      1388
           1       0.76      0.88      0.82      1395

    accuracy                           0.78      4042
   macro avg       0.79      0.78      0.78      4042
weighted avg       0.79      0.78      0.78      4042



In [55]:
print(classification_report(y_test_new['y_label'],predict_test_new['y_label']))

              precision    recall  f1-score   support

          -1       0.69      0.71      0.70       323
           0       0.78      0.62      0.69       350
           1       0.71      0.85      0.77       338

    accuracy                           0.72      1011
   macro avg       0.73      0.72      0.72      1011
weighted avg       0.73      0.72      0.72      1011

