# **Importing Basic Library**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

True

# **Importing Dataset**

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
!pwd

/content


In [5]:
!ls

drive  sample_data


In [14]:
!ls drive/MyDrive/dataset/FlipkartReviewDataset.csv

drive/MyDrive/dataset/FlipkartReviewDataset.csv


In [15]:
# Read the CSV file again, skipping malformed rows
df = pd.read_csv('/content/drive/MyDrive/dataset/FlipkartReviewDataset.csv')

# Print the DataFrame
print(df)

                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]


# **DATA PERPROCESSING**

Indexing & selecting

In [16]:
# view first five rows of dataset
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [17]:
df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [18]:
df.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [19]:
df.loc[1:,['review']]

Unnamed: 0,review
1,A wonderful little production. <br /><br />The...
2,I thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is..."
5,"Probably my all-time favorite movie, a story o..."
...,...
49995,I thought this movie did a down right good job...
49996,"Bad plot, bad dialogue, bad acting, idiotic di..."
49997,I am a Catholic taught in parochial elementary...
49998,I'm going to have to disagree with the previou...


In [20]:
df.loc[1:,['sentiment']]

Unnamed: 0,sentiment
1,positive
2,positive
3,negative
4,positive
5,positive
...,...
49995,positive
49996,negative
49997,negative
49998,negative


Attributes of Data

In [21]:
# In case of deep copy, a copy(deep = true) copied in the other object with no reference on the original i.e, any changes made to a copy of
# object will not be reflected in the original object
df_1 = df.copy(deep= True)

In [22]:
# To get the index(row label) of the dataframes
df_1.index

RangeIndex(start=0, stop=50000, step=1)

In [23]:
#To get column labels of the dataframes
df_1.columns

Index(['review', 'sentiment'], dtype='object')

In [24]:
df_1.shape

(50000, 2)

In [25]:
df_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [26]:
df_1.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49718,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [27]:
df_1.dtypes

Unnamed: 0,0
review,object
sentiment,object


# **Data Cleaning**

In [28]:
# finding missing values

missing_values = df_1.isnull().sum()
print(missing_values)

review       0
sentiment    0
dtype: int64


In [29]:
# lower casting the data
lower_case = df_1['review'].str.lower()
lower_case

Removing Punctuation

In [30]:
# Removing the punctuation
import string as st
remove_punctuation = st.punctuation
print(remove_punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [31]:
# removing puntuation using translate()

df_1['review'] = df_1['review'].str.translate(str.maketrans('', '', remove_punctuation))
df_1['review']

In [32]:
# Remove comma from a String

df_1['review'] = df_1['review'].str.replace(',', '')
df_1['review']

In [33]:
df_1

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production br br The filmin...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically theres a family where a little boy J...,negative
4,Petter Matteis Love in the Time of Money is a ...,positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,Bad plot bad dialogue bad acting idiotic direc...,negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,Im going to have to disagree with the previous...,negative


Stop Words Cleaning

In [34]:
from nltk.corpus import stopwords

nltk.download('stopwords')
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [35]:
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [36]:
from nltk.tokenize import word_tokenize
tokenized_words = word_tokenize(df_1['review'][0])
print(tokenized_words)

['One', 'of', 'the', 'other', 'reviewers', 'has', 'mentioned', 'that', 'after', 'watching', 'just', '1', 'Oz', 'episode', 'youll', 'be', 'hooked', 'They', 'are', 'right', 'as', 'this', 'is', 'exactly', 'what', 'happened', 'with', 'mebr', 'br', 'The', 'first', 'thing', 'that', 'struck', 'me', 'about', 'Oz', 'was', 'its', 'brutality', 'and', 'unflinching', 'scenes', 'of', 'violence', 'which', 'set', 'in', 'right', 'from', 'the', 'word', 'GO', 'Trust', 'me', 'this', 'is', 'not', 'a', 'show', 'for', 'the', 'faint', 'hearted', 'or', 'timid', 'This', 'show', 'pulls', 'no', 'punches', 'with', 'regards', 'to', 'drugs', 'sex', 'or', 'violence', 'Its', 'is', 'hardcore', 'in', 'the', 'classic', 'use', 'of', 'the', 'wordI', 'would', 'say', 'the', 'main', 'appeal', 'of', 'the', 'show', 'is', 'due', 'to', 'the', 'fact', 'that', 'it', 'goes', 'where', 'other', 'shows', 'wouldnt', 'dare', 'Forget', 'pretty', 'pictures', 'painted', 'for', 'mainstream', 'audiences', 'forget', 'charm', 'forget', 'romance

In [43]:
import string
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
stemmer = PorterStemmer()


stopword=set(stopwords.words('english'))

def clean(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text=" ".join(text)
    return text
df_1["sentiment"] = df_1["sentiment"].apply(clean)

In [45]:
ratings = df_1["sentiment"].value_counts()
numbers = ratings.index
quantity = ratings.values

import plotly.express as px
figure = px.pie(df,
             values=quantity,
             names=numbers,hole = 0.5)
figure.show()

# **Word  Cloud**

In [50]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')
sentiments = SentimentIntensityAnalyzer()
df_1["Positive"] = [sentiments.polarity_scores(i)["pos"] for i in df_1["sentiment"]]
df_1["Negative"] = [sentiments.polarity_scores(i)["neg"] for i in df_1["sentiment"]]
data = df_1[[ "sentiment", "Positive", "Negative"]]
print(data.head())

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


  sentiment  Positive  Negative
0     posit       0.0       0.0
1     posit       0.0       0.0
2     posit       0.0       0.0
3       neg       0.0       0.0
4     posit       0.0       0.0


In [52]:
x = sum(df_1["Positive"])
y = sum(df_1["Negative"])

def sentiment_score(a, b):
    if (a>b) :
        print("Positive 😊 ")
    else:
        print("Negative 🙂 ")
sentiment_score(x, y)

Negative 🙂 


In [53]:
print("Positive: ", x)
print("Negative: ", y)

Positive:  0.0
Negative:  0.0
