<a href="https://colab.research.google.com/github/mmilannaik/bostonhousepricing/blob/main/W31S3_sentiment_analysis_using_naive_bayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'imdb-dataset-of-50k-movie-reviews:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F134715%2F320111%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240911%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240911T060623Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D3c96a03abf168eaf217f615780167498edf08042a14de53190925feb034b609260d010b81f1aa77412ba8ce867c63dd73f91ad99d6e8b6f342bbbb1f5566acc3dd9ef489161cbe178ae73880063be13e7e89e7e3df099cefdfa12b69f1a0a218921fc0111db472a7692f67f8bc7cf0c6f1f4eeccdd8094163b67eacf4bf2a1b6ccccae84b0c7a5b530c97c9f60b5ca05a11cfa99bfb8fe7e2381fdc8d796e7b2a027efea6bafc5700d45af822ccb08583589a9117c04ba86830d29ab4377e4a46d74c4becf07009fa73f1ea9f86ab86a6959aa7e363b513bc572e1932d66da7a88b22fb55f45555120f694ed1f664e42389199103a21d714b3fbb5200cae35dc'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading imdb-dataset-of-50k-movie-reviews, 26962657 bytes compressed
Downloaded and uncompressed: imdb-dataset-of-50k-movie-reviews
Data source import complete.


In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


In [3]:
df=pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
# One review
df['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

## Text Cleaning

1. Sample 10000 rows
2. Remove html tags
3. Remove special characters
4. Converting every thing to lower case
5. Removing Stop words
6. Stemming

In [8]:
df=df.sample(10000)

In [9]:
df.shape

(10000, 2)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 36425 to 28379
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     10000 non-null  object
 1   sentiment  10000 non-null  object
dtypes: object(2)
memory usage: 234.4+ KB


In [9]:
df['sentiment'].replace({'positive':1,'negative':0},inplace=True)

In [11]:
df.head()

Unnamed: 0,review,sentiment
36425,Got this the other day from the Creators on DV...,positive
2498,This film breeches the fine line between satir...,negative
32009,My comments on this movie have been deleted tw...,negative
16196,"This is a Black and White film from France,<br...",positive
25547,Two Hands is a highly enjoyable Aussie crime c...,positive


In [12]:
import re
clean = re.compile('<.*?>')
re.sub(clean, '', df.iloc[2].review)

"My comments on this movie have been deleted twice, which i find pretty offending, since i am making an effort to judge this movie for other people. Please be tolerant of other people's opinion. Obviously writing in the spirit of Nietzsches works is not understood, so ill change my comment completely.I think this is a really bad movie for several reasons.Subject: one should be very careful in making a movie about a philosopher that is even today not understood by the masses and amongst peers brings out passionate discussions. One thing philosophers do agree on is that Nietzsche was a great thinker. So making a movie about his life, which obviously includes his 'ideas' is a thing one should be extremely careful with, or preferably, don't do at all. Wisdom starts with knowing what you don't know. One might think this is not a review of the movie itself, but the movie is not about an imaginary character, it is about the life of someone who actually lived and had/has great influence on the

In [13]:
# Function to clean html tags
def clean_html(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

In [14]:
df['review']=df['review'].apply(clean_html)

In [15]:
# converting everything to lower

def convert_lower(text):
    return text.lower()

In [16]:
df['review']=df['review'].apply(convert_lower)

In [17]:
# function to remove special characters

def remove_special(text):
    x=''

    for i in text:
        if i.isalnum():
            x=x+i
        else:
            x=x + ' '
    return x

In [18]:
remove_special(' th%e @ classic use of the word.it is called oz as that is the nickname given to the oswald maximum security state penitentary. it focuses mainly on emerald city, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. em city is home to many..aryans, muslims, gangstas, latinos, christians, italians, irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.i would say the main appeal of the show is due to the fact that it goes where other shows wouldnt dare. forget pretty pictures painted for mainstream audiences, f')

' th e   classic use of the word it is called oz as that is the nickname given to the oswald maximum security state penitentary  it focuses mainly on emerald city  an experimental section of the prison where all the cells have glass fronts and face inwards  so privacy is not high on the agenda  em city is home to many  aryans  muslims  gangstas  latinos  christians  italians  irish and more    so scuffles  death stares  dodgy dealings and shady agreements are never far away i would say the main appeal of the show is due to the fact that it goes where other shows wouldnt dare  forget pretty pictures painted for mainstream audiences  f'

In [19]:
df['review']=df['review'].apply(remove_special)

In [20]:
# Remove the stop words
import nltk

In [27]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [28]:
from nltk.corpus import stopwords

In [29]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [30]:
df

Unnamed: 0,review,sentiment
36425,got this the other day from the creators on dv...,positive
2498,this film breeches the fine line between satir...,negative
32009,my comments on this movie have been deleted tw...,negative
16196,this is a black and white film from france sim...,positive
25547,two hands is a highly enjoyable aussie crime c...,positive
...,...,...
35870,i cannot stop saying how much i loved this mov...,positive
14425,why do they keep making trash like this becau...,negative
5456,this film s premise is so simple and obvious t...,negative
34090,the film transported everyone back to october ...,positive


In [31]:

def remove_stopwords(text):
    x=[]
    for i in text.split():

        if i not in stopwords.words('english'):
            x.append(i)
    y=x[:]
    x.clear()
    return y

In [32]:
df['review']=df['review'].apply(remove_stopwords)

In [33]:
df

Unnamed: 0,review,sentiment
36425,"[got, day, creators, dvd, saw, advertised, fre...",positive
2498,"[film, breeches, fine, line, satire, silliness...",negative
32009,"[comments, movie, deleted, twice, find, pretty...",negative
16196,"[black, white, film, france, simple, plot, gan...",positive
25547,"[two, hands, highly, enjoyable, aussie, crime,...",positive
...,...,...
35870,"[cannot, stop, saying, much, loved, movie, mov...",positive
14425,"[keep, making, trash, like, makes, money, eras...",negative
5456,"[film, premise, simple, obvious, texas, millio...",negative
34090,"[film, transported, everyone, back, october, 2...",positive


In [34]:
# Perform stemming

from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [35]:
y=[]
def stem_words(text):
    for i in text:
        y.append(ps.stem(i))
    z=y[:]
    y.clear()
    return z


In [None]:
stem_words(['I','loved','loving','it'])

In [None]:
df['review']=df['review'].apply(stem_words)

In [None]:
df

In [None]:
# Join back

def join_back(list_input):
    return " ".join(list_input)


In [None]:
df['review']=df['review'].apply(join_back)

In [None]:
df['review']

In [None]:
X=df.iloc[:,0:1].values

In [None]:
X.shape

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=2500)

In [None]:
X=cv.fit_transform(df['review']).toarray()

In [None]:
X.shape

In [None]:
X[0].mean()

In [None]:
y=df.iloc[:,-1].values

In [None]:
y.shape

In [None]:
# X,y
# Training set
# Test Set(Already know the result)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
y_train.shape

In [None]:
y_test.shape

In [None]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB

In [None]:
clf1=GaussianNB()
clf2=MultinomialNB()
clf3=BernoulliNB()

In [None]:
clf1.fit(X_train,y_train)
clf2.fit(X_train,y_train)
clf3.fit(X_train,y_train)

In [None]:
y_pred1=clf1.predict(X_test)
y_pred2=clf2.predict(X_test)
y_pred3=clf3.predict(X_test)

In [None]:
y_test.shape

In [None]:
y_pred1.shape

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
print("Gaussian",accuracy_score(y_test,y_pred1))
print("Multinomial",accuracy_score(y_test,y_pred2))
print("Bernaulli",accuracy_score(y_test,y_pred3))