# Naive Bayes Project Tutorial

In [5]:
#!pip install -r ../requirements.txt

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

from sklearn import metrics


import unicodedata
import re

## Naive sentiment analysis

This is a simple project that uses a Naive Bayes classifier and scikit-learn to build a sentiment classifier for Google Play Store reviews in Python. You will categorize user reviews as good or bad. Naive Bayes is a simple yet powerful classification technique in machine learning. The dataset covers the 23 most popular mobile apps, and only two of its columns (review and polarity) are used for modelling.

In [7]:
# Location of the Play Store reviews CSV in the 4GeeksAcademy tutorial repository.
url = 'https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews_dataset.csv'
# Read the CSV straight from the URL, so this cell needs network access.
df = pd.read_csv(url)

In [8]:
# Column names, dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB


In [9]:
# Ten random raw rows; the fixed seed keeps the sample reproducible.
df.sample(n=10, random_state=10)

Unnamed: 0,package_name,review,polarity
590,com.evernote,ătoo slow loading webclips!!! ăno pdf su...,0
131,com.king.candycrushsaga,lost power ups switched phones and lost my po...,0
628,com.uc.browser.en,update few things.... uc mini is very speed f...,1
195,com.imangi.templerun2,nice game.... its simply amazing...but i woul...,1
230,com.supercell.clashofclans,new update killed the game new matching for a...,0
646,com.uc.browser.en,great! keep it up! i've been using uc mini fo...,1
75,com.twitter.android,ads placement is anoying we know ads was one ...,0
586,com.evernote,widget sabotaged this app functions ok as a n...,1
569,jabanaki.todo.todoly,works great! i wish it supported expandable ...,1
287,com.tencent.mm,not letting me log in! i try to log into my a...,0


In [10]:
# First five raw rows.
df.head(n=5)

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [11]:
# Last five raw rows.
df.tail(n=5)

Unnamed: 0,package_name,review,polarity
886,com.rovio.angrybirds,loved it i loooooooooooooovvved it because it...,1
887,com.rovio.angrybirds,all time legendary game the birthday party le...,1
888,com.rovio.angrybirds,ads are way to heavy listen to the bad review...,0
889,com.rovio.angrybirds,fun works perfectly well. ads aren't as annoy...,1
890,com.rovio.angrybirds,they're everywhere i see angry birds everywhe...,1


## Step 1:

We have three columns: package name, review and polarity (0 = bad, 1 = good). Preprocess the data by eliminating the package name column and putting all reviews in lower case.

In [12]:
def normalize_str(text_string):
    """Strip accents and every other non-ASCII character from a string.

    The text is decomposed with Unicode NFD, which splits an accented letter
    into its base letter plus combining marks. Encoding to ASCII with
    ``errors="ignore"`` then drops those marks along with any other
    non-ASCII character.

    Parameters
    ----------
    text_string : str or None
        Text to normalize. Non-string values are returned unchanged; this
        covers ``None`` and the float ``NaN`` that pandas uses for missing
        text, which previously raised ``TypeError``.

    Returns
    -------
    str
        The ASCII-only text, or the original value when it is not a string.
    """
    # Guard on the type rather than on None: a missing value coming out of a
    # pandas column is a float NaN, which unicodedata.normalize rejects.
    if not isinstance(text_string, str):
        return text_string

    decomposed = unicodedata.normalize("NFD", text_string)
    return decomposed.encode("ascii", "ignore").decode("ascii")
    

In [13]:
def preprocess(df):
    """Return a cleaned copy of the reviews frame, ready for vectorization.

    Steps applied to the ``review`` column, in order:

    1. strip leading/trailing whitespace and lowercase the text;
    2. remove the literal symbols ``!``, ``,`` and ``&``;
    3. apply Unicode NFKC normalization, then drop accents and any other
       non-ASCII character (NFD decomposition + ASCII encode with
       ``errors="ignore"`` — the same transformation as ``normalize_str``);
    4. collapse a letter repeated three or more times in a row into a single
       occurrence ("loooove" -> "love").

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a text ``review`` column. A ``package_name`` column, when
        present, is dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's ``df`` is left untouched. Missing reviews
        stay missing instead of raising.
    """
    # errors="ignore" lets the function be re-applied to an already cleaned
    # frame (e.g. when the notebook cell is re-run) instead of raising KeyError.
    cleaned = df.drop(columns='package_name', errors='ignore')

    # Trim surrounding whitespace and lowercase.
    review = cleaned['review'].str.strip().str.lower()

    # Remove irrelevant symbols. regex=False makes the literal match explicit
    # rather than depending on the pandas-version-specific default.
    for symbol in ("!", ",", "&"):
        review = review.str.replace(symbol, "", regex=False)

    # Normalize text, then strip accents / non-ASCII characters. The vectorised
    # .str methods propagate missing values, unlike a per-row Python function.
    review = review.str.normalize("NFKC")
    review = (
        review.str.normalize("NFD")
        .str.encode("ascii", errors="ignore")
        .str.decode("ascii")
    )

    # Collapse stretched letters (Loooove, Haaaate): any letter followed by
    # two or more copies of itself is reduced to one.
    review = review.str.replace(r"([a-zA-Z])\1{2,}", r"\1", regex=True)

    cleaned['review'] = review
    return cleaned

In [14]:
# Rebinds `df` to the cleaned frame: from here on `df` holds only the
# `review` and `polarity` columns, and the raw frame is no longer reachable
# under this name.
df = preprocess(df)

In [15]:
# Spot-check one cleaned review, selected by position.
df.iloc[675]

review      loved it rzrl app rrlrrl rrsrzrlrrl r r rsrzrl...
polarity                                                    1
Name: 675, dtype: object

In [16]:
# Same seeded ten-row sample as before, now showing the cleaned reviews.
df.sample(n=10, random_state=10)

Unnamed: 0,review,polarity
590,atoo slow loading webclips ano pdf support at ...,0
131,lost power ups switched phones and lost my pow...,0
628,update few things.... uc mini is very speed fo...,1
195,nice game.... its simply amazing...but i would...,1
230,new update killed the game new matching for at...,0
646,great keep it up i've been using uc mini for q...,1
75,ads placement is anoying we know ads was one o...,0
586,widget sabotaged this app functions ok as a no...,1
569,works great i wish it supported expandable / c...,1
287,not letting me log in i try to log into my acc...,0


In [17]:
# First five rows after preprocessing.
df.head(n=5)

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,messenger issues ever since the last update in...,0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0


In [18]:
# Last five rows after preprocessing.
df.tail(n=5)

Unnamed: 0,review,polarity
886,loved it i loved it because it is incredible ...,1
887,all time legendary game the birthday party lev...,1
888,ads are way to heavy listen to the bad reviews...,0
889,fun works perfectly well. ads aren't as annoyi...,1
890,they're everywhere i see angry birds everywher...,1


## Step 2:

Separate target from feature, and split your data.

In [19]:
# Feature: the cleaned review text. Target: the 0/1 polarity label.
X, y = df['review'], df['polarity']

In [20]:
# Stratified hold-out split with a fixed seed; test_size=0.25 is the
# scikit-learn default, spelled out here for the reader.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=15,
)

## Step 3:

Vectorize your features and use Naive Bayes to classify the reviews as good or bad. We will not focus on hyperparameter tuning this time; this is an introductory project on sentiment analysis using Naive Bayes.

**Vectorize**

In [21]:
# Bag-of-words counts with English stop words removed. The vocabulary is
# learned from the training reviews only and then reused for the test reviews.
vec = CountVectorizer(stop_words='english')
# Keep the sparse matrices the vectorizer returns: MultinomialNB accepts
# sparse input directly, so densifying with .toarray() only costs memory.
X_train = vec.fit_transform(X_train)
X_test = vec.transform(X_test)

**Fit model**

In [22]:
# Multinomial Naive Bayes with scikit-learn's default settings (no tuning).
model = MultinomialNB()
# Fit on the vectorized training reviews and their polarity labels.
model.fit(X_train, y_train)

**Score**

In [23]:
# Accuracy on the held-out test split (what `model.score` reports for a classifier).
metrics.accuracy_score(y_test, model.predict(X_test))

0.8789237668161435

**Predictions**

In [24]:
# New text goes through the same fitted vectorizer before being classified.
model.predict(vec.transform(['This app is awesome']))

array([1])

1 = good

In [25]:
# Same check with a negative-sounding review.
model.predict(vec.transform(['This app is very slow']))

array([0])

0 = bad