# Explore here

It's recommended to use this notebook for exploration purposes.

For example: 

1. You could import the CSV generated by python into your notebook and explore it.
2. You could connect to your database using `pandas.read_sql` from this notebook and explore it.

In [34]:
# instrucciones

# Este es un proyecto simple que usa Naive Bayes Classifier y Scikit-learn para crear un clasificador de reseñas de la tienda Google Play (Análisis de sentimiento) en Python. 
# Clasificará las opiniones de los usuarios como buenas o malas.  
# La técnica de clasificación Naive Bayes es una tarea de clasificación simple y poderosa en el aprendizaje automático. 
# En este conjunto de datos, usamos las 23 aplicaciones móviles más populares y solo dos columnas.

In [35]:
# Importo las librerias

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from dotenv import load_dotenv
from sqlalchemy import create_engine
import os
from sklearn.naive_bayes import MultinomialNB


In [36]:
#load the .env file variables
load_dotenv()
connection_string = os.getenv('DATABASE_URL')
#print(connection_string)

In [37]:
df_raw = pd.read_csv('https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews_dataset.csv', sep=',')

In [38]:
df_raw.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [39]:
df_raw.sample(10)

Unnamed: 0,package_name,review,polarity
70,com.twitter.android,poor customer service the heart button crashe...,0
553,com.dropbox.android,5-stars.! this app has saved my life on multi...,1
453,com.whatsapp,very nice app best feature is all friends get...,1
532,com.dropbox.android,"cool app except the notifications freeze, so...",0
499,com.Slack,perfect! very close to using the desktop cli...,1
661,com.hamrokeyboard,ââââbestââââ hamro nepali key...,1
541,com.dropbox.android,terrible worst app ever 45 mins trying to bac...,0
534,com.dropbox.android,i love dropbox...but.. i wish that the inter...,1
113,com.linkedin.android,groups??? ** edit: i changed my rating from o...,1
234,com.supercell.clashofclans,personal break don't mind the break necessari...,0


In [40]:
#Paso 1:

# Tenemos tres columnas: nombre del paquete, revisión y polaridad (0 = malo, 1 = bueno) 
# Preprocesar los datos eliminando la columna del nombre del paquete y poniendo todas las revisiones en minúsculas.

In [41]:
# remuevo la columna con el nombre del paquete

df_new = df_raw.drop('package_name', axis=1)

In [42]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   review    891 non-null    object
 1   polarity  891 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 14.0+ KB


In [43]:
df_new.sample(10) # se observan algunas cadenas de texto con caracteres

Unnamed: 0,review,polarity
358,đ u can not send vids on some android devi...,0
328,big problem new update showing all contact an...,0
639,best browser for android if you want a browse...,1
346,"excellent as long as you have good service, i...",1
173,new theme its a fun game.......its great to s...,1
650,malayalam font overlapping malayalam fonts ar...,0
190,no artifacts since 1 months artifacts stopped...,0
427,i can't see how to change the nicknames?! my ...,0
92,good app but......... why do post come up in ...,0
256,bookmarking is no good i used to be able to p...,0


In [44]:
# cambio todos los comentarios a minuscula

df_new['review'] = df_new['review'].str.strip().str.lower()
df_new

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0
...,...,...
886,loved it i loooooooooooooovvved it because it ...,1
887,all time legendary game the birthday party lev...,1
888,ads are way to heavy listen to the bad reviews...,0
889,fun works perfectly well. ads aren't as annoyi...,1


In [45]:
# Paso 2:

# Separe el objetivo de la característica y divida sus datos.

In [46]:
# Divido el data set 
X = df_new['review']
y = df_new['polarity']

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X,y, stratify=y, test_size=0.25, random_state=50)

In [48]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((668,), (668,), (223,), (223,))

In [49]:
# Paso 3:

# Vectorice sus características y use Naive Bayes para clasificar las revisiones como buenas o malas. 
# Esta vez no nos centraremos en hiperafinar nuestro modelo. 
# Este fue un proyecto de introducción al análisis de sentimientos usando Naive

In [50]:
vector_text = CountVectorizer(stop_words='english')
X_train = vector_text.fit_transform(X_train).toarray()
X_test = vector_text.transform(X_test).toarray()

In [51]:
# Creo el modelo Naive bayes, usare multinomial, porque tiene la posibilidad de 
# mejorar el objetivo osea la precision del modelo cuando trabajamos con datos desequilibrados, 
# Reduce el sesgo, igual la proporcion de comentarios postivos y negativos es relativamente
# equilibrada :) 

In [52]:
df_new['polarity'].value_counts()

0    584
1    307
Name: polarity, dtype: int64

In [53]:
model = MultinomialNB()
model.fit(X_train, y_train)

In [54]:
y_pred = model.predict(X_train)

In [55]:
y_pred.shape

(668,)

In [56]:
X_train.shape

(668, 3187)

In [57]:
(y_train == y_pred).sum()

640

In [58]:
# Quiere decir que el modelo predijo acerto 640 veces de 891
640/891

0.7182940516273849

In [59]:
from sklearn import metrics

In [60]:
# Veamos la accuracy, precision y recall

print('Accuracy:',metrics.accuracy_score(y_test, model.predict(X_test)))
print('Precision:',metrics.precision_score(y_test, model.predict(X_test), average=None))
print('Recall:',metrics.recall_score(y_test, model.predict(X_test), average=None))

Accuracy: 0.8340807174887892
Precision: [0.83850932 0.82258065]
Recall: [0.92465753 0.66233766]


In [61]:
# Hago un par de predicciones para verificar que esta clasificando bien

In [62]:
model.predict(vector_text.transform(['fun works perfectly well. ads arent as annoyi']))

array([1])

In [63]:
model.predict(vector_text.transform(['great but... lags waay too much. i always end']))


array([0])

In [64]:
# Paso 4:

# Utilice app.py para crear su canalización.

# Guarde su modelo de clasificación naive bayes en la carpeta 'modelos'.



In [65]:
import pickle

Naive_Bayes = 'NaiveB_model.sav'
pickle.dump(model, open(Naive_Bayes,'wb'))