In [26]:
import pandas as pd
import re

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

import joblib

import mysql.connector

import sys

import streamlit as st

import matplotlib.pyplot as plt

import numpy as np

from textblob import TextBlob

In [4]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marko\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
df = pd.read_parquet("resultado.parquet",engine='fastparquet')

In [3]:
df.head(10)

Unnamed: 0,business_id,user_id,text,compliment_count,stemmed_content,sentiment_analysis
0,CF33F8-E6oudUQ46HnavjQ,ZnGIEODjnJ7MlvxCSNaiZw,Love sonic but orders are constantly wrong...,0,love sonic order constantli wrong,1
1,CF33F8-E6oudUQ46HnavjQ,AsBH7jQhHNnrQJ5XJhcnrw,Foods always been good. Shakes r delicious!,0,food alway good shake r delici,2
2,ROeacJQwBeh05Rqg7F6TCg,UV8Fvw2QTepwFiLDWRn6Ug,They're open on Sunday's until 9 pm!,0,open sunday pm,1
3,ROeacJQwBeh05Rqg7F6TCg,6zC6DNs9aHk5V18KWWjGJg,"Small place, very comfy",0,small place comfi,0
4,ROeacJQwBeh05Rqg7F6TCg,mY0piA_DiSAmOT-OGvU-yA,This is now my favorite place. Third trip sinc...,0,favorit place third trip sinc open earli decem...,2
5,ROeacJQwBeh05Rqg7F6TCg,U3iV0iBDqBx-QlJJYwgTEQ,"Very delicious, fresh, fast and a good price!",0,delici fresh fast good price,2
6,ROeacJQwBeh05Rqg7F6TCg,uF5zU6jOd2eV7lg4HR9y-A,Excellent place to eat\nThe serve was great,0,excel place eat serv great,2
7,ROeacJQwBeh05Rqg7F6TCg,LwuHCmFSeFhCGeschjwmnQ,Cash only and great friendly service. Eat in a...,0,cash great friendli servic eat take option avail,2
8,ROeacJQwBeh05Rqg7F6TCg,X_EP3kbaQpqTFtI10oAg5Q,I saw on several reviews and the description t...,0,saw sever review descript place cash went toni...,1
9,ROeacJQwBeh05Rqg7F6TCg,YSI6ca4Wc50xfQz3pDgAZA,"One dish, various options. Simple but good. Ve...",0,one dish variou option simpl good good,2


In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 251444 entries, 0 to 251443
Data columns (total 4 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   business_id       251444 non-null  object
 1   user_id           251444 non-null  object
 2   text              251444 non-null  object
 3   compliment_count  251444 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 7.7+ MB


In [41]:
df['text'].astype('str').dtypes

dtype('O')

# Revision datos

In [7]:
df['text'].isnull().sum()

0

In [8]:
df['text'].isna().sum()

0

In [9]:
df['text'].duplicated().sum()

11505

In [11]:
df = df.drop_duplicates()

In [12]:
df['text'].duplicated().sum()

10929

# ML

##### stemming

In [4]:
port_stem = PorterStemmer()

In [47]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)

    return stemmed_content

In [48]:
df['stemmed_content'] = df['text'].apply(stemming)

##### analisis sentimiento

In [51]:
def analizar_polaridad(review):
    analysis = TextBlob(review)
    polaridad = analysis.sentiment.polarity
    if polaridad < -0.1:
        return 0 #review negativa
    elif polaridad > 0.1:
        return 2 #review positiva
    else:
        return 1 #review neutra

In [52]:
df['sentiment_analysis'] = df['stemmed_content'].apply(analizar_polaridad)

In [72]:
df.to_parquet('resultado.parquet',engine = 'fastparquet')

##### Modelo

In [None]:
cnx = mysql.connector.connect(user= 'sqlhenry',password= 'soyhenry',host='35.247.37.77',database='sqlhenry')

In [6]:
X = df['stemmed_content']
Y = df['sentiment_analysis']

In [7]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=2)

In [8]:
vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [9]:
model = LinearRegression()

In [10]:
model.fit(X_train,Y_train)

In [11]:
Y_pred_test = model.predict(X_test)
Y_pred_train = model.predict(X_train)

In [13]:
Y_pred_train

array([1.2779951 , 1.10568962, 1.33141578, ..., 1.04306265, 1.08642252,
       1.19669693])

##### Metricas

In [27]:
coeficiente = model.coef_[0]
coeficiente

-0.2862461754880208

In [30]:
mean_squeared = mean_squared_error(Y_train,Y_pred_train)
mean_squeared

0.09857970412377445

In [34]:
r2 = r2_score(Y_train,Y_pred_train)
r2

0.7401132809928391

In [11]:
joblib.dump(model, 'modelo_regresion.pkl')

['modelo_regresion.pkl']