# Proyecto 7 Bootcamp de Ciencia de Datos e Inteligencia Artificial

## Análisis y preprocesamiento de datos

In [1]:
import pandas as pd

# Cargar el conjunto de datos
df = pd.read_csv('googleplaystore_user_reviews.csv')

# Visualizar las primeras filas del dataframe
print(df.head())

# Información del dataframe
print(df.info())

# Verificar valores faltantes
print(df.isnull().sum())

# Eliminar filas con valores faltantes
df.dropna(inplace=True)

# Filtrar solo las columnas necesarias (Texto de la reseña y Sentimiento)
df = df[['Translated_Review', 'Sentiment']]

# Visualizar las primeras filas después del filtrado
print(df.head())


                     App                                  Translated_Review  \
0  10 Best Foods for You  I like eat delicious food. That's I'm cooking ...   
1  10 Best Foods for You    This help eating healthy exercise regular basis   
2  10 Best Foods for You                                                NaN   
3  10 Best Foods for You         Works great especially going grocery store   
4  10 Best Foods for You                                       Best idea us   

  Sentiment  Sentiment_Polarity  Sentiment_Subjectivity  
0  Positive                1.00                0.533333  
1  Positive                0.25                0.288462  
2       NaN                 NaN                     NaN  
3  Positive                0.40                0.875000  
4  Positive                1.00                0.300000  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64295 entries, 0 to 64294
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------     

## Preprocesamiento de texto

In [6]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Descargar recursos de NLTK
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Función para preprocesar el texto
def preprocess_text(text):
    # Convertir texto a minúsculas
    text = text.lower()
    # Eliminar caracteres no alfabéticos
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenizar el texto
    tokens = word_tokenize(text)
    # Eliminar palabras vacías (stopwords)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lematizar las palabras
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Unir tokens en una sola cadena
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

# Aplicar la función de preprocesamiento al conjunto de datos
df['Preprocessed_Review'] = df['Translated_Review'].apply(preprocess_text)



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nrubi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nrubi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nrubi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Entrenamiento del modelo

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Dividir los datos en conjunto de entrenamiento y prueba
X = df['Preprocessed_Review']
y = df['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorizar el texto usando TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Entrenar un clasificador Naive Bayes
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_vect, y_train)

# Realizar predicciones
y_pred = nb_classifier.predict(X_test_vect)

# Evaluar el modelo
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

    Negative       0.90      0.41      0.57      1653
     Neutral       0.85      0.09      0.17      1049
    Positive       0.71      0.99      0.83      4784

    accuracy                           0.74      7486
   macro avg       0.82      0.50      0.52      7486
weighted avg       0.77      0.74      0.68      7486



In [30]:
# Prueba del modelo Naive Bayes
#---------------------------------------------------
test_text = "This app is amazing! It works perfectly and has great features."

# Preprocesar el texto de prueba
preprocessed_test_text = preprocess_text(test_text)

# Vectorizar el texto de prueba
test_text_vectorized = vectorizer.transform([preprocessed_test_text])

# Realizar la predicción
predicted_label = nb_classifier.predict(test_text_vectorized)[0]

# Mapear las etiquetas predichas a los sentimientos
sentiment_mapping = {
    'Negative': 'Negativo',
    'Positive': 'Positivo',
    'Neutral': 'Neutral'
}

predicted_sentiment = sentiment_mapping.get(predicted_label, "Desconocido")
print("Sentimiento predicho para '{}': {}".format(test_text, predicted_sentiment))

# Calcular la probabilidad de cada clase para el texto de prueba
probabilities = nb_classifier.predict_proba(test_text_vectorized)[0]

# Obtener la probabilidad de la clase "Positive"
positive_probability = probabilities[nb_classifier.classes_ == "Positive"][0]

# Imprimir la probabilidad de la clase "Positive"
print("Probabilidad de la clase 'Positive': {:.2%}".format(positive_probability))

Sentimiento predicho para 'This app is amazing! It works perfectly and has great features.': Positivo
Probabilidad de la clase 'Positive': 98.30%


In [9]:
import joblib

joblib.dump(nb_classifier, 'sentiment_analysis_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']