In [1]:
import re
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from contractions import fix

# Fetch the NLTK data packages used by this notebook (only needed once per machine).
for recurso in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(recurso)
# Left as the cell's final expression so the cell still echoes the download status.
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to /Users/macbook/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/macbook/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/macbook/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/macbook/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

# NLP

## Análisis de Sentimiento

### user_reviews_final.parquet

In [2]:
# Path to the compressed, re-converted user-reviews file (Parquet format)
user_reviews_content = 'data/02_user_reviews_final.parquet'

# Load the Parquet file straight into a DataFrame
user_reviews_dataset = pd.read_parquet(user_reviews_content)

Token Lematizado 

Palabras con connotación positiva y negativa referidas a juegos
Regex para quitar emojis y caracteres especiales, de modo que el análisis produzca mejores resultados
Expandir las contracciones del inglés

In [3]:
# Game-specific sentiment vocabulary. Any of these words that is also an English
# stop word (such as 'no' and 'not') is kept, so its polarity signal survives
# stop-word removal.
_NEGATIVE_GAME_WORDS = (
    'bug', 'glitch', 'crash', 'slow', 'problem', 'unstable', 'poorly',
    'unbalanced', 'boring', 'monotonous', 'repetitive', 'challenging',
    'frustrating', 'cumbersome', 'incomplete', 'inconsistent', 'cheating',
    'dubious', 'disastrous', 'abandoned', 'no', 'not',
)
_POSITIVE_GAME_WORDS = (
    'exciting', 'immersive', 'engaging', 'innovative', 'fantastic',
    'amazing', 'rewarding', 'thrilling', 'captivating', 'satisfying',
    'exhilarating', 'enjoyable', 'spectacular', 'brilliant',
    'masterpiece', 'awesome', 'stellar', 'fun', 'unique',
)

# Lazily-built (stop_words, lemmatizer) pair shared by every preprocess_text call.
_preprocess_resources = None


def _get_preprocess_resources():
    """Build the stop-word set and the lemmatizer once and reuse them afterwards."""
    global _preprocess_resources
    if _preprocess_resources is None:
        stop_words = set(stopwords.words('english'))
        # Keep the sentiment-bearing words out of the stop-word list.
        stop_words.difference_update(_NEGATIVE_GAME_WORDS, _POSITIVE_GAME_WORDS)
        _preprocess_resources = (frozenset(stop_words), WordNetLemmatizer())
    return _preprocess_resources


def preprocess_text(text):
    """Clean a raw review so it can be scored by the sentiment analyzer.

    Pipeline: expand English contractions, strip punctuation/emojis, tokenize,
    drop stop words and non-alphabetic tokens, then lemmatize.

    Args:
        text: Raw review. Anything that is not a non-blank ``str`` (``None``,
            ``NaN``, numbers, ...) is treated as a missing review.

    Returns:
        str: The surviving lemmatized tokens joined by single spaces, or ``''``
        when the review is missing or blank.
    """
    if not isinstance(text, str) or not text.strip():
        return ''

    # Expand contractions while the apostrophes are still there: stripping
    # punctuation first turns "we'll" into "well" and "I'd" into "id", which
    # can no longer be told apart from ordinary words.
    try:
        text = fix(text)
    except Exception as e:
        # Best effort: carry on with the unexpanded text.
        print(f"Error al aplicar fix(): {e}")

    # Remove emojis and special characters. They could be tokenized with more
    # robust methods, but the choice here is to drop them.
    text = re.sub(r'[^\w\s]', '', text)

    tokens = word_tokenize(text)

    stop_words, lemmatizer = _get_preprocess_resources()
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in tokens
        if word.isalpha() and word.lower() not in stop_words
    )
# Run preprocess_text over every raw review and keep the result in a new 'preprocess_text' column.
user_reviews_dataset['preprocess_text'] = user_reviews_dataset['review'].apply(preprocess_text)

Error al aplicar fix(): string index out of range


Análisis de sentimiento con VADER: devuelve 0 para una review negativa, 1 para una neutra y 2 para una positiva.

In [4]:
# Shared VADER analyzer, created on first use so it is not rebuilt for every row.
_vader_analyzer = None


def analizar_sentimiento(texto):
    """Classify a text with VADER into a three-level sentiment label.

    Args:
        texto: Text to score (here, the preprocessed review).

    Returns:
        int: 2 when the compound score is >= 0.05 (positive review),
        0 when it is <= -0.05 (negative review), 1 otherwise (neutral review).
    """
    global _vader_analyzer
    if _vader_analyzer is None:
        _vader_analyzer = SentimentIntensityAnalyzer()

    compound_score = _vader_analyzer.polarity_scores(texto)['compound']
    if compound_score >= 0.05:
        return 2  # Positive review
    if compound_score <= -0.05:
        return 0  # Negative review
    return 1  # Neutral review
    
# Label every cleaned review with analizar_sentimiento: 0 = negative, 1 = neutral, 2 = positive.
user_reviews_dataset['sentiment_analysis'] = user_reviews_dataset['preprocess_text'].apply(analizar_sentimiento)

A continuación vemos los resultados obtenidos del Análisis de Sentimiento, para analizar si la columna 'sentiment_analysis' guarda alguna correspondencia con las columnas 'recommend' y 'review'.

In [5]:
# Split the reviews by sentiment label:
#   filas_neg  -> 0 (negative), filas_neut -> 1 (neutral), filas_pos -> 2 (positive)
filas_neg, filas_neut, filas_pos = (
    user_reviews_dataset[user_reviews_dataset['sentiment_analysis'] == etiqueta]
    for etiqueta in (0, 1, 2)
)

In [6]:
# Show full cell contents (no truncation) so whole reviews can be read.
pd.options.display.max_colwidth = None

Veo filas negativas

In [7]:
# First negative-labelled review next to its 'recommend' flag.
filas_neg.head(1)[['review', 'recommend', 'sentiment_analysis']]

Unnamed: 0,review,recommend,sentiment_analysis
9,"Random drops and random quests, with stat points. Animation style reminiscent of the era before the Voodoo card.",True,0


Veo filas neutras

In [8]:
# First neutral-labelled review next to its 'recommend' flag.
filas_neut.head(1)[['review', 'recommend', 'sentiment_analysis']]

Unnamed: 0,review,recommend,sentiment_analysis
18,Git gud,True,1


Veo filas positivas

In [9]:
# First positive-labelled review next to its 'recommend' flag.
filas_pos.head(1)[['review', 'recommend', 'sentiment_analysis']]

Unnamed: 0,review,recommend,sentiment_analysis
0,"Simple yet with great replayability. In my opinion does ""zombie"" hordes and team work better than left 4 dead plus has a global leveling system. Alot of down to earth ""zombie"" splattering fun for the whole family. Amazed this sort of FPS is so rare.",True,2


Guardamos el df con las columnas necesarias para el análisis

In [10]:
# Write the identifier columns, the recommend flag and the sentiment label to Parquet.
user_reviews_dataset.loc[
    :, ['user_id', 'item_id', 'posted', 'recommend', 'sentiment_analysis']
].to_parquet('data/02_user_reviews_NLP.parquet', index=False)