In [1]:
import re
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from contractions import fix

# Download the NLTK resources this notebook relies on. The cell runs on every
# execution; nltk.download() reports packages that are already up to date
# instead of fetching them again, so re-running it is cheap.
# 'punkt_tab' is the tokenizer data that word_tokenize() looks up on recent
# NLTK releases (3.9+); 'punkt' is kept for older installations.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'vader_lexicon'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /Users/macbook/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/macbook/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/macbook/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/macbook/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

# NLP

## Análisis de Sentimiento

### user_reviews_final.parquet

In [2]:
# Relative path to the Parquet file that holds the user reviews.
user_reviews_content = '../_data/02_user_reviews_final.parquet'
# Load the file into a DataFrame; later cells read its 'review' column and
# add the preprocessing and sentiment columns to it.
user_reviews_dataset = pd.read_parquet(user_reviews_content)

### Tokenizado, Lematizado y Stopwords

Dado que VADER es una herramienta específica para el análisis de sentimientos, orientada a textos en inglés y basada en un diccionario y reglas gramaticales para asignar puntuaciones de polaridad, será de gran utilidad para nuestro trabajo.

Dado que el modelo VADER, como tantos otros modelos de análisis, puede tener dificultades con el sarcasmo o los dobles sentidos, es muy importante realizar la tokenización, el lematizado y el filtrado de stopwords; no es necesario conservar la forma exacta en que las reviews fueron escritas.

También se generará un pequeño diccionario con palabras de connotación positiva y negativa referidas a juegos, para que luego sean respetadas (es decir, que no se eliminen al filtrar las stopwords). Además, Regex servirá para quitar emojis y caracteres especiales, de modo que el proceso genere mejores resultados, y Fix expandirá las contracciones del inglés para que quede todo el contenido necesario para hacer el análisis.

In [3]:
# Sentiment-bearing words that must survive stopword removal. Most are video
# game vocabulary; 'no' and 'not' are listed so negations stay in the text
# that is handed to the sentiment analyzer.
NEGATIVE_GAME_WORDS = (
    'bug', 'glitch', 'crash', 'slow', 'problem', 'unstable', 'poorly',
    'unbalanced', 'boring', 'monotonous', 'repetitive', 'challenging',
    'frustrating', 'cumbersome', 'incomplete', 'inconsistent', 'cheating',
    'dubious', 'disastrous', 'abandoned', 'no', 'not',
)
POSITIVE_GAME_WORDS = (
    'exciting', 'immersive', 'engaging', 'innovative', 'fantastic',
    'amazing', 'rewarding', 'thrilling', 'captivating', 'satisfying',
    'exhilarating', 'enjoyable', 'spectacular', 'brilliant',
    'masterpiece', 'awesome', 'stellar', 'fun', 'unique',
)

# Lazily filled cache: the stopword set and the lemmatizer are built on first
# use and then shared, instead of being rebuilt for every single review.
_PREPROCESS_CACHE = {}


def _get_stop_words():
    """Return the English stopword set minus the protected sentiment words."""
    if 'stop_words' not in _PREPROCESS_CACHE:
        protected = set(NEGATIVE_GAME_WORDS) | set(POSITIVE_GAME_WORDS)
        _PREPROCESS_CACHE['stop_words'] = set(stopwords.words('english')) - protected
    return _PREPROCESS_CACHE['stop_words']


def _get_lemmatizer():
    """Return the shared WordNetLemmatizer instance, creating it on first use."""
    if 'lemmatizer' not in _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text):
    """Clean a raw review before it is scored for sentiment.

    Steps, in order:
      1. expand English contractions with ``fix`` (best effort);
      2. replace emojis, punctuation and other special characters with spaces;
      3. tokenize;
      4. drop stopwords (except the protected sentiment words) and every
         token that is not purely alphabetic;
      5. lemmatize the remaining tokens.

    Parameters
    ----------
    text : str or None
        Raw review text. Missing values (``None``, ``NaN``, ``pd.NA``) and
        empty strings are accepted.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` for missing or
        empty input.
    """
    # Null check first: evaluating ``not text`` on pd.NA raises TypeError.
    if pd.isnull(text) or not text:
        return ''

    # Expand contractions while the apostrophes are still present; stripping
    # punctuation first would turn "don't" into "dont" and hide it from fix().
    try:
        text = fix(text)
    except Exception as e:
        # Best effort: fix() has been seen to fail on some reviews
        # ("string index out of range"); keep going with the unexpanded text.
        print(f"Error al aplicar fix(): {e}")

    # Emojis and special characters are dropped rather than tokenized.
    # They are replaced by a space (not by nothing) so that words separated
    # only by punctuation, e.g. "sequel...so", are not glued together.
    text = re.sub(r'[^\w\s]', ' ', text)

    tokens = word_tokenize(text)

    # Stopwords are matched case-insensitively; kept tokens retain their
    # original casing. Tokens containing digits or underscores are discarded.
    stop_words = _get_stop_words()
    tokens = [word for word in tokens if word.lower() not in stop_words and word.isalpha()]

    lemmatizer = _get_lemmatizer()
    return ' '.join(lemmatizer.lemmatize(word) for word in tokens)
# Clean every raw review and store the result in a new 'preprocess_text' column
user_reviews_dataset['preprocess_text'] = user_reviews_dataset['review'].map(preprocess_text)

Error al aplicar fix(): string index out of range


Análisis de sentimiento con VADER: devuelve 0 para una review negativa, 1 para una neutra y 2 para una positiva.

In [4]:
# Cut-offs applied to VADER's compound score (the +/-0.05 thresholds suggested
# in the VADER documentation for positive / negative classification).
VADER_POSITIVE_THRESHOLD = 0.05
VADER_NEGATIVE_THRESHOLD = -0.05

# Holds the shared analyzer so it is constructed once, not once per review.
_VADER_CACHE = {}


def _get_vader_analyzer():
    """Return the shared SentimentIntensityAnalyzer, creating it on first use."""
    if 'analyzer' not in _VADER_CACHE:
        _VADER_CACHE['analyzer'] = SentimentIntensityAnalyzer()
    return _VADER_CACHE['analyzer']


def analizar_sentimiento(texto, analyzer=None):
    """Classify a text as negative (0), neutral (1) or positive (2) with VADER.

    Parameters
    ----------
    texto : str
        Text to score (in this notebook, the preprocessed review).
    analyzer : object, optional
        Any object exposing ``polarity_scores(text)`` that returns a mapping
        with a ``'compound'`` entry. Defaults to a shared
        ``SentimentIntensityAnalyzer`` that is built once and reused.

    Returns
    -------
    int
        2 if the compound score is >= 0.05, 0 if it is <= -0.05, otherwise 1.
    """
    if analyzer is None:
        analyzer = _get_vader_analyzer()

    compound_score = analyzer.polarity_scores(texto)['compound']

    if compound_score >= VADER_POSITIVE_THRESHOLD:
        return 2  # positive review
    if compound_score <= VADER_NEGATIVE_THRESHOLD:
        return 0  # negative review
    return 1  # neutral review
    
# Score every cleaned review and store the 0/1/2 label in 'sentiment_analysis'
user_reviews_dataset['sentiment_analysis'] = user_reviews_dataset['preprocess_text'].map(analizar_sentimiento)

### Revisión de 'sentiment_analysis' vs. 'review'

A continuación vemos los resultados que se obtuvieron del análisis de sentimiento y comparamos con la columna 'review'.

In [5]:
# Split the reviews by predicted label: 0 = negative, 1 = neutral, 2 = positive
filas_neg, filas_neut, filas_pos = (
    user_reviews_dataset.loc[user_reviews_dataset['sentiment_analysis'] == label]
    for label in (0, 1, 2)
)

In [6]:
# Do not truncate cell contents, so whole reviews are readable in the tables below
pd.options.display.max_colwidth = None

- Comparamos las filas negativas con la review

In [7]:
# First five reviews labelled negative, next to their label
filas_neg.head()[['review', 'sentiment_analysis']]

Unnamed: 0,review,sentiment_analysis
9,"Random drops and random quests, with stat points. Animation style reminiscent of the era before the Voodoo card.",0
16,"The ending to this game is.... ♥♥♥♥♥♥♥.... Just buy it, you'll be invested, im automatically preordering season two of the walking dead game.",0
23,"It reminds me of that TV Show called ""The Walking Dead"".",0
26,You don't have to get this game. It's not like it's the greatest FPS of all time or anything. But have a think about it.,0
29,"Killed the Emperor, nobody cared and got away with it. Accidentally killed a chicken and everybody decided to gang up on me. 10/10",0


- Comparamos las filas neutras con la review

In [8]:
# First five reviews labelled neutral, next to their label
filas_neut.head()[['review', 'sentiment_analysis']]

Unnamed: 0,review,sentiment_analysis
18,Git gud,1
22,This game is Marvellous.,1
27,ZIKA DO BAILE,1
30,10/10 would eat your money for hats and keys,1
32,mt bom,1


- Comparamos las filas positivas con la review

In [9]:
# First five reviews labelled positive, next to their label
filas_pos.head()[['review', 'sentiment_analysis']]

Unnamed: 0,review,sentiment_analysis
0,"Simple yet with great replayability. In my opinion does ""zombie"" hordes and team work better than left 4 dead plus has a global leveling system. Alot of down to earth ""zombie"" splattering fun for the whole family. Amazed this sort of FPS is so rare.",2
1,It's unique and worth a playthrough.,2
2,Great atmosphere. The gunplay can be a bit chunky at times but at the end of the day this game is definitely worth it and I hope they do a sequel...so buy the game so I get a sequel!,2
3,"I know what you think when you see this title ""Barbie Dreamhouse Party"" but do not be intimidated by it's title, this is easily one of my GOTYs. You don't get any of that cliche game mechanics that all the latest games have, this is simply good core gameplay. Yes, you can't 360 noscope your friends, but what you can do is show them up with your bad ♥♥♥ dance moves and put them to shame as you show them what true fashion and color combinations are.I know this game says for kids but, this is easily for any age range and any age will have a blast playing this.8/8",2
4,"For a simple (it's actually not all that simple but it can be!) truck driving Simulator, it is quite a fun and relaxing game. Playing on simple (or easy?) its just the basic WASD keys for driving but (if you want) the game can be much harder and realistic with having to manually change gears, much harder turning, etc. And reversing in this game is a ♥♥♥♥♥, as I imagine it would be with an actual truck. Luckily, you don't have to reverse park it but you get extra points if you do cause it is bloody hard. But this is suprisingly a nice truck driving game and I had a bit of fun with it.",2


A simple vista parece que las columnas tienen cierta correspondencia, pero al ser algo subjetivo y de índole coloquial, quizás habría que analizar con un modelo que tome mejor los giros idiomáticos y demás variaciones que podrían darle mayor exactitud al análisis. Cabe decir que algunas reviews pueden ser un poco ambiguas y difíciles de categorizar.
En el análisis exploratorio veremos con mayor detenimiento los resultados obtenidos en esta etapa y qué nivel de correspondencia tienen con las recomendaciones.

###  Guardamos el df con las columnas necesarias para el análisis posterior

In [10]:
# Write the identifier, date, recommendation and sentiment columns to a
# Parquet file (the review text is not included; the index is not stored)
(
    user_reviews_dataset
    .loc[:, ['user_id', 'item_id', 'posted', 'recommend', 'sentiment_analysis']]
    .to_parquet('../_data/02_user_reviews_NLP.parquet', index=False)
)