In [24]:
from nltk.stem import PorterStemmer
from stop_words import get_stop_words
import json
import pandas as pd
import unicodedata
import re
import numpy as np

In [11]:
# Helper functions to normalize text, remove stopwords and apply stemming to reviews.
# They clean and transform each sentence before it is analyzed.

def sentence_normalization(sentence):
    """Lower-case a sentence, reduce it to ASCII and keep only alphabetic tokens.

    The text is NFKD-decomposed so accented letters split into a base letter
    plus combining marks; encoding to ASCII with ``errors='ignore'`` then
    discards the marks together with any other non-ASCII character.

    Args:
        sentence: Text to normalize (``str``).

    Returns:
        The surviving tokens joined by single spaces. A whitespace-delimited
        token that contains any non-letter character (digit, punctuation, ...)
        is dropped as a whole, e.g. ``"great!"`` is removed rather than
        trimmed to ``"great"``.
    """
    decomposed = unicodedata.normalize('NFKD', sentence)
    ascii_text = decomposed.lower().encode('ascii', errors='ignore').decode('utf-8')
    alphabetic_tokens = [token for token in ascii_text.split() if token.isalpha()]
    return ' '.join(alphabetic_tokens)


def remove_stopwords(sentence, sw_list):
    """Drop every whitespace-delimited token of ``sentence`` found in ``sw_list``.

    Args:
        sentence: Text whose tokens are separated by whitespace.
        sw_list: Container of stopwords; membership is tested with ``in``.

    Returns:
        The remaining tokens, in their original order, joined by single spaces.
    """
    kept_tokens = []
    for token in sentence.split():
        if token in sw_list:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)


# Single module-level stemmer instance, reused by every apply_stemming call.
stemmer = PorterStemmer()


def apply_stemming(sentence):
    """Replace each whitespace-delimited token of ``sentence`` with its stem.

    Args:
        sentence: Text whose tokens are separated by whitespace.

    Returns:
        The stemmed tokens, in their original order, joined by single spaces.
    """
    stemmed_tokens = map(stemmer.stem, sentence.split())
    return ' '.join(stemmed_tokens)

# Stopwords returned by stop_words.get_stop_words for the language code 'en'.
sw_list = get_stop_words('en')

def process_reviews(reviews, sw_list):
    """Normalize, strip stopwords from, and stem every review.

    Args:
        reviews: Iterable of review texts. Missing entries (``None``, float
            ``NaN``, ``pd.NA``) are accepted.
        sw_list: Iterable of hashable stopwords (strings) to remove.

    Returns:
        A list with one entry per input review, in input order: the cleaned
        text, or the literal string ``'None'`` as a placeholder for a missing
        review. A review whose tokens are all filtered out yields ``''``.
    """
    # Build the lookup once: set membership is O(1) per token, instead of
    # scanning the whole stopword list for every word of every review.
    stopwords = set(sw_list)

    processed_sentences = []
    for sent in reviews:
        # pd.isna recognises None, NaN and pd.NA. A bare self-comparison
        # (`sent != sent`) only detects NaN, which would let a None review
        # fall through to unicodedata.normalize and raise TypeError.
        if pd.isna(sent):
            processed_sentences.append('None')
            continue
        sent = sentence_normalization(sent)
        sent = remove_stopwords(sent, stopwords)
        sent = apply_stemming(sent)
        processed_sentences.append(sent)
    return processed_sentences

In [13]:
# Location of the JSON Lines review dump (one JSON object per line).
# NOTE(review): machine-specific absolute path; adjust it for your environment.
DATA_PATH = 'C:/Users/noeli/Desktop/Practica_NLP/Video_Games_5.json/Video_Games_5.json'

# Parse the file line by line into a dict keyed by record position (0, 1, 2, ...).
# Blank lines (e.g. a trailing empty line at the end of the file) are skipped,
# because json.loads raises JSONDecodeError when there is no value to parse.
with open(DATA_PATH, "r", encoding="utf-8") as f:
    non_blank_lines = (line for line in f if line.strip())
    data = {idx: json.loads(line) for idx, line in enumerate(non_blank_lines)}

In [16]:
# Pull the review text ('reviewText') and the rating ('overall') out of every
# record in `data`; .get() yields None when a record lacks the key.
reviews_text = [review.get('reviewText') for review in data.values()]
reviews_sentiment = [review.get('overall') for review in data.values()]

# Assemble the DataFrame, drop rows missing either field, and renumber the
# index so it is contiguous again.
df_reviews = (
    pd.DataFrame({'review': reviews_text, 'sentiment': reviews_sentiment})
    .dropna(subset=['review', 'sentiment'])
    .reset_index(drop=True)
)

In [22]:
# Preprocess every review: normalization, stopword removal and stemming.
processed_reviews = process_reviews(df_reviews['review'], sw_list)

In [25]:
# Attach the preprocessed text as a new 'processedReview' column, turn reviews
# that ended up empty after preprocessing into NaN, and drop those rows.
# NOTE(review): `processed_reviews` must hold one entry per row of the current
# `df_reviews`. This cell rebinds `df_reviews` to the filtered frame, so if rows
# were dropped, re-running it without recomputing `processed_reviews` fails
# with a length-mismatch ValueError.
df_reviews = (
    df_reviews
    .assign(processedReview=processed_reviews)
    .assign(processedReview=lambda frame: frame['processedReview'].replace('', np.nan))
    .dropna(subset=['processedReview'])
)