In [5]:
import pandas as pd
import gzip 
import ast
import pyarrow as pa
import pyarrow.parquet as pq

In [6]:
datos_json2 = []
# Abrir el archivo y leer línea por línea
with gzip.open('../json/user_reviews.json.gz', 'rt', encoding= 'utf-8') as f2:
    for li in f2:
        json_linea1 = ast.literal_eval(li)
        # Agregar el objeto JSON decodificado a la lista
        datos_json2.append(json_linea1)

In [7]:
reviews_df = pd.DataFrame(datos_json2)

In [8]:
print(reviews_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25799 entries, 0 to 25798
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   user_id   25799 non-null  object
 1   user_url  25799 non-null  object
 2   reviews   25799 non-null  object
dtypes: object(3)
memory usage: 604.8+ KB
None


In [9]:
reviews_df.dropna(how='all', inplace=True) # 25799 entries

In [10]:
# Función para desanidar las reseñas
def desanidar_reseñas(row):
    user_id = row['user_id']
    user_url = row['user_url']
    reseñas = row['reviews']
    result = []
    for idx, res in enumerate(reseñas, start=1):
        res['user_id'] = user_id
        res['user_url'] = user_url
        res['review_id'] = f'{user_id}_review_{idx}'
        result.append(res)
    return pd.DataFrame(result)

In [11]:
reviews_dsn = pd.concat(reviews_df.apply(desanidar_reseñas, axis=1).tolist(), ignore_index=True)


In [12]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Luka\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [13]:
sia = SentimentIntensityAnalyzer()

umbral_negativo = -0.1
umbral_positivo = 0.1


for index, row in reviews_dsn.iterrows():
    if pd.notnull(row['review']):
        sentiment_score = sia.polarity_scores(row['review'])['compound']

        if sentiment_score < umbral_negativo:
            reviews_dsn.at[index, 'feeling'] = 0
        elif sentiment_score > umbral_positivo:
            reviews_dsn.at[index, 'feeling'] = 2
        else:
            reviews_dsn.at[index, 'feeling'] = 1
    else:
        reviews_dsn.at[index, 'feeling'] = 1

In [14]:
table = pa.Table.from_pandas(reviews_dsn)
pq.write_table(table, "../data_transformed/reviews.parquet", compression='snappy')

In [15]:
data = pq.read_table("../data_transformed/reviews.parquet")
user_reviews_df = data.to_pandas()

---------------------------------------------

In [16]:
a = user_reviews_df[user_reviews_df['feeling'] == 0]
b = a['recommend'].value_counts()
b

recommend
True     6427
False    2820
Name: count, dtype: int64

In [17]:
import re
# from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Luka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Luka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Luka\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
reviews = reviews_dsn['review']

In [19]:
# stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

stopwords = nltk.corpus.stopwords.words('english')
stopwords.remove('no')
stopwords.remove('not')

In [20]:
def preproc_review (review):
    review = review.lower()
    review = re.sub("[^a-zA-Z]", " ", str(review)) # Elimina los caracteres que no sean letras o signos de exclamación
    review_wt = nltk.tokenize.word_tokenize(review) # Tokeniza y pasa a minúsculas el texto
    review_wt = [word for word in review_wt if word not in stopwords] # Elimina las stopwords
    review_wt = [lemmatizer.lemmatize(word) for word in review_wt]  # Aplicamos la funcion para buscar la raiz de las palabras
    review_wt = " ".join(review_wt) # Por ultimo volvemos a unir el titular
    return review_wt

In [21]:
reviews_list = []
for review in reviews:
    review_wt = preproc_review(str(review))
    reviews_list.append(review_wt)

In [22]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Inicializar el vectorizador CountVectorizer para bolsa de palabras
vectorizador_bolsa_palabras = CountVectorizer(max_features=750, stop_words="english" , ngram_range=(1,2))

# Ajustar y transformar los textos en una matriz de características
matriz_caracteristicas_bolsa_palabras = vectorizador_bolsa_palabras.fit_transform(reviews_list)

# Obtener el vocabulario (tokens únicos)
vocabulario_bolsa_palabras = vectorizador_bolsa_palabras.get_feature_names_out()

# Imprimir la matriz de características y el vocabulario
print("Matriz de características (Bolsa de palabras):\n", matriz_caracteristicas_bolsa_palabras.toarray())
print("Vocabulario (Tokens únicos):", vocabulario_bolsa_palabras)

# # Inicializar el vectorizador TfidfVectorizer para TF-IDF
# vectorizador_tfidf = TfidfVectorizer(max_features=1000, stop_words="english" , ngram_range=(1,2))

# # Ajustar y transformar los textos en una matriz de características TF-IDF
# matriz_caracteristicas_tfidf = vectorizador_tfidf.fit_transform(reviews.tolist())

# # Imprimir la matriz de características TF-IDF
# print("\nMatriz de características (TF-IDF):\n", matriz_caracteristicas_tfidf.toarray())

Matriz de características (Bolsa de palabras):
 [[0 0 0 ... 0 0 2]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Vocabulario (Tokens únicos): ['ability' 'able' 'absolutely' 'access' 'achievement' 'action' 'actual'
 'actually' 'add' 'added' 'addicting' 'addictive' 'adventure' 'age' 'ago'
 'ai' 'al' 'alien' 'allows' 'alot' 'alpha' 'alright' 'amazing'
 'amazing game' 'amigo' 'animation' 'annoying' 'apart' 'area' 'arma' 'art'
 'aspect' 'atmosphere' 'attack' 'away' 'awesome' 'awesome awesome'
 'awesome game' 'awsome' 'bad' 'bad bad' 'base' 'based' 'basic'
 'basically' 'batman' 'battle' 'beat' 'beautiful' 'begin' 'believe' 'bem'
 'best' 'best game' 'beta' 'better' 'big' 'bit' 'black' 'blood' 'bom'
 'borderland' 'bored' 'boring' 'bos' 'boss' 'bought' 'bought game' 'break'
 'brilliant' 'bring' 'broken' 'bueno' 'bug' 'buggy' 'build' 'building'
 'bullet' 'button' 'buy' 'buy game' 'buying' 'called' 'came' 'campaign'
 'car' 'card' 'care' 'case