In [None]:
# Load the data into two DataFrames: one for the game info and one for the reviews.

import pandas as pd
import ast

# The data is loaded in chunks because loading it in one go did not fit in RAM.

# Number of records buffered before they are turned into a DataFrame.
CHUNK_SIZE = 100_000
def json_line_generator(file_path):
    """Lazily yield one parsed record per non-blank line of a UTF-8 text file.

    Each line is stripped of surrounding whitespace and evaluated with
    ``ast.literal_eval``, so lines must be Python literals (e.g. dicts written
    with single quotes). Blank lines are skipped.

    Args:
        file_path: Path of the file to read.

    Yields:
        The Python object described by each non-empty line, in file order.

    Raises:
        ValueError / SyntaxError: Propagated from ``ast.literal_eval`` when a
            line is not a valid Python literal.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        stripped_lines = (raw_line.strip() for raw_line in handle)
        for literal_text in stripped_lines:
            if not literal_text:
                continue
            yield ast.literal_eval(literal_text)

def load_data_in_chunks(file_path, chunk_size):
    """Read a line-delimited file into one DataFrame, buffering in batches.

    Records produced by ``json_line_generator`` are collected in a buffer; each
    time the buffer holds at least ``chunk_size`` records it is converted to a
    DataFrame and emptied. All partial frames are concatenated at the end.

    Args:
        file_path: Path of the file to read.
        chunk_size: Number of records that triggers a buffer flush.

    Returns:
        A DataFrame with a fresh 0..n-1 index containing every record, or an
        empty DataFrame when the file yields no records.
    """
    frames = []
    buffer = []
    for item in json_line_generator(file_path):
        buffer.append(item)
        if len(buffer) < chunk_size:
            continue
        # Buffer is full: materialize it and start a new one.
        frames.append(pd.DataFrame(buffer))
        buffer = []

    # Flush whatever is left over after the last full batch.
    if buffer:
        frames.append(pd.DataFrame(buffer))

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


# Input file locations.
file_name_games = '/content/drive/MyDrive/steam_games.json'
file_name_reviews = '/content/drive/MyDrive/steam_new.json'

# Game metadata.
df_games = load_data_in_chunks(file_name_games, CHUNK_SIZE)
print("DataFrame de steam_games creado con éxito.")
print("\nNúmero total de registros cargados:", len(df_games))

# Reviews.
df_reviews = load_data_in_chunks(file_name_reviews, CHUNK_SIZE)
print("DataFrame de steam_new creado con éxito.")
print("\nNúmero total de registros cargados:", len(df_reviews))

In [None]:
# Add a column with the game's name to the reviews DataFrame.
#
# Build a product_id -> game_title lookup from the games frame. Rows with a
# missing id are dropped (pandas matches null merge keys against each other)
# and only the first row per id is kept, so the left merge below can never
# multiply review rows.
df_games_info = (
    df_games[['id', 'title']]
    .rename(columns={'id': 'product_id', 'title': 'game_title'})
    .dropna(subset=['product_id'])
    .drop_duplicates(subset='product_id', keep='first')
)

df_reviews = pd.merge(
    # Drop a 'game_title' left over from a previous run of this cell so that
    # re-running does not create game_title_x / game_title_y columns.
    # NOTE(review): assumes the raw reviews data has no 'game_title' column of
    # its own; confirm against the source file.
    df_reviews.drop(columns=['game_title'], errors='ignore'),
    df_games_info,
    on='product_id',
    how='left',
    # Fail loudly if the lookup ever stops being unique per product_id.
    validate='many_to_one',
)

In [None]:
# Show the first five rows of the reviews DataFrame.
df_reviews.head(n=5)

In [None]:
# Lowercase the review text, tokenize it, and remove stopwords.

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Patterns are compiled once here instead of being re-parsed on every call.
_URL_PATTERN = re.compile(r'http\S+|www\S+')           # URLs
_HTML_TAG_PATTERN = re.compile(r'<.*?>')               # HTML tags
_NON_LETTER_PATTERN = re.compile(r'[^a-záéíóúüñ\s]')   # anything that is not a letter or whitespace

# Stopword sets keyed by the tuple of languages they combine; filled lazily so
# the NLTK corpus is read once rather than once per review.
_STOPWORD_CACHE = {}


def _get_stopwords(languages=('spanish', 'english')):
    """Return the cached union of the NLTK stopword lists for ``languages``."""
    key = tuple(languages)
    if key not in _STOPWORD_CACHE:
        combined = set()
        for language in key:
            combined.update(stopwords.words(language))
        _STOPWORD_CACHE[key] = frozenset(combined)
    return _STOPWORD_CACHE[key]


def preprocess_text(text):
    """Turn a raw review into a list of lowercase, stopword-free tokens.

    Steps: lowercase, replace URLs / HTML tags / non-letter characters with a
    space, tokenize with ``word_tokenize``, then drop Spanish and English
    stopwords and single-character tokens.

    Noise is replaced by a space (not deleted) so that words separated only by
    punctuation or a tag, e.g. ``"good,but"``, stay separate tokens.

    Requires the NLTK ``stopwords`` corpus and the tokenizer data used by
    ``word_tokenize`` to be installed (see ``nltk.download``).

    Args:
        text: The review text. ``None``, ``float('nan')`` (how pandas marks a
            missing value) and the empty string are treated as "no text".
            Other non-string values are converted with ``str``.

    Returns:
        A list of token strings; empty when there is no usable text.
    """
    if text is None:
        return []
    # NaN is the only float that is not equal to itself; without this guard a
    # missing review would be stringified to 'nan' and returned as a token.
    if isinstance(text, float) and text != text:
        return []

    # Lowercase + noise removal
    text = str(text).lower()
    if not text:
        return []
    text = _URL_PATTERN.sub(' ', text)
    text = _HTML_TAG_PATTERN.sub(' ', text)
    text = _NON_LETTER_PATTERN.sub(' ', text)

    # Nothing but whitespace left: skip tokenization entirely.
    if not text.strip():
        return []

    # Tokenization
    tokens = word_tokenize(text)

    # Stopword and single-character filtering
    all_stopwords = _get_stopwords()
    return [word for word in tokens if len(word) > 1 and word not in all_stopwords]

# Make sure the NLTK data needed by preprocess_text is available, so this cell
# also works on a fresh kernel/runtime. nltk.download skips resources that are
# already up to date. 'punkt' and 'punkt_tab' are both requested because the
# tokenizer resource name differs between NLTK releases.
for _resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(_resource, quiet=True)

# Token list per review, plus the same tokens joined back into one string.
df_reviews['processed_tokens'] = df_reviews['text'].apply(preprocess_text)
df_reviews['processed_text'] = df_reviews['processed_tokens'].apply(' '.join)

In [None]:
import gensim
from gensim.models import Word2Vec

# Training corpus: one token list per review.
sentences = list(df_reviews['processed_tokens'])

# Hyperparameters handed to Word2Vec below.
VECTOR_SIZE = 100   # vector_size
WINDOW_SIZE = 5     # window
MIN_COUNT = 5       # min_count
WORKERS = 4         # workers
ITERATIONS = 10     # epochs

print("Iniciando entrenamiento de Word2Vec...")

word2vec_model = Word2Vec(
    sentences=sentences,
    sg=0,
    vector_size=VECTOR_SIZE,
    window=WINDOW_SIZE,
    min_count=MIN_COUNT,
    epochs=ITERATIONS,
    workers=WORKERS,
)
print(f"Vocabulario total aprendido: {len(word2vec_model.wv.index_to_key)} palabras.")

# Quick check: list the nearest neighbours of a probe word, if it was learned.
word = 'game'
if word not in word2vec_model.wv:
    print(f"La palabra '{word}' no se encuentra en el vocabulario (min_count={MIN_COUNT}).")
else:
    similares = word2vec_model.wv.most_similar(word, topn=5)
    print(f"\nPalabras similares a '{word}':")
    for similar_word, score in similares:
        print(f"  - {similar_word} (Similitud: {score:.4f})")
