In [41]:
import pandas as pd
import glob
import os
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tabulate import tabulate
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import ndcg_score


In [4]:



generos = ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Foreign', 'History', 'Horror', 'Music', 'Mystery', 'Romance', 'ScienceFiction', 'TVMovie', 'Thriller', 'War', 'Western']

def read_parquet(pasta):
    arquivos_parquet = glob.glob(os.path.join(pasta, '*.parquet'))
    lista_df = [pd.read_parquet(arquivo) for arquivo in arquivos_parquet]
    df = pd.concat(lista_df, ignore_index=True)
    return df

def normaliza_array(df, array_col):
    matrix = np.vstack(df[array_col].values)
    matrix_normalized = MinMaxScaler().fit_transform(matrix)
    df[array_col] = list(matrix_normalized)
    return df

def array_to_column(df, array_col, prefixo):
    nome_cols = [f'{prefixo}{g}' for g in generos]
    
    arrays_df = pd.DataFrame(df[array_col].tolist(), index=df.index, columns=nome_cols)
    
    return pd.concat([df.drop(array_col, axis=1), arrays_df], axis=1)

def treino_teste(df, percent):
    df = df.sort_values(by='timestamp')
    cutoff_index = int(len(df) * percent)
    cutoff_timestamp = df.iloc[cutoff_index]['timestamp']
    df_train = df[df['timestamp'] <= cutoff_timestamp]
    df_test = df[df['timestamp'] > cutoff_timestamp]
    return df_train, df_test

In [7]:
df_parquet = read_parquet('../output-train')
df_parquet = df_parquet.sample(frac=1)
df_parquet = array_to_column(df_parquet,'user_genero_avg', 'avg_')
df_parquet = array_to_column(df_parquet,'generos_movie', '')

In [8]:
df_train, df_test = treino_teste(df_parquet, 0.9)
df_candidatos = read_parquet('../geracao_candidatos').sample(n=100, random_state=42)

df_test = pd.merge(df_test, df_candidatos, on='imdbId', how='inner')

In [9]:
df_train = normaliza_array(df_train,'user_genero_count')
df_train = array_to_column(df_train,'user_genero_count', 'count_')
df_test = normaliza_array(df_test,'user_genero_count')
df_test = array_to_column(df_test,'user_genero_count', 'count_')

In [10]:
no_feature = ['imdbId', 'timestamp', 'userId', 'movieId', 'rating', 'tmdbId']
label = 'rating'
generical_features = ['movie_popularity', 'movie_overall_grade']

In [11]:
# teste com todas features

X_train = df_train.drop(columns=no_feature)
y_train = df_train[label]

X_test = df_test.drop(columns=no_feature)
y_test = df_test[label]


In [20]:
# # só features especificas do usuario
# X_train = X_train.drop(columns=generical_features)
# X_test = X_test.drop(columns=generical_features)

# # só features genericas
# X_train = df_train[generical_features]
# X_test = df_test[generical_features]

In [12]:
# Treinamento do modelo de regressão
model = LinearRegression()
model.fit(X_train, y_train)

# Avaliação do modelo
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 0.8203409440215338


In [54]:
# selecionar um usuário no teste com varios filmes e fazer o raking
user_counts = (
    df_test
    .groupby('userId')
    .size()
    .reset_index(name='user_counts')
)
frequent_users = user_counts[user_counts['user_counts'] >= 7]['userId']
df_filtered = df_test[df_test['userId'].isin(frequent_users)]


df_user = df_filtered.iloc[[19]]
user_id = df_user['userId'].tolist()[0]
df_movies_user = df_test[df_test['userId'] == user_id]
Y_user = df_movies_user[label]
X_user = df_movies_user.drop(columns=no_feature)
user_predictions = model.predict(X_user)

In [55]:

user_predictions_df = pd.DataFrame({
    'movieId': df_movies_user['movieId'],
    'predicted_rating': user_predictions,
    'Y_user': Y_user
})

# Gerando o ranking dos filmes com base nas previsões
user_ranking = user_predictions_df.sort_values(by='predicted_rating', ascending=False)
print(user_ranking.head(50))


    movieId  predicted_rating Y_user
314     858          4.523031    4.0
316    2324          4.268040    5.0
313    1258          4.210135    5.0
317    1270          3.912211    5.0
302     260          3.797797    3.0
373    1527          3.736726    3.0
334    3996          3.725981    5.0
315    2078          3.608398    5.0


In [56]:

true_ratings = np.array([user_predictions_df['Y_user'].values])
predicted_ratings = np.array([user_predictions_df['predicted_rating'].values])
ndcg = ndcg_score(true_ratings, predicted_ratings)

print(f"NDCG: {ndcg:.4f}")

NDCG: 0.9540
