In [1]:
import pandas as pd
import glob
import os
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tabulate import tabulate
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [9]:



# Genre labels used by array_to_column to name the expanded per-genre columns
# (one label per array position, in this order).
generos = [
    'Action', 'Adventure', 'Animation', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Family',
    'Fantasy', 'Foreign', 'History', 'Horror',
    'Music', 'Mystery', 'Romance', 'ScienceFiction',
    'TVMovie', 'Thriller', 'War', 'Western',
]

def read_parquet(pasta='../output-treino'):
    """Load every ``*.parquet`` file in ``pasta`` into a single DataFrame.

    Parameters
    ----------
    pasta : str, optional
        Folder that holds the parquet shards. Defaults to the training
        output folder used by this notebook.

    Returns
    -------
    pandas.DataFrame
        All shards concatenated row-wise with a fresh 0..n-1 index.

    Raises
    ------
    FileNotFoundError
        If ``pasta`` contains no ``*.parquet`` file.
    """
    # glob returns paths in arbitrary order; sorting keeps the row order (and
    # therefore the index labels assigned below) reproducible across runs.
    arquivos_parquet = sorted(glob.glob(os.path.join(pasta, '*.parquet')))
    if not arquivos_parquet:
        # Without this guard pd.concat fails with "No objects to concatenate",
        # which does not say which folder was empty.
        raise FileNotFoundError(f"No .parquet files found in '{pasta}'")

    lista_df = [pd.read_parquet(arquivo) for arquivo in arquivos_parquet]
    return pd.concat(lista_df, ignore_index=True)

def normaliza_array(df, array_col):
    """Min-max scale the array-valued column ``array_col`` and return a new frame.

    Every cell of ``df[array_col]`` must hold a 1-D array of the same length.
    The cells are stacked into a matrix (one row per record), each matrix
    column is scaled independently with scikit-learn's ``MinMaxScaler``
    (default settings) fitted on that same data, and the scaled rows are
    written back as one array per record.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is NOT modified; a copy is returned.
    array_col : str
        Name of the column holding the arrays to scale.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose ``array_col`` holds the scaled arrays. A frame
        with zero rows is returned unchanged (as a copy).
    """
    # Work on a copy so re-running the calling cell never sees an input that
    # was already scaled by a previous run.
    result = df.copy()
    if len(result) == 0:
        # np.vstack raises on an empty sequence; there is nothing to scale.
        return result

    matrix = np.vstack(result[array_col].values)
    matrix_normalized = MinMaxScaler().fit_transform(matrix)
    # list(...) splits the matrix back into one 1-D array per row.
    result[array_col] = list(matrix_normalized)
    return result

def array_to_column(df, array_col, prefixo, nomes=None):
    """Expand an array-valued column into one scalar column per array position.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    array_col : str
        Column whose cells are equal-length sequences.
    prefixo : str
        Text prepended to every label to build the new column names.
    nomes : sequence of str, optional
        One label per array position. Defaults to the module-level
        ``generos`` list, which is what the original implementation always
        used.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``array_col``, followed by the new ``prefixo + label``
        columns, on the same index as ``df``.

    Raises
    ------
    ValueError
        Raised by pandas when the arrays' length does not match the number of
        labels.
    """
    if nomes is None:
        # Resolved at call time so the default tracks the module-level list.
        nomes = generos
    nome_cols = [f'{prefixo}{g}' for g in nomes]

    # Reusing df.index keeps the expanded columns row-aligned with df for the
    # concat below, whatever the index labels are.
    arrays_df = pd.DataFrame(df[array_col].tolist(), index=df.index, columns=nome_cols)

    return pd.concat([df.drop(array_col, axis=1), arrays_df], axis=1)

def treino_teste(df, percent):
    """Split ``df`` chronologically into train and test frames.

    Rows are ordered by ``timestamp``; the timestamp found at position
    ``int(len(df) * percent)`` is the cutoff. Rows at or before the cutoff go
    to train, rows strictly after it go to test, so rows sharing the cutoff
    timestamp all land in train.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``timestamp`` column.
    percent : float
        Approximate fraction of rows for the train split, within [0, 1].

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_test)``, both sorted by ``timestamp``.

    Raises
    ------
    ValueError
        If ``percent`` is outside [0, 1].
    """
    if not 0 <= percent <= 1:
        # A negative value would silently index from the end of the frame.
        raise ValueError(f'percent must be between 0 and 1, got {percent!r}')

    df = df.sort_values(by='timestamp')
    if len(df) == 0:
        # No row to take a cutoff from: both splits are empty.
        return df, df.copy()

    # Clamp so percent == 1.0 selects the last row instead of raising IndexError.
    cutoff_index = min(int(len(df) * percent), len(df) - 1)
    # Read from the column directly: no intermediate row Series is built and
    # the timestamp keeps its own dtype.
    cutoff_timestamp = df['timestamp'].iloc[cutoff_index]

    df_train = df[df['timestamp'] <= cutoff_timestamp]
    df_test = df[df['timestamp'] > cutoff_timestamp]
    return df_train, df_test






# Load all shards, scale the per-user genre counts, then expand every
# array-valued column into one scalar column per genre.
# NOTE(review): the count scaler is fitted on the full dataset before the
# temporal split below, so test rows influence the scaling. Consider fitting
# on the train split only.
df_parquet = (
    read_parquet()
    # .sample(frac=0.2)  # uncomment to iterate on a 20% subsample
    .pipe(normaliza_array, 'user_genero_count')
    .pipe(array_to_column, 'user_genero_avg', 'avg_')
    .pipe(array_to_column, 'user_genero_count', 'count_')
    .pipe(array_to_column, 'generos_movie', '')
)

# Chronological split: roughly the earliest 90% of ratings train the model.
df_train, df_test = treino_teste(df_parquet, 0.9)

In [3]:
# Target column predicted by the regression.
label = 'rating'

# Features that describe the movie itself rather than the user.
generical_features = ['movie_popularity', 'movie_overall_grade']

# Columns excluded from the feature matrix: identifiers, the timestamp and
# the target itself.
no_feature = ['imdbId', 'timestamp', 'userId', 'movieId', label, 'tmdbId']

In [22]:
# Experiment with all features: everything except the identifier/target
# columns listed in no_feature goes into the design matrix.
X_train, y_train = df_train.drop(columns=no_feature), df_train[label]
X_test, y_test = df_test.drop(columns=no_feature), df_test[label]

In [20]:
# # user-specific features only
# X_train = X_train.drop(columns=generical_features)
# X_test = X_test.drop(columns=generical_features)

# # generic features only
# X_train = df_train[generical_features]
# X_test = df_test[generical_features]

In [23]:
# Fit the linear regression on the training split.
model = LinearRegression().fit(X_train, y_train)

# Evaluate on the test split using mean squared error.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 0.8617931628821122


In [30]:
# Pick one user from the test split and score every test-set movie that user
# rated, as input for a per-user ranking.
# NOTE(review): the user is simply whoever owns the row at position 99, so
# this raises IndexError when df_test has fewer than 100 rows and does not
# guarantee a user with many movies.
df_user = df_test.iloc[[99]]
user_id = df_user['userId'].tolist()[0]

# All test rows that belong to the selected user.
df_movies_user = df_test.loc[df_test['userId'].eq(user_id)]
X_user = df_movies_user.drop(columns=no_feature)
Y_user = df_movies_user[label]

user_predictions = model.predict(X_user)

In [31]:

# One row per movie of the selected user: id, predicted rating, true rating.
user_predictions_df = df_movies_user[['movieId']].assign(
    predicted_rating=user_predictions,
    Y_user=Y_user,
)

# Rank the movies from highest to lowest predicted rating.
user_ranking = user_predictions_df.sort_values('predicted_rating', ascending=False)
print(user_ranking.head(50))


       movieId  predicted_rating Y_user
7183       858          4.702253    5.0
113329    1221          4.614078    5.0
160164     356          4.497683    5.0
98780     1196          4.475279    5.0
275482     260          4.431191    5.0
81021     1222          4.416378    4.0
252479    2194          4.391864    4.0
30102     1201          4.391606    5.0
216563    2858          4.374094    4.0
128928    2571          4.310223    4.0
145959    1954          4.303096    5.0
16092     1262          4.290017    4.0
192718     110          4.285073    3.0
77118     1291          4.203509    5.0
137628    1242          4.202174    3.0
213733    1304          4.177406    4.0
193289     590          4.169967    5.0
130420    2529          4.154037    4.0
257418    1036          4.151866    4.0
73081     1198          4.148341    5.0
287858    2951          4.133528    4.0
42920     1200          4.111210    4.0
35374     1220          4.099628    4.0
269566    1197          4.083842    5.0


In [None]:
# ranking bias
# candidate generation
# NDCG