# Tarea 2 - Sistema de recomendación
## Inteligencia de negocios

### Bibliotecas usadas

In [2]:
import pandas as pd
from dotenv import load_dotenv
import requests
import numpy as np
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import scipy.stats as stats
# import scipy.sparse as sparse
plt.rcParams["figure.figsize"] = (12,10)
from scipy import sparse
from lightfm.cross_validation import random_train_test_split+
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

load_dotenv()



True

In [3]:
movie_titles = pd.read_csv("movie_titles.csv", usecols=range(3), names=['id', 'year', 'name'], encoding='ISO-8859-1')
movie_titles

Unnamed: 0,id,year,name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW
...,...,...,...
17765,17766,2002.0,Where the Wild Things Are and Other Maurice Se...
17766,17767,2004.0,Fidel Castro: American Experience
17767,17768,2000.0,Epoch
17768,17769,2003.0,The Company


#### Identificación y corrección de columnas vacías

In [4]:
print("----- Campos NaN en columna id -----\n", movie_titles[movie_titles['name'].isna()])
print("----- Campos NaN en columna año -----\n", movie_titles[movie_titles['year'].isna()])
print("----- Campos NaN en columna nombre -----\n", movie_titles[movie_titles['id'].isna()])

----- Campos NaN en columna id -----
 Empty DataFrame
Columns: [id, year, name]
Index: []
----- Campos NaN en columna año -----
           id  year                                         name
4387    4388   NaN      Ancient Civilizations: Rome and Pompeii
4793    4794   NaN  Ancient Civilizations: Land of the Pharaohs
7240    7241   NaN     Ancient Civilizations: Athens and Greece
10781  10782   NaN                       Roti Kapada Aur Makaan
15917  15918   NaN                      Hote Hote Pyaar Ho Gaya
16677  16678   NaN                              Jimmy Hollywood
17666  17667   NaN                           Eros Dance Dhamaka
----- Campos NaN en columna nombre -----
 Empty DataFrame
Columns: [id, year, name]
Index: []


#### Llenado de la información faltante
Para no tener que eliminar estas películas, primero probamos la API de The Movie Database, en caso de que esta pudiera entregarnos los años de cada película.

In [5]:
api_key = os.getenv("API_KEY")

for movie in movie_titles[movie_titles['year'].isna()]['name']:
    url = f"https://api.themoviedb.org/3/search/movie?api_key={api_key}&query={movie}"
    response = requests.get(url)
    data = response.json()
    try:
        relase_date = data['results'][0]['release_date']
        if relase_date == '':
            raise
        release_year = relase_date[:4]
        print("🟢 La fecha de lanzamiento de " + movie + " es " + relase_date + ",
              del año " + release_year + ".")
        movie_titles.loc[movie_titles['name'] == movie, 'year'] = release_year
    except: print("🔴 No se encontró fecha de lanzamiento para " + movie + ".")

🔴 No se encontró fecha de lanzamiento para Ancient Civilizations: Rome and Pompeii.
🔴 No se encontró fecha de lanzamiento para Ancient Civilizations: Land of the Pharaohs.
🔴 No se encontró fecha de lanzamiento para Ancient Civilizations: Athens and Greece.
🟢 La fecha de lanzamiento de Roti Kapada Aur Makaan es 1974-01-01, del año 1974.
🔴 No se encontró fecha de lanzamiento para Hote Hote Pyaar Ho Gaya.
🟢 La fecha de lanzamiento de Jimmy Hollywood es 1994-03-30, del año 1994.
🔴 No se encontró fecha de lanzamiento para Eros Dance Dhamaka.


Ya que solo se encontró el año de lanzamiento de dos películas, se procede a hacer una búsqueda manual en Google para llenar las películas faltantes.

In [6]:
movie_titles.at[4387, 'year'] = 2001
movie_titles.at[4793, 'year'] = 2001
movie_titles.at[7240, 'year'] = 2002
movie_titles.at[15917, 'year'] = 1999
movie_titles.at[17666, 'year'] = 1999
movie_titles = movie_titles.astype(dtype = {'id': np.int32, 'year': np.int32})
print("----- Campos NaN en columna año -----\n", movie_titles[movie_titles['year'].isna()])

----- Campos NaN en columna año -----
 Empty DataFrame
Columns: [id, year, name]
Index: []


#### Lectura de las calificaciones para cada película

In [7]:

ratings_folder = 'c:\\umayor\\training_set'
files = ['\\mv_{}.txt'.format(str(f).zfill(7)) for f in range(1,17771)]
df_list = []

for filename in tqdm(files):
    temp = pd.read_csv(ratings_folder + filename, names = ['user', 'rating', 'year'], skiprows = 1).astype(dtype = {'user': np.single, 'rating': np.single}).drop(columns = ['year'])
    temp['movie'] = filename.split('.')[0].split('_')[1].lstrip('0')

    interactions = (temp.groupby(['user', 'movie'])['rating']
      .sum()
      .unstack()
      #.reset_index()
      #.fillna(0)
      #.set_index('user')
    )
    #print(interactions)
    df_list.append(interactions)

100%|██████████| 17770/17770 [02:29<00:00, 118.56it/s]


#### Almacenamiento en una matriz pivote

In [8]:
pivot_df = pd.DataFrame()
pivot_df = pd.concat(df_list, axis=1)
pivot_df = pivot_df.fillna(0)
pivot_df

movie,1,2,3,4,5,6,7,8,9,10,...,17761,17762,17763,17764,17765,17766,17767,17768,17769,17770
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2649404.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2649409.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2649421.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2649426.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
pivot_df.to_pickle('ratings_df.pkl')

In [16]:
pivot_df

movie,1,2,3,4,5,6,7,8,9,10,...,17761,17762,17763,17764,17765,17766,17767,17768,17769,17770
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2649404.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2649409.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2649421.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2649426.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Generación de matriz dispersa con compresión de los 0 por fila

In [17]:
sparse_matrix = sparse.csr_matrix(pivot_df.to_numpy())
del pivot_df
train_test = random_train_test_split(sparse_matrix, test_percentage=0.2)
train = train_test[0]
test = train_test[1]

#### Selección del modelo y entrenamiento

In [18]:
model = LightFM(loss='warp')
model.fit(train, epochs=30, verbose=True)

<lightfm.lightfm.LightFM at 0x1998677f7c0>

https://stackoverflow.com/questions/45451161/evaluating-the-lightfm-recommendation-model/45466481#45466481

In [19]:
train_precision = precision_at_k(model, train, k=5).mean()
test_precision = precision_at_k(model, train, test, k=5).mean()
train_auc = auc_score(model, train).mean()
test_auc = auc_score(model, test, train_interactions=train).mean()

In [20]:
print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

Precision: train 0.49, test 0.57.
AUC: train 0.97, test 0.97.


In [22]:
suggested_movies = model.predict(915, [x for x in range(1,17770)])
suggested_movies

array([-3.4648578, -1.7749546, -3.0492978, ..., -4.08479  , -1.1053587,
       -4.1838884], dtype=float32)