In [1]:
import pandas as pd
import numpy as np
import pickle
import logging
import plotly.express as px
from sklearn.model_selection import train_test_split
from IPython.display import display
from EDA_functions import *
from cluster import cluster_movies

In [2]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

In [3]:
import logging
logging_format = '%(asctime)s %(levelname)s: %(message)s'
logging.basicConfig(format=logging_format, datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO)

In [4]:
seed=50000

# Reconocimiento de Datos 

De acuerdo a la documentación disponible de esta base de datos vía: https://files.grouplens.org/datasets/movielens/ml-20m-README.html se llega a que:
* *links.cv* no se usará debido a que solo son indentificadores de las películas en las páginas.
* *genome-scores.csv* y *genome-tags.csv*. La columna que ofrecería cierta información razonable sería `relavance`, pero, segun la descripción ofrecida *"the tag genome was computed using a machine learning algorithm on user-contributed content including tags, ratings, and textual reviews"*. Se entiende que ha utilizado información de toda la data del conjunto, por lo que considerar estos scores en el entrenamieto y prueba estaría provocando fuga de información (leakege) de uno a otro conjunto. Entonces, no se usarán estos scores.  

Se cargan las datas de interés y se hacen una serie de depslieques de los primeras filas de cada una para reconocer el contenido.

In [5]:
movie = pd.read_csv('movie.csv')
rating = pd.read_csv('rating.csv')
tag = pd.read_csv('tag.csv')

In [6]:
print("BASE DE DATOS: movie")
display(movie.head(3))
display(movie.shape)
display(movie.isna().sum())
print("BASE DE DATOS: rating")
display(rating.head(3))
display(rating.shape)
display(rating.isna().sum())
print("BASE DE DATOS: tag")
display(tag.head(3))
display(tag.shape)
display(tag.isna().sum())

BASE DE DATOS: movie


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


(27278, 3)

movieId    0
title      0
genres     0
dtype: int64

BASE DE DATOS: rating


Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39


(20000263, 4)

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

BASE DE DATOS: tag


Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,2009-04-24 18:19:40
1,65,208,dark hero,2013-05-10 01:41:18
2,65,353,dark hero,2013-05-10 01:41:19


(465564, 4)

userId        0
movieId       0
tag          16
timestamp     0
dtype: int64

In [None]:
# info_user(data=rating)

Desplegamos información acerca de los 138,493 __usuario__.
* Todos ellos han revisado, como mínimo 20 películas.
* Uno de ellos ha revisado 9,254 películas.  
* Sólo 7,801 han dejado dejado algún tipo de tags en las películas.

Ahora desplegamos unos insights para las 27,278 __películas__ que se enlistan en la data *movies*.
* 19,545 tiene algún tipo de token (palabra relevante en los tags).
* Solo aparecen 26,744 en la data *rating*, de las cuales:

    1) 3,972 solo tiene 1 reseña. \
    2) 2,043 solo tiene 2 reseñas. \
    3) 1,355 solo tiene 3 reseñas. \
    ... \
    10) 372 solo tiene 10 reseñas. \
    ... 

Hora bien, dividiremosa los __usuarios__ de una vez en conjuntos de entrenamiento y prueba.

In [7]:
total_user = rating[['userId']].drop_duplicates()
total_user.reset_index(drop=True,
                       inplace=True)
user_train, user_test = train_test_split(total_user,
                                         test_size=0.30,
                                         random_state=seed,
                                         shuffle=True)

Con el fin de no saturar los recursos computaciones que tenemos a disposición. Tomaremos el 50% de cada uno de esos conjuntos.

In [8]:
user_train_1, user_train_2 = train_test_split(user_train,
                                              test_size=0.50,
                                              random_state=seed,
                                              shuffle=True)
user_test_1, user_test_2 = train_test_split(user_test,
                                            test_size=0.50,
                                            random_state=seed,
                                            shuffle=True)

In [9]:
list_user_test_1=list(user_test_1['userId'])
list_user_train_2=list(user_train_2['userId'])

In [10]:
rating_redundant_test_1 = rating[rating['userId'].isin(list_user_test_1)]
rating_redundant_train_2 = rating[rating['userId'].isin(list_user_train_2)]

In [11]:
rating_redundant_test_1.sort_values(by='timestamp',
                                    ascending=False,
                                    inplace=True)
rating_redundant_test_1.reset_index(drop=True,
                                    inplace=True)

rating_redundant_train_2.sort_values(by='timestamp',
                                     ascending=False,
                                     inplace=True)
rating_redundant_train_2.reset_index(drop=True,
                                     inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [12]:
rating_redundant_test_1

Unnamed: 0,userId,movieId,rating,timestamp
0,89081,52458,4.0,2015-03-31 06:11:28
1,89081,55232,3.5,2015-03-31 06:11:26
2,107073,2959,3.0,2015-03-31 04:47:08
3,107073,527,5.0,2015-03-31 04:46:43
4,107073,3996,2.5,2015-03-31 04:46:13
...,...,...,...,...
3014372,91454,10,3.0,1996-02-19 14:49:12
3014373,91454,50,5.0,1996-02-19 14:49:12
3014374,91454,21,5.0,1996-02-19 14:49:09
3014375,91454,19,3.0,1996-02-19 14:49:08


In [13]:
rating_redundant_train_2

Unnamed: 0,userId,movieId,rating,timestamp
0,87586,7151,3.5,2015-03-31 06:40:02
1,16978,2093,3.5,2015-03-31 06:03:17
2,53930,118706,3.5,2015-03-31 06:00:51
3,16978,106642,3.0,2015-03-31 06:00:28
4,70232,58998,2.5,2015-03-31 05:55:28
...,...,...,...,...
6985821,85252,48,4.0,1996-01-29 00:00:00
6985822,85252,50,5.0,1996-01-29 00:00:00
6985823,85252,60,4.0,1996-01-29 00:00:00
6985824,85252,70,4.0,1996-01-29 00:00:00


In [14]:
len(rating_redundant_test_1['userId'].unique())

20774

In [15]:
len(rating_redundant_train_2['userId'].unique())

48473

In [16]:
len(rating_redundant_test_1['movieId'].unique())

20132

In [17]:
len(rating_redundant_train_2['movieId'].unique())

22540

In [85]:
path_test = 'rating_redundant_test_1.sav'
pickle.dump(rating_redundant_test_1, open(path_test, 'wb'))
path_train = 'rating_redundant_train_2.sav'
pickle.dump(rating_redundant_train_2, open(path_train, 'wb'))
# path_test = 'rating_redundant_test.sav'
# rating_redundant_test = pickle.load(open(path_test, 'rb'))
# path_train = 'rating_redundant_train.sav'
# rating_redundant_train = pickle.load(open(path_train, 'rb'))

# Codificación de la variable de interés.

In [19]:
logging.info('SE CODIFICA VARIABLE OBJETIVO.')
rating_redundant_test_1['high_rating'] = rating_redundant_test_1.apply(lambda row: 1 if row['rating'] >= 4 else 0,
                                                                       axis=1)
rating_redundant_train_2['high_rating'] = rating_redundant_train_2.apply(lambda row: 1 if row['rating'] >= 4 else 0,
                                                                         axis=1)
logging.info('¡LISTO!')      

2021-11-26 20:45:47 INFO: SE CODIFICA VARIABLE OBJETIVO.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rating_redundant_test_1['high_rating'] = rating_redundant_test_1.apply(lambda row: 1 if row['rating'] >= 4 else 0,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rating_redundant_train_2['high_rating'] = rating_redundant_train_2.apply(lambda row: 1 if row['rating'] >= 4 else 0,
2021-11-26 20:46:58 INFO: ¡LISTO!


In [20]:
logging.info('SE OBTIENE FECHA y AÑO.')
rating_redundant_test_1['time_day'] = rating_redundant_test_1.apply(lambda row: row['timestamp'].split()[0], axis=1)
rating_redundant_test_1['time_day'] = pd.to_datetime(rating_redundant_test_1['time_day'])
rating_redundant_test_1['timestamp'] = pd.to_datetime(rating_redundant_test_1['timestamp'])
rating_redundant_test_1['year'] = pd.DatetimeIndex(rating_redundant_test_1['time_day']).year
rating_redundant_train_2['time_day'] = rating_redundant_train_2.apply(lambda row: row['timestamp'].split()[0], axis=1)
rating_redundant_train_2['time_day'] = pd.to_datetime(rating_redundant_train_2['time_day'])
rating_redundant_train_2['timestamp'] = pd.to_datetime(rating_redundant_train_2['timestamp'])
rating_redundant_train_2['year'] = pd.DatetimeIndex(rating_redundant_train_2['time_day']).year
# rating['time_age'] = rating.apply(lambda row: row['timestamp'].split()[0].replace("-", " ").split()[0], axis=1)
logging.info('¡LISTO!')

2021-11-26 20:47:32 INFO: SE OBTIENE FECHA y AÑO.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rating_redundant_test_1['time_day'] = rating_redundant_test_1.apply(lambda row: row['timestamp'].split()[0], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rating_redundant_test_1['time_day'] = pd.to_datetime(rating_redundant_test_1['time_day'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_g

In [21]:
rating_redundant_train_2

Unnamed: 0,userId,movieId,rating,timestamp,high_rating,time_day,year
0,87586,7151,3.5,2015-03-31 06:40:02,0,2015-03-31,2015
1,16978,2093,3.5,2015-03-31 06:03:17,0,2015-03-31,2015
2,53930,118706,3.5,2015-03-31 06:00:51,0,2015-03-31,2015
3,16978,106642,3.0,2015-03-31 06:00:28,0,2015-03-31,2015
4,70232,58998,2.5,2015-03-31 05:55:28,0,2015-03-31,2015
...,...,...,...,...,...,...,...
6985821,85252,48,4.0,1996-01-29 00:00:00,1,1996-01-29,1996
6985822,85252,50,5.0,1996-01-29 00:00:00,1,1996-01-29,1996
6985823,85252,60,4.0,1996-01-29 00:00:00,1,1996-01-29,1996
6985824,85252,70,4.0,1996-01-29 00:00:00,1,1996-01-29,1996


In [22]:
rating_redundant_test_1

Unnamed: 0,userId,movieId,rating,timestamp,high_rating,time_day,year
0,89081,52458,4.0,2015-03-31 06:11:28,1,2015-03-31,2015
1,89081,55232,3.5,2015-03-31 06:11:26,0,2015-03-31,2015
2,107073,2959,3.0,2015-03-31 04:47:08,0,2015-03-31,2015
3,107073,527,5.0,2015-03-31 04:46:43,1,2015-03-31,2015
4,107073,3996,2.5,2015-03-31 04:46:13,0,2015-03-31,2015
...,...,...,...,...,...,...,...
3014372,91454,10,3.0,1996-02-19 14:49:12,0,1996-02-19,1996
3014373,91454,50,5.0,1996-02-19 14:49:12,1,1996-02-19,1996
3014374,91454,21,5.0,1996-02-19 14:49:09,1,1996-02-19,1996
3014375,91454,19,3.0,1996-02-19 14:49:08,0,1996-02-19,1996


Observemos el siguiente ejemplo de usuario que ha caído en el conjunto de entrenmaiento.

In [24]:
ind_userId=85252
data_ind_user_Id = rating_redundant_train_2[rating_redundant_train_2['userId']==ind_userId]
count_movies_ind_userId = data_ind_user_Id.shape[0]
display(data_ind_user_Id.head(10))
logging.info(f'EN TOTAL EL USUARIO {ind_userId} TIENE {count_movies_ind_userId}.')

Unnamed: 0,userId,movieId,rating,timestamp,high_rating,time_day,year
6443295,85252,1391,3.0,1996-12-14 14:26:30,0,1996-12-14,1996
6450988,85252,475,3.0,1996-12-11 12:00:43,0,1996-12-11,1996
6456484,85252,481,3.0,1996-12-08 14:16:09,0,1996-12-08,1996
6456485,85252,1374,3.0,1996-12-08 14:15:30,0,1996-12-08,1996
6463953,85252,1183,4.0,1996-12-03 15:04:29,1,1996-12-03,1996
6466285,85252,1367,3.0,1996-11-30 19:35:06,0,1996-11-30,1996
6482524,85252,1356,4.0,1996-11-23 19:48:04,1,1996-11-23,1996
6494140,85252,673,3.0,1996-11-18 14:26:26,0,1996-11-18,1996
6507330,85252,832,5.0,1996-11-10 11:44:09,1,1996-11-10,1996
6512864,85252,1059,4.0,1996-11-08 10:15:32,1,1996-11-08,1996


2021-11-26 20:50:32 INFO: EN TOTAL EL USUARIO 85252 TIENE 198.


Este pasado usuario ha reseñado en total 57 películas (aquí se muestran las primeras 10). Sin embargo, hay películas "muy similares entre sí", por lo que esta manera de usar la información podría dar cierta redundancia que podría incidir en el sobreajuste del modelo.

Siendo así, se agruparán las películas que han quedado en el conjunto de entrenamiento,

Este modo se tratar la data asegura no tener comportamiento de usuario heredados del conjunto de entrenamiento. Tambien se tiene lo siguiente

In [25]:
movies_in_test = list(rating_redundant_test_1['movieId'].unique())
movies_in_train = list(rating_redundant_train_2['movieId'].unique())
count_not_movies_in_train = len(set(movies_in_test) - set(movies_in_train))
logging.info(f'EN TOTAL HAY {len(movies_in_test)} PELÍCULAS EN EL CONJUNTO DE USUARIOS-TEST.')
logging.info(f'EN TOTAL HAY {len(movies_in_train)} PELÍCULAS EN EL CONJUNTO DE USUARIOS-TRAIN.')
logging.info(f'EN TOTAL HAY {count_not_movies_in_train} PELÍCULAS QUE NO ESTÁN EN EL CONJUNTO USUARIOS-TRAIN.')

2021-11-26 20:51:21 INFO: EN TOTAL HAY 20132 PELÍCULAS EN EL CONJUNTO DE USUARIOS-TEST.
2021-11-26 20:51:21 INFO: EN TOTAL HAY 22540 PELÍCULAS EN EL CONJUNTO DE USUARIOS-TRAIN.
2021-11-26 20:51:21 INFO: EN TOTAL HAY 1817 PELÍCULAS QUE NO ESTÁN EN EL CONJUNTO USUARIOS-TRAIN.


En en esta intersección de conjuntos de películas y usuarios que "no verá" el modelo donde debemos interesarnos en el comportamimiento del modelo.

# Clusterización de las películas 

In [26]:
interest_columns = ['movieId', 'genre_film-noir',
                    'genre_no genres listed', 'genre_drama',
                    'genre_mystery', 'genre_animation',
                    'genre_horror', 'genre_fantasy',
                    'genre_war', 'genre_crime', 'genre_comedy',
                    'genre_western', 'genre_adventure',
                    'genre_documentary', 'genre_imax',
                    'genre_action', 'genre_children',
                    'genre_musical', 'genre_thriller',
                    'genre_romance', 'genre_sci-fi'] + ['cluster']

In [27]:
movie_with_cluster = cluster_movies(data_movie=movie,
                                    n_clusters=15)

In [84]:
data_path='movie_with_cluster.sav'
pickle.dump(movie_with_cluster, open(data_path, 'wb'))

# Obtención de la base de entrenamiento definitiva

In [29]:
rating_train_2 = pd.merge(rating_redundant_train_2,
                          movie_with_cluster,
                          how="left",
                          on=["movieId"])
rating_train_2.reset_index(drop=True,
                           inplace=True)

In [30]:
rating_train_2

Unnamed: 0,userId,movieId,rating,timestamp,high_rating,time_day,year,title,genres,genres_list,...,genre_musical,genre_sci-fi,genre_animation,genre_fantasy,genre_children,genre_western,genre_war,genre_mystery,genre_film-noir,cluster
0,87586,7151,3.5,2015-03-31 06:40:02,0,2015-03-31,2015,Girl with a Pearl Earring (2003),Drama|Romance,"[drama, romance]",...,0,0,0,0,0,0,0,0,0,5
1,16978,2093,3.5,2015-03-31 06:03:17,0,2015-03-31,2015,Return to Oz (1985),Adventure|Children|Fantasy,"[adventure, children, fantasy]",...,0,0,0,1,1,0,0,0,0,1
2,53930,118706,3.5,2015-03-31 06:00:51,0,2015-03-31,2015,Black Sea (2014),Adventure|Thriller,"[adventure, thriller]",...,0,0,0,0,0,0,0,0,0,4
3,16978,106642,3.0,2015-03-31 06:00:28,0,2015-03-31,2015,"Day of the Doctor, The (2013)",Adventure|Drama|Sci-Fi,"[adventure, drama, sci-fi]",...,0,1,0,0,0,0,0,0,0,4
4,70232,58998,2.5,2015-03-31 05:55:28,0,2015-03-31,2015,Forgetting Sarah Marshall (2008),Comedy|Romance,"[comedy, romance]",...,0,0,0,0,0,0,0,0,0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6985821,85252,48,4.0,1996-01-29 00:00:00,1,1996-01-29,1996,Pocahontas (1995),Animation|Children|Drama|Musical|Romance,"[animation, children, drama, musical, romance]",...,1,0,1,0,1,0,0,0,0,1
6985822,85252,50,5.0,1996-01-29 00:00:00,1,1996-01-29,1996,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,"[crime, mystery, thriller]",...,0,0,0,0,0,0,0,1,0,7
6985823,85252,60,4.0,1996-01-29 00:00:00,1,1996-01-29,1996,"Indian in the Cupboard, The (1995)",Adventure|Children|Fantasy,"[adventure, children, fantasy]",...,0,0,0,1,1,0,0,0,0,1
6985824,85252,70,4.0,1996-01-29 00:00:00,1,1996-01-29,1996,From Dusk Till Dawn (1996),Action|Comedy|Horror|Thriller,"[action, comedy, horror, thriller]",...,0,0,0,0,0,0,0,0,0,2


In [31]:
rating_train_whithout_duplicate = rating_train_2.drop_duplicates(subset=['userId', 'high_rating', 'cluster'])

In [32]:
rating_train_whithout_duplicate

Unnamed: 0,userId,movieId,rating,timestamp,high_rating,time_day,year,title,genres,genres_list,...,genre_musical,genre_sci-fi,genre_animation,genre_fantasy,genre_children,genre_western,genre_war,genre_mystery,genre_film-noir,cluster
0,87586,7151,3.5,2015-03-31 06:40:02,0,2015-03-31,2015,Girl with a Pearl Earring (2003),Drama|Romance,"[drama, romance]",...,0,0,0,0,0,0,0,0,0,5
1,16978,2093,3.5,2015-03-31 06:03:17,0,2015-03-31,2015,Return to Oz (1985),Adventure|Children|Fantasy,"[adventure, children, fantasy]",...,0,0,0,1,1,0,0,0,0,1
2,53930,118706,3.5,2015-03-31 06:00:51,0,2015-03-31,2015,Black Sea (2014),Adventure|Thriller,"[adventure, thriller]",...,0,0,0,0,0,0,0,0,0,4
3,16978,106642,3.0,2015-03-31 06:00:28,0,2015-03-31,2015,"Day of the Doctor, The (2013)",Adventure|Drama|Sci-Fi,"[adventure, drama, sci-fi]",...,0,1,0,0,0,0,0,0,0,4
4,70232,58998,2.5,2015-03-31 05:55:28,0,2015-03-31,2015,Forgetting Sarah Marshall (2008),Comedy|Romance,"[comedy, romance]",...,0,0,0,0,0,0,0,0,0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6985753,124035,73,4.0,1996-02-01 14:34:07,1,1996-02-01,1996,"Misérables, Les (1995)",Drama|War,"[drama, war]",...,0,0,0,0,0,0,1,0,0,10
6985763,124035,24,3.0,1996-02-01 14:33:54,0,1996-02-01,1996,Powder (1995),Drama|Sci-Fi,"[drama, sci-fi]",...,0,1,0,0,0,0,0,0,0,4
6985774,124035,22,4.0,1996-02-01 14:33:44,1,1996-02-01,1996,Copycat (1995),Crime|Drama|Horror|Mystery|Thriller,"[crime, drama, horror, mystery, thriller]",...,0,0,0,0,0,0,0,1,0,7
6985784,124035,61,4.0,1996-02-01 14:33:34,1,1996-02-01,1996,Eye for an Eye (1996),Drama|Thriller,"[drama, thriller]",...,0,0,0,0,0,0,0,0,0,0


In [33]:
data_ind = rating_train_whithout_duplicate[rating_train_whithout_duplicate['userId']==87586][['userId', 'timestamp', 'movieId', 'rating', 'high_rating', 'title', 'genres', 'genres_list', 'cluster']]
data_ind

Unnamed: 0,userId,timestamp,movieId,rating,high_rating,title,genres,genres_list,cluster
0,87586,2015-03-31 06:40:02,7151,3.5,0,Girl with a Pearl Earring (2003),Drama|Romance,"[drama, romance]",5
1039,87586,2015-03-30 04:13:43,92259,4.5,1,Intouchables (2011),Comedy|Drama,"[comedy, drama]",12
1041,87586,2015-03-30 04:11:36,97304,4.5,1,Argo (2012),Drama|Thriller,"[drama, thriller]",0
1042,87586,2015-03-30 04:11:16,104841,5.0,1,Gravity (2013),Action|Sci-Fi|IMAX,"[action, sci-fi, imax]",2
1043,87586,2015-03-30 04:10:54,107141,3.5,0,Saving Mr. Banks (2013),Comedy|Drama,"[comedy, drama]",12
1044,87586,2015-03-30 04:08:07,64957,4.0,1,"Curious Case of Benjamin Button, The (2008)",Drama|Fantasy|Mystery|Romance,"[drama, fantasy, mystery, romance]",13
1045,87586,2015-03-30 04:07:48,80463,4.5,1,"Social Network, The (2010)",Drama,[drama],9
1046,87586,2015-03-30 04:07:20,55247,4.5,1,Into the Wild (2007),Action|Adventure|Drama,"[action, adventure, drama]",1
1047,87586,2015-03-30 04:06:33,106782,4.0,1,"Wolf of Wall Street, The (2013)",Comedy|Crime|Drama,"[comedy, crime, drama]",3
1048,87586,2015-03-30 04:05:17,64614,4.5,1,Gran Torino (2008),Crime|Drama,"[crime, drama]",7


In [34]:
data_ind['cluster'].value_counts()

5     2
12    2
0     2
2     2
13    2
9     2
1     2
3     2
7     2
4     2
10    2
11    2
6     2
8     2
14    2
Name: cluster, dtype: int64

In [40]:
data_ind[data_ind['cluster']==11]

Unnamed: 0,userId,timestamp,movieId,rating,high_rating,title,genres,genres_list,cluster
2637,87586,2015-03-29 03:59:08,113938,4.0,1,Nixon by Nixon: In His Own Words (2014),Documentary,[documentary],11
2649,87586,2015-03-29 03:49:37,117533,1.0,0,Citizenfour (2014),Documentary,[documentary],11


In [41]:
rating_train_whithout_duplicate

Unnamed: 0,userId,movieId,rating,timestamp,high_rating,time_day,year,title,genres,genres_list,...,genre_musical,genre_sci-fi,genre_animation,genre_fantasy,genre_children,genre_western,genre_war,genre_mystery,genre_film-noir,cluster
0,87586,7151,3.5,2015-03-31 06:40:02,0,2015-03-31,2015,Girl with a Pearl Earring (2003),Drama|Romance,"[drama, romance]",...,0,0,0,0,0,0,0,0,0,5
1,16978,2093,3.5,2015-03-31 06:03:17,0,2015-03-31,2015,Return to Oz (1985),Adventure|Children|Fantasy,"[adventure, children, fantasy]",...,0,0,0,1,1,0,0,0,0,1
2,53930,118706,3.5,2015-03-31 06:00:51,0,2015-03-31,2015,Black Sea (2014),Adventure|Thriller,"[adventure, thriller]",...,0,0,0,0,0,0,0,0,0,4
3,16978,106642,3.0,2015-03-31 06:00:28,0,2015-03-31,2015,"Day of the Doctor, The (2013)",Adventure|Drama|Sci-Fi,"[adventure, drama, sci-fi]",...,0,1,0,0,0,0,0,0,0,4
4,70232,58998,2.5,2015-03-31 05:55:28,0,2015-03-31,2015,Forgetting Sarah Marshall (2008),Comedy|Romance,"[comedy, romance]",...,0,0,0,0,0,0,0,0,0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6985753,124035,73,4.0,1996-02-01 14:34:07,1,1996-02-01,1996,"Misérables, Les (1995)",Drama|War,"[drama, war]",...,0,0,0,0,0,0,1,0,0,10
6985763,124035,24,3.0,1996-02-01 14:33:54,0,1996-02-01,1996,Powder (1995),Drama|Sci-Fi,"[drama, sci-fi]",...,0,1,0,0,0,0,0,0,0,4
6985774,124035,22,4.0,1996-02-01 14:33:44,1,1996-02-01,1996,Copycat (1995),Crime|Drama|Horror|Mystery|Thriller,"[crime, drama, horror, mystery, thriller]",...,0,0,0,0,0,0,0,1,0,7
6985784,124035,61,4.0,1996-02-01 14:33:34,1,1996-02-01,1996,Eye for an Eye (1996),Drama|Thriller,"[drama, thriller]",...,0,0,0,0,0,0,0,0,0,0


In [82]:
data_path='rating_train_whithout_duplicate.sav'
pickle.dump(rating_train_whithout_duplicate, open(data_path, 'wb'))

# Obtención de la base de prueba definitiva

La base de datos definitva serán los usurios de `rating_redundant_train_2` cuyas películas no estén en la base de entrenamiento.

In [76]:
movies_not_in_train = list(set(movies_in_test) - set(movies_in_train))

In [77]:
user_for_movies_not_in_train = list(rating_redundant_test_1[rating_redundant_test_1['movieId'].isin(movies_not_in_train)]['userId'])

In [78]:
data_test = rating_redundant_test_1[rating_redundant_test_1['userId'].isin(user_for_movies_not_in_train)]

In [79]:
data_test

Unnamed: 0,userId,movieId,rating,timestamp,high_rating,time_day,year
0,89081,52458,4.0,2015-03-31 06:11:28,1,2015-03-31,2015
1,89081,55232,3.5,2015-03-31 06:11:26,0,2015-03-31,2015
22,30317,1927,5.0,2015-03-31 04:01:14,1,2015-03-31,2015
24,79366,45183,3.5,2015-03-31 03:39:41,0,2015-03-31,2015
25,102853,118924,3.0,2015-03-31 03:20:13,0,2015-03-31,2015
...,...,...,...,...,...,...,...
2682997,79366,32,4.0,1997-09-15 21:13:53,1,1997-09-15,1997
2682998,79366,800,4.0,1997-09-15 21:13:16,1,1997-09-15,1997
2682999,79366,36,4.0,1997-09-15 21:12:45,1,1997-09-15,1997
2683000,79366,260,5.0,1997-09-15 21:11:57,1,1997-09-15,1997


In [80]:
data_test_with_movies_not_in_train = pd.merge(data_test,
                                              movie_with_cluster,
                                              how="left",
                                              on=["movieId"])
data_test_with_movies_not_in_train.reset_index(drop=True,
                                               inplace=True)

In [81]:
data_test_with_movies_not_in_train

Unnamed: 0,userId,movieId,rating,timestamp,high_rating,time_day,year,title,genres,genres_list,...,genre_musical,genre_sci-fi,genre_animation,genre_fantasy,genre_children,genre_western,genre_war,genre_mystery,genre_film-noir,cluster
0,89081,52458,4.0,2015-03-31 06:11:28,1,2015-03-31,2015,Disturbia (2007),Drama|Thriller,"[drama, thriller]",...,0,0,0,0,0,0,0,0,0,0
1,89081,55232,3.5,2015-03-31 06:11:26,0,2015-03-31,2015,Resident Evil: Extinction (2007),Action|Horror|Sci-Fi|Thriller,"[action, horror, sci-fi, thriller]",...,0,1,0,0,0,0,0,0,0,2
2,30317,1927,5.0,2015-03-31 04:01:14,1,2015-03-31,2015,All Quiet on the Western Front (1930),Action|Drama|War,"[action, drama, war]",...,0,0,0,0,0,0,1,0,0,10
3,79366,45183,3.5,2015-03-31 03:39:41,0,2015-03-31,2015,"Protector, The (a.k.a. Warrior King) (Tom yum ...",Action|Comedy|Crime|Thriller,"[action, comedy, crime, thriller]",...,0,0,0,0,0,0,0,0,0,2
4,102853,118924,3.0,2015-03-31 03:20:13,0,2015-03-31,2015,Top Five (2014),Comedy,[comedy],...,0,0,0,0,0,0,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251918,79366,32,4.0,1997-09-15 21:13:53,1,1997-09-15,1997,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller,"[mystery, sci-fi, thriller]",...,0,1,0,0,0,0,0,1,0,0
251919,79366,800,4.0,1997-09-15 21:13:16,1,1997-09-15,1997,Lone Star (1996),Drama|Mystery|Western,"[drama, mystery, western]",...,0,0,0,0,0,1,0,1,0,8
251920,79366,36,4.0,1997-09-15 21:12:45,1,1997-09-15,1997,Dead Man Walking (1995),Crime|Drama,"[crime, drama]",...,0,0,0,0,0,0,0,0,0,7
251921,79366,260,5.0,1997-09-15 21:11:57,1,1997-09-15,1997,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,"[action, adventure, sci-fi]",...,0,1,0,0,0,0,0,0,0,1


In [83]:
data_path='data_test_with_movies_not_in_train.sav'
pickle.dump(data_test_with_movies_not_in_train, open(data_path, 'wb'))