In [1]:
# CONECTAR CON DRIVE
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import sys
path ='/content/drive/MyDrive/cod/LEA3_Marketing'
os.chdir(path) ## volver la carpeta de repositorio directorio de trabajo
sys.path.append(path) ## agregarla al path, para leer archivos propios como paquetes

In [2]:
import numpy as np
import pandas as pd
import sqlite3 as sql
from sklearn.preprocessing import MinMaxScaler
from ipywidgets import interact ## para análisis interactivo
from sklearn import neighbors ### basado en contenido un solo producto consumido
import joblib
#### conectar_base_de_Datos
#!pip install ipywidgets

# CREAR CONEXIÓN CON LA BASE DE DATOS db_movies
con = sql.connect('data/db_movies')

# CREAR EL CURSOR
cur = con.cursor() ## se crea el cursor, que es el otro tipo de conexión para ejecutar las consultas

In [5]:
# VERIFICAR LOS NOMBRES DE TODAS LAS TABLAS QUE HAY EN LA BASE DE DATOS
cur.execute(""" select name from sqlite_master where type= 'table'  """)
cur.fetchall()

[('ratings',),
 ('movies',),
 ('usuarios_selectos',),
 ('Pelis_selectas',),
 ('ratings_final',),
 ('movies_final',),
 ('full_ratings',)]

In [6]:
db_movies = pd.read_sql('SELECT * FROM movies', con)
df_delete = pd.read_sql(
    """SELECT * FROM movies
    LEFT JOIN ratings ON movies.movieId = ratings.movieId
    WHERE ratings.rating IS NULL""", con)

In [7]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from mlxtend.preprocessing import TransactionEncoder

# Paso 0: df_delete ya obtenido previamente con SQL

# Función 1: separar géneros y convertir a binario con TransactionEncoder
def split_and_encode_genres(df):
    genres = df['genres'].str.split('|')
    te = TransactionEncoder()
    genres_bin = te.fit_transform(genres)
    genres_df = pd.DataFrame(genres_bin, columns=te.columns_)

    # Eliminar "(no genres listed)" si existe
    if '(no genres listed)' in genres_df.columns:
        valid_rows = ~genres_df['(no genres listed)'] # La virgulilla me convierte lo TRUE en FALSE y viceversa
        df = df.loc[valid_rows].reset_index(drop=True) # Filtro por las columnas que si tienen genero
        genres_df = genres_df.loc[valid_rows].drop(columns='(no genres listed)').reset_index(drop=True)

    # Eliminar columna original 'genres' y unir los géneros codificados
    df = df.drop(columns='genres').reset_index(drop=True) # Elimina la columna original 'genres' del df
    return pd.concat([df, genres_df], axis=1)

# Función 2: eliminar registros que estén en df_delete
def remove_unrated_movies(df):
    return df[~df['movieId'].isin(df_delete['movieId'])].reset_index(drop=True)

# Función 3: extraer título y año
def extract_title_and_year(df):
    year = df['title'].str.extract(r'\((\d{4})\)$')
    year.columns = ['year']
    title = df['title'].str.replace(r'\s*\(\d{4}\)$', '', regex=True)
    title.name = 'title'
    df = df.drop(columns='title')
    df = pd.concat([df.reset_index(drop=True), title.reset_index(drop=True), year.reset_index(drop=True)], axis=1)
    return df

# Función 4: eliminar registros con year == NaN
def remove_nan_years(df):
    return df[df['year'].notna()].reset_index(drop=True)

# Función 5: Reordenar columnas para que 'title' y 'year' estén después de 'movieId'
def reorder_columns(df):
    cols = list(df.columns)
    if 'movieId' in cols and 'title' in cols and 'year' in cols:
        cols.remove('title')
        cols.remove('year')
        insert_pos = cols.index('movieId') + 1
        cols[insert_pos:insert_pos] = ['title', 'year']
    return df[cols]

# Construcción del pipeline
pipeline = Pipeline(steps=[
    ('genres_transform', FunctionTransformer(split_and_encode_genres, validate=False)),
    ('remove_unrated', FunctionTransformer(remove_unrated_movies, validate=False)),
    ('extract_title_year', FunctionTransformer(extract_title_and_year, validate=False)),
    ('remove_nan_years', FunctionTransformer(remove_nan_years, validate=False)),
    ('reorder_columns', FunctionTransformer(reorder_columns, validate=False))  # Nuevo paso
])

# Aplicar el pipeline
db_movies_final = pipeline.fit_transform(db_movies)

In [8]:
db_movies_final


Unnamed: 0,movieId,title,year,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story,1995,False,True,True,True,True,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2,Jumanji,1995,False,True,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,3,Grumpier Old Men,1995,False,False,False,False,True,False,False,...,False,False,False,False,False,True,False,False,False,False
3,4,Waiting to Exhale,1995,False,False,False,False,True,False,False,...,False,False,False,False,False,True,False,False,False,False
4,5,Father of the Bride Part II,1995,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9688,193581,Black Butler: Book of the Atlantic,2017,True,False,True,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
9689,193583,No Game No Life: Zero,2017,False,False,True,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
9690,193585,Flint,2017,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9691,193587,Bungo Stray Dogs: Dead Apple,2018,True,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [10]:
joblib.dump(db_movies_final,"Salidas\\db_movies_final.joblib") ### para utilizar en segundos modelos


['Salidas\\db_movies_final.joblib']

In [39]:
db_movies_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9693 entries, 0 to 9692
Data columns (total 22 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   movieId      9693 non-null   int64 
 1   title        9693 non-null   object
 2   year         9693 non-null   object
 3   Action       9693 non-null   bool  
 4   Adventure    9693 non-null   bool  
 5   Animation    9693 non-null   bool  
 6   Children     9693 non-null   bool  
 7   Comedy       9693 non-null   bool  
 8   Crime        9693 non-null   bool  
 9   Documentary  9693 non-null   bool  
 10  Drama        9693 non-null   bool  
 11  Fantasy      9693 non-null   bool  
 12  Film-Noir    9693 non-null   bool  
 13  Horror       9693 non-null   bool  
 14  IMAX         9693 non-null   bool  
 15  Musical      9693 non-null   bool  
 16  Mystery      9693 non-null   bool  
 17  Romance      9693 non-null   bool  
 18  Sci-Fi       9693 non-null   bool  
 19  Thriller     9693 non-null 

In [4]:
# Ajustar el ancho máximo de las columnas
pd.set_option('display.max_columns', None)  # Muestra todas las columnas
pd.set_option('display.width', 1000)  # Aumenta el ancho de la línea

In [31]:
cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cur.fetchall())

[('ratings',), ('movies',), ('db_movies',), ('db_movies_final',)]


## TOP 10 Peliculas con mejores calificaciones y mas calificadas

###  Se usa pesos para dar a los mas calificados mejor puntuacion asi como a los mejor calificados mas puntuacion, entonces una pelicula con muchas calificaciones y bien calificadas tendra gran oportunidad de hacer parte de este top 10 de popularidad

In [14]:
# Calcular el promedio de calificación (C) y el percentil 75 de cantidad de calificaciones (m)

# Primero, el conteo de rantings
rating_counts_df = pd.read_sql("""
    SELECT COUNT(*) AS rating_count
    FROM ratings
    GROUP BY movieId
""", con)

# Calificacion promedio / rating promedio
avg_rating_df = pd.read_sql("""
    SELECT AVG(rating) AS C
    FROM ratings
""", con)

# Extraccion de calificación promedio
C = avg_rating_df['C'].iloc[0]

# calculo del percentil 75.
m = rating_counts_df['rating_count'].quantile(0.75)

print(f"Rating promedio (C): {C}")
print(f"percentil 75 del conteo de calificaciones (m): {m}")


Rating promedio (C): 3.501556983616962
percentil 75 del conteo de calificaciones (m): 9.0


In [26]:
## se prepara el query donde  calcula la calificacion ponderada dando uso a los productos mas calificados y con mejor calificacion.
query = f"""
    SELECT m.title,
           AVG(r.rating) AS avg_rating,
           COUNT(*) AS rating_count,
           -- Calcular rating con peso por vista y calificación
           ((COUNT(*) / (COUNT(*) + {m})) * AVG(r.rating) +
           ({m} / (COUNT(*) + {m})) * {C}) AS weighted_rating
    FROM ratings r
    JOIN db_movies_final m ON r.movieId = m.movieId
    WHERE r.rating > 0
    GROUP BY m.title
    HAVING rating_count > 30
    ORDER BY weighted_rating DESC
    LIMIT 10
"""
df = pd.read_sql(query, con)
df

Unnamed: 0,title,avg_rating,rating_count,weighted_rating
0,"Shawshank Redemption, The",4.429022,317,4.403417
1,"Godfather, The",4.289062,192,4.253801
2,Fight Club,4.272936,218,4.242352
3,"Godfather: Part II, The",4.25969,129,4.210246
4,"Usual Suspects, The",4.237745,204,4.206639
5,Star Wars: Episode IV - A New Hope,4.231076,251,4.205823
6,Dr. Strangelove or: How I Learned to Stop Worr...,4.268041,97,4.202962
7,Goodfellas,4.25,126,4.200104
8,Schindler's List,4.225,220,4.196568
9,"Dark Knight, The",4.238255,149,4.196291


In [None]:
## Pendiente de revisar ya sea mejor almacenando la consulta en un dataframe para manipular

#### los mejores calificados por año publicacion ###
pd.read_sql("""
  SELECT
      SUBSTR(m.title, LENGTH(m.title) - 5, 6) AS movie_year,  -- Extrae las últimas 4 cifras (año)
      SUBSTR(m.title, 1, LENGTH(m.title) - 7) AS movie_name,  -- Extrae el nombre quitando el año
      MAX(r.rating) AS best_rating
  FROM movies m
  JOIN ratings r ON m.movieId = r.movieId
  GROUP BY movie_year
  ORDER BY movie_year;"""
    ,con)

In [29]:
pd.read_sql("""
    SELECT
        m.year AS movie_year,
        m.title AS movie_name,
        MAX(r.rating) AS best_rating
    FROM db_movies_final m
    JOIN ratings r ON m.movieId = r.movieId
    GROUP BY movie_year
    ORDER BY movie_year;
""", con)

Unnamed: 0,movie_year,movie_name,best_rating
0,1902,"Trip to the Moon, A (Voyage dans la lune, Le)",4.5
1,1903,The Great Train Robbery,4.0
2,1908,The Electric Hotel,4.0
3,1915,"Birth of a Nation, The",2.0
4,1916,"20,000 Leagues Under the Sea",4.0
...,...,...,...
101,2014,Divergent,5.0
102,2015,Mission: Impossible - Rogue Nation,5.0
103,2016,Deadpool,5.0
104,2017,Black Panther,5.0


In [None]:
movies=pd.read_sql("""select * from movies""", con)


In [None]:
sep=movies['title'].str.split('(')

year=sep.str[1].str.replace(')','')
title=sep.str[0]

In [None]:
movies['year'] = movies['title'].str.extract(r'\((\d{4})\)$')
movies['title_clean'] = movies['title'].str.replace(r'\s*\(\d{4}\)$', '', regex=True)

In [None]:
movies.drop(columns=['title'], inplace=True)
movies.rename(columns={'title_clean': 'title'}, inplace=True)

In [None]:
movies

Unnamed: 0,movieId,genres,year,title
0,1,Adventure|Animation|Children|Comedy|Fantasy,1995,Toy Story
1,2,Adventure|Children|Fantasy,1995,Jumanji
2,3,Comedy|Romance,1995,Grumpier Old Men
3,4,Comedy|Drama|Romance,1995,Waiting to Exhale
4,5,Comedy,1995,Father of the Bride Part II
...,...,...,...,...
9737,193581,Action|Animation|Comedy|Fantasy,2017,Black Butler: Book of the Atlantic
9738,193583,Animation|Comedy|Fantasy,2017,No Game No Life: Zero
9739,193585,Drama,2017,Flint
9740,193587,Action|Animation,2018,Bungo Stray Dogs: Dead Apple


In [None]:
#######################################################################
######## 2.1 Sistema de recomendación basado en contenido un solo producto - Manual ########
#######################################################################

movies_2=pd.read_sql('select * from movies_final', con )

movies_2.info()
movies_2['año_estreno']=movies_2.año_estreno.astype('int')
movies_2.info()

##### escalar para que año esté en el mismo rango ###

sc=MinMaxScaler()
movies_2[["year_sc"]]=sc.fit_transform(movies_2[['year_pub']])



## eliminar filas que no se van a utilizar ###

movies_2_dum1=movies_2.drop(columns=['isbn','i_url','year_pub','book_title']) ## ingresar las columnas a quitar ###

#### convertir a dummies

movies_2_dum1['book_author'].nunique()
movies_2_dum1['publisher'].nunique()

col_dum=['book_author','publisher']
movies_2_dum2=pd.get_dummies(movies_2_dum1,columns=col_dum)
movies_2_dum2.shape

joblib.dump(movies_2_dum2,"salidas\\movies_2_dum2.joblib") ### para utilizar en segundos modelos


