## Cargamos las Librerías

In [1]:
import pandas as pd
import sqlalchemy as db
from sqlalchemy import text
import pandas as pd

## Iniciamos la Conexión a "db_movies_netflix_transact"

In [2]:
engine = db.create_engine("mysql://root:root@172.16.5.4:3310/db_movies_netflix_transact")
conn = engine.connect()

## Cargamos datos a la dimension Movie

Escribimos la query para la base de datos.

In [3]:
query = """
SELECT 
    movie.movieID as movieID, movie.movieTitle as title, movie.releaseDate as releaseDate, 
    gender.name as gender , person.name as participantName, participant.participantRole as roleparticipant 
FROM movie 
INNER JOIN participant 
ON movie.movieID=participant.movieID
INNER JOIN person
ON person.personID = participant.personID
INNER JOIN movie_gender 
ON movie.movieID = movie_gender.movieID
INNER JOIN gender 
ON movie_gender.genderID = gender.genderID
"""

Leemos la base de datos utilizando la conexión y la query anteriores.

In [4]:
movies_data=pd.read_sql(query, con=conn) 
movies_data.head()

Unnamed: 0,movieID,title,releaseDate,gender,participantName,roleparticipant
0,80192187,Triple Frontier,2019-04-12,Action,Joseph Chavez Pineda,Actor
1,80210920,The Mother,2023-01-05,Drama,Maria Alejandra Navarro,Actor
2,81157374,Run,2021-05-21,Adventure,aria Lopez Gutierrez,Director


A continuación modificamos el tipo de data de la columna "movieID" a int para facilitar la manipulación de la información.

In [5]:
movies_data["movieID"]=movies_data["movieID"].astype('int')
movies_data.head()

Unnamed: 0,movieID,title,releaseDate,gender,participantName,roleparticipant
0,80192187,Triple Frontier,2019-04-12,Action,Joseph Chavez Pineda,Actor
1,80210920,The Mother,2023-01-05,Drama,Maria Alejandra Navarro,Actor
2,81157374,Run,2021-05-21,Adventure,aria Lopez Gutierrez,Director


Para insertar información en la dimensión Movie, leemos un archivo externo "Awards_movie.csv" y arreglamos la información que contiene.

In [6]:
movies_award=pd.read_csv("./data/Awards_movie.csv")
movies_award["movieID"]=movies_award["movieID"].astype('int')
movies_award.rename(columns={"Aware":"Award"}, inplace=True)
movies_award.head()

Unnamed: 0,movieID,IdAward,Award
0,80210920,0,Oscar
1,81157374,1,Grammy
2,80192187,2,Oscar


Luego, creamos la tabla que será la dimensión Movie, dentro del data warehouse, haciendo un merge entre las tablas movies_data y movies_award.

In [7]:
movie_data=pd.merge(movies_data,movies_award, left_on="movieID", right_on="movieID")
movie_data.head()

Unnamed: 0,movieID,title,releaseDate,gender,participantName,roleparticipant,IdAward,Award
0,80192187,Triple Frontier,2019-04-12,Action,Joseph Chavez Pineda,Actor,2,Oscar
1,80210920,The Mother,2023-01-05,Drama,Maria Alejandra Navarro,Actor,0,Oscar
2,81157374,Run,2021-05-21,Adventure,aria Lopez Gutierrez,Director,1,Grammy


A continuación, cambiamos la conexión a la data warehouse "dw_netflix".

In [8]:
engine = db.create_engine("mysql://root:root@172.16.5.4:3310/dw_netflix")
conn = engine.connect()

Renombramos las llaves "releaseDate" y "Award" para que coincidan con la dimensión Movie del data warehouse.

In [9]:
movie_data = movie_data.rename(columns={'releaseDate': 'releaseMovie', 'Award': 'awardMovie'})
movie_data.head()

Unnamed: 0,movieID,title,releaseMovie,gender,participantName,roleparticipant,IdAward,awardMovie
0,80192187,Triple Frontier,2019-04-12,Action,Joseph Chavez Pineda,Actor,2,Oscar
1,80210920,The Mother,2023-01-05,Drama,Maria Alejandra Navarro,Actor,0,Oscar
2,81157374,Run,2021-05-21,Adventure,aria Lopez Gutierrez,Director,1,Grammy


Eliminamos una columna que no será necesaria.

In [10]:
movie_data = movie_data.drop(columns=['IdAward'])
movie_data.head()

Unnamed: 0,movieID,title,releaseMovie,gender,participantName,roleparticipant,awardMovie
0,80192187,Triple Frontier,2019-04-12,Action,Joseph Chavez Pineda,Actor,Oscar
1,80210920,The Mother,2023-01-05,Drama,Maria Alejandra Navarro,Actor,Oscar
2,81157374,Run,2021-05-21,Adventure,aria Lopez Gutierrez,Director,Grammy


Actualizamos la dimensión Movie de la data warehouse con la tabla que creamos "movie_data", haciendo uso del método .to_sql()

Donde los parámetros son: 
1) El nombre de la dimensión en el data warehouse "dimMovie"
2) La conexión conn
3) El parámetro if_exists="append" - Agrega la información de "movie_data" a "dimMovie", en el caso en el que ya exista dicha dimensión.
4) El parámetro index=False - Elimina la columna de índices, ya que no será necesaria.

In [11]:
movie_data.to_sql('dimMovie', conn, if_exists='append', index=False)

IntegrityError: (MySQLdb.IntegrityError) (1062, "Duplicate entry '80192187' for key 'dimMovie.PRIMARY'")
[SQL: INSERT INTO `dimMovie` (`movieID`, title, `releaseMovie`, gender, `participantName`, roleparticipant, `awardMovie`) VALUES (%s, %s, %s, %s, %s, %s, %s)]
[parameters: [(80192187, 'Triple Frontier', datetime.date(2019, 4, 12), 'Action', 'Joseph Chavez Pineda', 'Actor', 'Oscar'), (80210920, 'The Mother', datetime.date(2023, 1, 5), 'Drama', 'Maria Alejandra Navarro', 'Actor', 'Oscar'), (81157374, 'Run', datetime.date(2021, 5, 21), 'Adventure', 'aria Lopez Gutierrez', 'Director', 'Grammy')]]
(Background on this error at: https://sqlalche.me/e/20/gkpj)

## Cargamos datos a la dimension USER

Procedemos a leer un archivo externo users.csv

In [12]:
users = pd.read_csv("./data/users.csv", sep='|')
users

Unnamed: 0,idUser,username,country,subscription
0,1002331,user123,USA,Premium
1,1002332,gamerGirl97,Canada,Basic
2,1002333,techMaster,UK,Premium
3,1002334,soccerFan,Brazil,Basic
4,1002335,travelBug,Australia,Premium
5,1002336,musicLover,France,Basic
6,1002337,foodie88,Italy,Premium
7,1002338,bookWorm23,Germany,Basic
8,1002339,fitnessJunk,Mexico,Premium
9,10023310,movieBuff,Japan,Basic


In [13]:
users = users.rename(columns={'idUser': 'userID'})
users


Unnamed: 0,userID,username,country,subscription
0,1002331,user123,USA,Premium
1,1002332,gamerGirl97,Canada,Basic
2,1002333,techMaster,UK,Premium
3,1002334,soccerFan,Brazil,Basic
4,1002335,travelBug,Australia,Premium
5,1002336,musicLover,France,Basic
6,1002337,foodie88,Italy,Premium
7,1002338,bookWorm23,Germany,Basic
8,1002339,fitnessJunk,Mexico,Premium
9,10023310,movieBuff,Japan,Basic


In [14]:
users.to_sql('dimUser',conn,if_exists='append', index=False)

20

# Cargamos datos a la tabla de hechos

In [15]:
users_id=users["userID"]
movies_id=movies_data["movieID"]

In [16]:
watchs_data=pd.merge(users_id,movies_id, how="cross")
watchs_data

Unnamed: 0,userID,movieID
0,1002331,80192187
1,1002331,80210920
2,1002331,81157374
3,1002332,80192187
4,1002332,80210920
5,1002332,81157374
6,1002333,80192187
7,1002333,80210920
8,1002333,81157374
9,1002334,80192187


In [17]:
import random
from datetime import datetime, timedelta
import random

def gen_rating():
    # Generar un número aleatorio entre 0 y 5 con 1 solo decimal
    numero_aleatorio = round(random.uniform(0, 5), 1)
    # Mostrar el número aleatorio
    return numero_aleatorio

def gen_timestamp():
    # Generar un timestamp aleatorio dentro de un rango específico
    start_date = datetime(2024, 1, 15)
    end_date = datetime(2024, 4, 6)

    # Calcular un valor aleatorio entre start_date y end_date
    random_date = start_date + timedelta(seconds=random.randint(0, int((end_date - start_date).total_seconds())))

    # Mostrar el timestamp aleatorio
    return random_date

In [18]:
watchs_data["rating"]=watchs_data["movieID"].apply(lambda x: gen_rating())
watchs_data["timestamp"]=watchs_data["userID"].apply(lambda x: gen_timestamp())

In [19]:
watchs_data

Unnamed: 0,userID,movieID,rating,timestamp
0,1002331,80192187,3.9,2024-03-28 03:00:09
1,1002331,80210920,1.8,2024-03-11 03:08:17
2,1002331,81157374,2.0,2024-01-20 06:34:24
3,1002332,80192187,3.5,2024-04-01 05:51:39
4,1002332,80210920,1.3,2024-03-14 19:30:02
5,1002332,81157374,4.9,2024-02-25 20:16:54
6,1002333,80192187,0.3,2024-03-30 16:16:28
7,1002333,80210920,2.1,2024-03-24 20:39:18
8,1002333,81157374,1.0,2024-01-16 01:15:18
9,1002334,80192187,4.0,2024-03-25 00:47:13


In [20]:
# Cargamos tabla de hechos
watchs_data.to_sql("FactWatchs", conn, if_exists='append', index=False)

60

In [21]:
### SET FOREIGN_KEY_CHECKS = 0;
### SET FOREIGN_KEY_CHECKS = 1;

