In [1]:
# Step 1 - import packages and get environment variables

from sqlalchemy import create_engine
import dotenv
import os
import pandas as pd
from sqlalchemy import create_engine
import warnings

# Load .env
dotenv.load_dotenv()

DB_PASS = os.getenv('DB_PASS')

In [2]:
# Step 2 - jupyter notebook configuration

# Warnings ignore
warnings.simplefilter('ignore')

# Show complete dataset in
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Matplotlib
%matplotlib inline

In [3]:
# Step 3 - import csvs
actor=pd.read_csv('./data/actor.csv')
category=pd.read_csv('./data/category.csv')
film=pd.read_csv('./data/film.csv')
inventory=pd.read_csv('./data/inventory.csv')
language=pd.read_csv('./data/language.csv')
rental=pd.read_csv('./data/rental.csv')
film_actor=pd.read_csv('./data/old_HDD.csv')

In [4]:
# Step 4 - Check if dataframes have null values
nan_cols_actor=actor.isna().sum()
print("\nACTOR NULL COLUMNS:")
print(nan_cols_actor[nan_cols_actor>0])

nan_cols_category=category.isna().sum()
print("\nCATEGORY NULL COLUMNS:")
print(nan_cols_category[nan_cols_category>0])

nan_cols_film=film.isna().sum()
print("\nFILM NULL COLUMNS:")
print(nan_cols_film[nan_cols_film>0])

nan_cols_inventory=inventory.isna().sum()
print("\nINVENTORY NULL COLUMNS:")
print(nan_cols_inventory[nan_cols_inventory>0])

nan_cols_language=language.isna().sum()
print("\nLANGUAGE NULL COLUMNS:")
print(nan_cols_language[nan_cols_language>0])

nan_cols_rental=rental.isna().sum()
print("\nRENTAL NULL COLUMNS")
print(nan_cols_rental[nan_cols_rental>0])

nan_cols_film_actor=film_actor.isna().sum()
print("\nfilm_actor NULL COLUMNS")
print(nan_cols_film_actor[nan_cols_film_actor>0])


ACTOR NULL COLUMNS:
Series([], dtype: int64)

CATEGORY NULL COLUMNS:
Series([], dtype: int64)

FILM NULL COLUMNS:
original_language_id    1000
dtype: int64

INVENTORY NULL COLUMNS:
Series([], dtype: int64)

LANGUAGE NULL COLUMNS:
Series([], dtype: int64)

RENTAL NULL COLUMNS
Series([], dtype: int64)

film_actor NULL COLUMNS
Series([], dtype: int64)


In [5]:
# Step 5 - Delete original_language_id col and all "last_update" columns
film.drop("original_language_id", axis=1, inplace=True)

actor.drop("last_update", axis=1, inplace=True)
category.drop("last_update", axis=1, inplace=True)
film.drop("last_update", axis=1, inplace=True)
inventory.drop("last_update", axis=1, inplace=True)
language.drop("last_update", axis=1, inplace=True)
rental.drop("last_update", axis=1, inplace=True)

In [6]:
# Step 6 - Check if there are duplicated rows
print(actor.duplicated().sum())
print(category.duplicated().sum())
print(film.duplicated().sum())
print(inventory.duplicated().sum())
print(language.duplicated().sum())
print(rental.duplicated().sum())
print(film_actor.duplicated().sum())

0
0
0
0
0
0
0


In [7]:
# Step 7 - Check if there are duplicated rows in some specific subsets
# In order to make the relations with old_HDD.csv data

# 7.1 - It's necessary to check in the film's dataframe that all films titles are unique
print(film.duplicated(subset=['title']).sum())

# 7.2 - It's necessary to check in actor's dataframe that the tupla (first_name-last_name) are unique
print(actor.duplicated(subset=['first_name', 'last_name']).sum())

0
1


In [8]:
# Step 8 - There is one duplicated in the tupla (first_name-last_name) of actor's dataframe.
# We need to delete it, in order to not have ambiguous relations.
actor.drop_duplicates(subset =["first_name", "last_name"], inplace = True)

print(actor.duplicated(subset=['first_name', 'last_name']).sum())

0


In [9]:
# Step 9 - Check if there are more than one category from film
print(film_actor.groupby('title')['category_id'].transform('nunique').max())

1


In [10]:
# Step 10 - There is not more than one category from film.
# For that reason, we could join film dataframe with film_actor dataframe by title...
film = film.set_index('title').join(film_actor[["title", "category_id"]].set_index('title'), how='left')

In [11]:
# Step 11 - ...and then drop duplicates
film.drop_duplicates(inplace = True)

In [12]:
# Step 12 - There are some films with category and for this reason we created an "Uncategorized" category
new = pd.DataFrame({'category_id': [17], 'name' : ['Uncategorized']})
category = pd.concat([category, new], ignore_index = True, axis = 0)

In [13]:
# Set 13 - Fill all NaN "category_id" fields with the "Uncategorized" id created in the last step
film.category_id.fillna(17, inplace=True)

In [14]:
# Step 14 - Reset index
film.reset_index(inplace=True)

In [15]:
# Step 15 - Transform "category_id" type to integer
film.category_id = film.category_id.astype('int')

In [16]:
# Step 16 - Join
film_actor = film_actor.set_index('title').join(film[["film_id", "title"]].set_index('title'), how='left')

In [17]:
# Step 17 - Reset index
film_actor.reset_index(inplace=True)

In [18]:
# Step 18 - Join
film_actor = film_actor[["first_name", "last_name", "film_id"]].set_index(["first_name", "last_name"]).join(actor[["actor_id", "first_name", "last_name"]].set_index(["first_name", "last_name"]), how='left')

In [19]:
# Step 19 - Reset index
film_actor.reset_index(drop=True, inplace=True)

In [20]:
# Step 20 - Create MySQL engine
str_conn='mysql+pymysql://root:' + DB_PASS + '@localhost:3306'
motor=create_engine(str_conn)

In [26]:
# Step 21 - Create database
create_db = 'CREATE DATABASE IF NOT EXISTS apps;'
motor.execute(create_db)

ResourceClosedError: This result object does not return rows. It has been closed automatically.

In [22]:
# Step 21 - Use database
use_db = 'use apps;'
motor.execute(use_db)
# db=conn.connect(host='localhost', user='root', passwd='password', database='productos')
# cursor=db.cursor()

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7f88f0aecd90>

In [23]:
create_table_category = '''
    CREATE TABLE IF NOT EXISTS category (
        category_id int primary key,
        name varchar(100)
    );
'''
motor.execute(create_table_category)

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7f8900168190>

In [24]:
for i in range(len(category)):
    insert_query=f"insert into category ({','.join(category.columns)}) values {tuple(category.iloc[i].values)};"
    motor.execute(insert_query)

In [25]:
film

Unnamed: 0,title,film_id,description,release_year,language_id,rental_duration,rental_rate,length,replacement_cost,rating,special_features,category_id
0,ACADEMY DINOSAUR,1,A Epic Drama of a Feminist And a Mad Scientist...,2006,1,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",6
1,ACE GOLDFINGER,2,A Astounding Epistle of a Database Administrat...,2006,1,3,4.99,48,12.99,G,"Trailers,Deleted Scenes",11
2,ADAPTATION HOLES,3,A Astounding Reflection of a Lumberjack And a ...,2006,1,7,2.99,50,18.99,NC-17,"Trailers,Deleted Scenes",6
3,AFFAIR PREJUDICE,4,A Fanciful Documentary of a Frisbee And a Lumb...,2006,1,5,2.99,117,26.99,G,"Commentaries,Behind the Scenes",17
4,AFRICAN EGG,5,A Fast-Paced Documentary of a Pastry Chef And ...,2006,1,6,2.99,130,22.99,G,Deleted Scenes,17
5,AGENT TRUMAN,6,A Intrepid Panorama of a Robot And a Boy who m...,2006,1,3,2.99,169,17.99,PG,Deleted Scenes,9
6,AIRPLANE SIERRA,7,A Touching Saga of a Hunter And a Butler who m...,2006,1,6,4.99,62,28.99,PG-13,"Trailers,Deleted Scenes",17
7,AIRPORT POLLOCK,8,A Epic Tale of a Moose And a Girl who must Con...,2006,1,6,4.99,54,15.99,R,Trailers,17
8,ALABAMA DEVIL,9,A Thoughtful Panorama of a Database Administra...,2006,1,3,2.99,114,21.99,PG-13,"Trailers,Deleted Scenes",11
9,ALADDIN CALENDAR,10,A Action-Packed Tale of a Man And a Lumberjack...,2006,1,6,4.99,63,24.99,NC-17,"Trailers,Deleted Scenes",15
