In [24]:
from getpass import getpass
from pathlib import Path

import great_expectations as gx
import numpy as np
import pandas as pd
from sqlalchemy import create_engine, text as sql_text
from sqlalchemy.engine import URL
from sqlalchemy.schema import CreateSchema
from ydata_profiling import ProfileReport

In [2]:
# Ask for the password without echoing it into the notebook output.
passwd = getpass('Digite a senha do banco de dados: ')
# Build the URL from its components so that special characters in the
# password (e.g. '@', '/', ':') are escaped instead of corrupting the
# connection string.
db_url = URL.create(
    drivername='postgresql',
    username='postgres',
    password=passwd,
    host='localhost',
    database='analytics_eng',
)
engine = create_engine(db_url)

## Carrega os dados bronze

In [3]:
# Load the whole bronze table into a DataFrame. The statement is wrapped in
# sqlalchemy.text() so it is an executable SQL object on SQLAlchemy 2.x
# connections regardless of the installed pandas version.
query = "SELECT * FROM bronze.raw_metadata;"
with engine.connect() as connection:
    df_bronze = pd.read_sql(sql_text(query), connection)

In [36]:
# Preview the first rows of the frame loaded from bronze.raw_metadata.
df_bronze.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [5]:
# List the columns available in the bronze frame.
df_bronze.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [17]:
# Work on a separate copy so later cleaning does not modify df_bronze.
df_cln = df_bronze.copy()

### Colunas importantes

In [18]:
# Columns kept for cleaning. Compared with the bronze frame this leaves out
# belongs_to_collection, genres, poster_path, production_companies,
# production_countries and spoken_languages.
colunas_importantes = ['adult', 'budget', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'release_date', 'revenue', 'runtime', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count']
# Select from the untouched bronze frame and take an explicit copy: the cell
# stays re-runnable even after columns are dropped from df_cln, and later
# column assignments do not trigger SettingWithCopyWarning.
df_cln = df_bronze[colunas_importantes].copy()

In [19]:
def _to_bool(val):
    """Map one raw cell to True/False, or NaN when it is not a recognisable boolean."""
    # bool must be tested before the numeric branch: bool is a subclass of int.
    if isinstance(val, (bool, np.bool_)):
        return bool(val)
    if isinstance(val, str):
        token = val.strip().lower()
        if token == 'true':
            return True
        if token == 'false':
            return False
        return np.nan
    # Accept 0/1 flags; NaN and any other number fall through to "missing".
    if isinstance(val, (int, float, np.integer, np.floating)) and val in (0, 1):
        return bool(val)
    return np.nan


def clean_data(df):
    """Return a type-normalised copy of the movie metadata frame.

    The input frame is left untouched; all conversions happen on a copy.

    Conversions applied:
      * ``adult`` and ``video``: parsed to real booleans. Strings are matched
        case-insensitively against "true"/"false"; anything unrecognised
        (including missing values) becomes NaN. A plain ``astype(bool)`` is
        deliberately avoided because it turns the string ``'False'`` and NaN
        into ``True``.
      * ``budget``, ``revenue``, ``runtime``, ``vote_average``, ``vote_count``,
        ``popularity``: converted with ``pd.to_numeric``; unparseable values
        become NaN.
      * ``release_date``: converted with ``pd.to_datetime``; unparseable values
        become NaT.
      * ``homepage``, ``tagline``, ``overview``: missing markers (e.g. ``None``)
        are normalised to NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the converted columns.

    Raises
    ------
    KeyError
        If one of the expected columns is absent.
    """
    # Copy first so the caller's frame is never mutated (this also avoids
    # SettingWithCopyWarning when a column subset is passed in).
    df = df.copy()

    for col in ('adult', 'video'):
        df[col] = [_to_bool(val) for val in df[col]]

    num_cols = ['budget', 'revenue', 'runtime', 'vote_average', 'vote_count', 'popularity']
    for col in num_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')

    text_cols = ['homepage', 'tagline', 'overview']
    for col in text_cols:
        # Only unifies the missing-value marker (None -> NaN); text is unchanged.
        df[col] = df[col].fillna(np.nan)

    return df


In [20]:
# Apply the boolean / numeric / date conversions defined in clean_data.
df_cln = clean_data(df_cln)


In [21]:
# Preview the first rows of the cleaned frame.
df_cln.head()

Unnamed: 0,adult,budget,homepage,id,imdb_id,original_language,original_title,overview,popularity,release_date,revenue,runtime,status,tagline,title,video,vote_average,vote_count
0,False,30000000.0,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,1995-10-30,373554033.0,81.0,Released,,Toy Story,False,7.7,5415.0
1,False,65000000.0,,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,1995-12-15,262797249.0,104.0,Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,0.0,,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,1995-12-22,0.0,101.0,Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,16000000.0,,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,1995-12-22,81452156.0,127.0,Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,0.0,,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,1995-02-10,76578911.0,106.0,Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


### Relatório com informações do perfil dos dados a partir da biblioteca "ydata_profiling"

In [22]:
# Profile the cleaned frame and export the HTML report. The output path is
# relative to the kernel's working directory so the cell does not depend on
# one specific user's home directory.
report_path = Path("metadata_silver.html")
profile = ProfileReport(df_cln, title="Pandas Profiling Report")
profile.to_file(report_path)

(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'could not convert string to float: 'Released'')
  annotation = ("{:" + self.fmt + "}").format(val)
(using `df.profile_report(missing_diagrams={"Heatmap": False}`)
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'could not convert string to float: '--'')
Summarize dataset: 100%|██████████| 64/64 [00:05<00:00, 12.07it/s, Completed]               
Generate report structure: 100%|██████████| 1/1 [00:04<00:00,  4.18s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  2.08it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 215.88it/s]


### Removendo dados duplicados

In [25]:
# Remove fully duplicated rows and report the row count before and after.
rows_before = len(df_cln)
df_cln = df_cln.drop_duplicates()
rows_after = len(df_cln)
print("Antes: ", rows_before)
print("Depois: ", rows_after)

Antes:  45466
Depois:  45449


### Eliminando as colunas com muitos dados faltando (mais de 50%)

In [30]:
# Fraction of rows (0.0-1.0) whose `homepage` value is missing.
df_cln['homepage'].isnull().sum() / len(df_cln)

0.8288191159321437

In [32]:
# Collect (and print) every column whose share of missing values exceeds `limit`.
limit = 0.5
missing_share = df_cln.isnull().sum() / len(df_cln)
list_1 = []
for col, perc in missing_share.items():
    if perc > limit:
        list_1.append(col)
        print(col, perc)

homepage 0.8288191159321437
tagline 0.5509912209289534


In [33]:
# Remove the columns collected in list_1 (those above the missing-value limit).
df_cln = df_cln.drop(columns=list_1)

### Eliminando linhas com dados faltando

In [37]:
# Drop every row that still has a missing value in any remaining column.
# Reassign instead of mutating in place, matching the drop_duplicates / drop
# cells of this notebook.
df_cln = df_cln.dropna()