In [3]:
from getpass import getpass

import great_expectations as gx
import numpy as np
import pandas as pd
from sqlalchemy import create_engine, text as sql_text
from sqlalchemy.engine import URL
from sqlalchemy.schema import CreateSchema
from ydata_profiling import ProfileReport

from utils import *

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [4]:
# Create the Great Expectations data context used by the validation cells below.
context = gx.get_context()

In [7]:
# Prompt for the password without echoing it into the notebook output.
passwd = getpass('Digite a senha do banco de dados: ')

# Build the URL from its parts so that special characters in the password
# (e.g. '@', '/', ':') are escaped instead of corrupting the connection string.
engine = create_engine(
    URL.create(
        'postgresql',
        username='postgres',
        password=passwd,
        host='localhost',
        port=5433,
        database='analytics_eng',
    )
)

## Carrega os dados bronze

In [8]:
# Load the whole bronze table into a DataFrame.
query = "SELECT * FROM bronze.raw_metadata;"
with engine.connect() as connection:
    # Wrap the raw SQL in sql_text(): a plain str is not accepted as an
    # executable by SQLAlchemy 2.x connections when used through older pandas,
    # while a text() clause works across versions.
    df_bronze = pd.read_sql(sql_text(query), connection)

In [9]:
# Wrap the bronze frame with Great Expectations' gx.from_pandas helper.
# NOTE(review): the silver validation further down goes through
# context.sources.add_pandas(...) instead — confirm this wrapper is still needed.
df_bronze = gx.from_pandas(df_bronze)

In [10]:
# Preview the first rows of the bronze table.
df_bronze.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [11]:
# List the column names available in the bronze table.
df_bronze.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [12]:
# NOTE(review): `profile` is referenced without being called, so this cell only
# displays the bound method's repr rather than a profiling result — confirm the
# intended call (and its arguments) before relying on this cell.
df_bronze.profile

<bound method DataAsset.profile of        adult                              belongs_to_collection    budget  \
0      False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   
1      False                                               None  65000000   
2      False  {'id': 119050, 'name': 'Grumpy Old Men Collect...         0   
3      False                                               None  16000000   
4      False  {'id': 96871, 'name': 'Father of the Bride Col...         0   
...      ...                                                ...       ...   
45461  False                                               None         0   
45462  False                                               None         0   
45463  False                                               None         0   
45464  False                                               None         0   
45465  False                                               None         0   

                                        

In [13]:
# Start the cleaning stage from a copy of the bronze frame.
df_cln = df_bronze.copy()

### Colunas importantes

In [14]:
# Columns kept for the silver table. Compared with the bronze columns this
# leaves out belongs_to_collection, genres, poster_path, production_companies,
# production_countries and spoken_languages.
colunas_importantes = [
    'adult',
    'budget',
    'homepage',
    'id',
    'imdb_id',
    'original_language',
    'original_title',
    'overview',
    'popularity',
    'release_date',
    'revenue',
    'runtime',
    'status',
    'tagline',
    'title',
    'video',
    'vote_average',
    'vote_count',
]
df_cln = df_cln.loc[:, colunas_importantes]

In [15]:
def clean_data(df):
    """Normalize the dtypes of the selected movie-metadata columns.

    The input frame is not modified; all work happens on a copy.

    Transformations:
      * ``adult`` and ``video``: real booleans are kept, the strings
        'True'/'true'/'False'/'false' are parsed, and every other value
        (including missing values) becomes NaN so it can be dropped later.
      * ``budget``, ``revenue``, ``runtime``, ``vote_average``,
        ``vote_count``, ``popularity``: converted with ``pd.to_numeric``;
        unparseable values become NaN.
      * ``release_date``: converted with ``pd.to_datetime``; unparseable
        values become NaT.
      * ``homepage``, ``tagline``, ``overview``: missing markers are
        normalized to NaN (columns that are absent are skipped).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the boolean, numeric and date columns
        listed above.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy of ``df``.

    Raises
    ------
    KeyError
        If a required boolean, numeric or date column is missing.
    """

    def _to_bool(val):
        # Accept Python and NumPy booleans alike; a strict type() check would
        # reject numpy.bool_ values stored in object columns.
        if isinstance(val, (bool, np.bool_)):
            return bool(val)
        if isinstance(val, str):
            if val in ('True', 'true'):
                return True
            if val in ('False', 'false'):
                return False
        # Anything else (None, NaN, junk text, numbers) is treated as missing.
        return np.nan

    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    # astype(bool) would turn the string 'False' and NaN into True and None
    # into False, so both flag columns go through the same explicit parser.
    for flag_col in ('adult', 'video'):
        df[flag_col] = [_to_bool(val) for val in df[flag_col]]

    num_cols = ['budget', 'revenue', 'runtime', 'vote_average', 'vote_count', 'popularity']
    for col in num_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')

    # Filling with NaN does not add information; it only rewrites other
    # missing markers (such as None) as NaN in the free-text columns.
    text_cols = ['homepage', 'tagline', 'overview']
    df = df.fillna({col: np.nan for col in text_cols})

    return df


In [16]:
# Apply the cleaning step; df_cln now refers to the frame returned by clean_data.
df_cln = clean_data(df_cln)


In [17]:
# Preview the first rows after cleaning.
df_cln.head()

Unnamed: 0,adult,budget,homepage,id,imdb_id,original_language,original_title,overview,popularity,release_date,revenue,runtime,status,tagline,title,video,vote_average,vote_count
0,False,30000000.0,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,1995-10-30,373554033.0,81.0,Released,,Toy Story,False,7.7,5415.0
1,False,65000000.0,,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,1995-12-15,262797249.0,104.0,Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,0.0,,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,1995-12-22,0.0,101.0,Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,16000000.0,,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,1995-12-22,81452156.0,127.0,Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,0.0,,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,1995-02-10,76578911.0,106.0,Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


### Removendo dados duplicados

In [18]:
# Drop rows that are duplicated across all columns and report the row count
# before and after.
print("Antes: ", len(df_cln))
df_cln = df_cln.drop_duplicates()
print("Depois: ", len(df_cln))

Antes:  45466
Depois:  45449


### Eliminando as colunas com muitos dados faltando (mais que 50%)

In [19]:
# Inspect the homepage column.
df_cln['homepage']

0        http://toystory.disney.com/toy-story
1                                         NaN
2                                         NaN
3                                         NaN
4                                         NaN
                         ...                 
45461    http://www.imdb.com/title/tt6209470/
45462                                     NaN
45463                                     NaN
45464                                     NaN
45465                                     NaN
Name: homepage, Length: 45449, dtype: object

In [20]:
# Collect the columns whose share of missing values exceeds `limit`,
# printing each offending column together with its share.
limit = 0.5
missing_share = df_cln.isnull().sum() / len(df_cln)

list_1 = []
for col in df_cln.columns:
    perc = missing_share[col]
    if not perc > limit:
        continue
    list_1.append(col)
    print(col, perc)

homepage 0.8288191159321437
tagline 0.5509912209289534


In [21]:
# Remove the columns collected in list_1 (those above the missing-value limit).
df_cln = df_cln.drop(list_1, axis=1)

### Eliminando linhas com dados faltando

In [22]:
# Keep only the rows that have no missing value in any remaining column.
df_cln = df_cln.dropna()

# Tabela Silver

In [23]:
# Take a copy of the cleaned frame as the silver table.
df_silver = df_cln.copy()

In [24]:
# Register a pandas datasource named "silver_df_source" on the context and read
# df_silver through it (asset name "default"); the returned object is used
# below to declare expectations.
# NOTE(review): add_pandas with a fixed name may fail if this cell is re-run
# against the same context — confirm.
silver_validator = context.sources.add_pandas("silver_df_source").read_dataframe(df_silver,asset_name="default")


In [25]:
# Expected silver columns, in order (the bronze selection minus homepage and
# tagline, which were dropped for having too many missing values).
expected_columns = ['adult', 'budget', 'id', 'imdb_id', 'original_language', 'original_title', 'overview', 'popularity', 'release_date', 'revenue', 'runtime', 'status', 'title', 'video', 'vote_average', 'vote_count']
silver_validator.expect_table_columns_to_match_ordered_list(expected_columns)
# The boolean flag columns may only contain True or False.
silver_validator.expect_column_values_to_be_in_set('adult', [True, False])
silver_validator.expect_column_values_to_be_in_set('video', [True, False])
# These numeric columns are expected to be floats.
silver_validator.expect_column_values_to_be_of_type('runtime', 'float')
silver_validator.expect_column_values_to_be_of_type('vote_average', 'float')
silver_validator.expect_column_values_to_be_of_type('vote_count', 'float')
# NOTE(review): the allowed sets below are built from df_silver itself, so these
# two expectations cannot fail on this data; they only pin the currently
# observed values. Consider a fixed reference list instead.
silver_validator.expect_column_values_to_be_in_set('original_language', list(df_silver['original_language'].unique()))
silver_validator.expect_column_values_to_be_in_set('status', list(df_silver['status'].unique()))

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 44332,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [49]:
# Save the validator's expectation suite, keeping expectations even if they
# failed (discard_failed_expectations=False).
silver_validator.save_expectation_suite(discard_failed_expectations=False) 

In [51]:
# List the expectation suites known to the context.
context.list_expectation_suite_names()

['default']

In [53]:
# Create (or update) the checkpoint "primeiro_checkpoint" from the silver validator.
checkpoint = context.add_or_update_checkpoint(name="primeiro_checkpoint", validator=silver_validator)

In [54]:
# Run the checkpoint and keep its result.
checkpoint_result = checkpoint.run()

Calculating Metrics:   0%|          | 0/31 [00:00<?, ?it/s]

In [55]:
# NOTE(review): this repeats the save_expectation_suite call made a few cells
# earlier with the same arguments — confirm whether both are needed.
silver_validator.save_expectation_suite(discard_failed_expectations=False)

In [56]:
# NOTE(review): second run of the same checkpoint; it overwrites the previous
# checkpoint_result — confirm whether the earlier run is still needed.
checkpoint_result = checkpoint.run()

Calculating Metrics:   0%|          | 0/31 [00:00<?, ?it/s]

In [58]:
# Build the Data Docs site; the call returns a mapping of site name to index URL.
# NOTE(review): the recorded output points into a temporary directory, so the
# generated docs are presumably not persisted between sessions — confirm.
context.build_data_docs()

{'local_site': 'file:///var/folders/b0/yrq36_n517vgnb8j824rt8br0000gn/T/tmplctgc7us/index.html'}

## Cria um novo Schema para as tabelas Silver

In [59]:
# Create the "silver" schema on the database behind `engine`.
# create_schemas is not defined in this notebook (presumably it comes from the
# `utils` star import — verify).
create_schemas(engine, "silver")

Schema created successfully.


## Salvando os dados na tabela Silver

In [60]:
# Write the silver frame to the table silver.metadata_imdb, replacing any
# existing table and leaving the DataFrame index out.
df_silver.to_sql('metadata_imdb', engine, schema='silver', if_exists='replace', index=False)

332