In [13]:
import os
import pandas as pd
import pandas_gbq as pd_gbq
from google.cloud import bigquery
from google.cloud.exceptions import NotFound

#### Approach
- Define jsons containing parameters for every table which needs to be created in GCP. Parameters include project name, dataset naming, table naming, schema definition & csv location

- Create list including all jsons

- Loop through jsons and create dataset and table for each dataset/table

In [23]:
table_cln_listings = {
      'project':'smgmaxschlafli'
    , 'dataset_location':'EU'
    , 'dataset_id':"smgmaxschlafli.cln_listings"
    , 'table_id':f"smgmaxschlafli.cln_listings.cln_listings"
    , 'table_scheme':[
        bigquery.SchemaField("listing_id", "INTEGER", mode="REQUIRED"),
        bigquery.SchemaField("price", "FLOAT64", mode="REQUIRED"),
        bigquery.SchemaField("listing_date_key", "DATE", mode="REQUIRED"),
        bigquery.SchemaField("platform_id", "INTEGER", mode="REQUIRED"),
        bigquery.SchemaField("product_type_id", "INTEGER", mode="REQUIRED"),
        bigquery.SchemaField("status_id", "INTEGER", mode="REQUIRED"),
        bigquery.SchemaField("user_id", "INTEGER", mode="REQUIRED"),
        bigquery.SchemaField("creation_date", "DATE", mode="REQUIRED"),
        bigquery.SchemaField("last_update_date", "DATE", mode="NULLABLE"),  
    ]
    , 'csv_location':'GDT – BI Dev Challenge/challenge_A_task_02_dataset/challenge_A_task_02_star_schema_dataset/cln_listings.csv'
}

table_fct_listings = {
      'project':'smgmaxschlafli'
    , 'dataset_location':'EU'
    , 'dataset_id':"smgmaxschlafli.fct_listings"
    , 'table_id':f"smgmaxschlafli.fct_listings.fct_listings"
    , 'table_scheme':[
        bigquery.SchemaField("listing_id", "INTEGER", mode="REQUIRED"),
        bigquery.SchemaField("price", "FLOAT64", mode="REQUIRED"),
        bigquery.SchemaField("valid_from", "DATE", mode="REQUIRED"),
        bigquery.SchemaField("valid_to", "DATE", mode="NULLABLE"),
        bigquery.SchemaField("listing_date_key", "DATE", mode="REQUIRED"),  
        bigquery.SchemaField("platform_id", "INTEGER", mode="REQUIRED"),
        bigquery.SchemaField("product_type_id", "INTEGER", mode="REQUIRED"),
        bigquery.SchemaField("status_id", "INTEGER", mode="REQUIRED"),
        bigquery.SchemaField("user_id", "INTEGER", mode="REQUIRED"),
    ]
    , 'csv_location':'GDT – BI Dev Challenge/challenge_A_task_02_dataset/challenge_A_task_02_star_schema_dataset/fct_listings_clean.csv'
}

table_dim_platform = {
      'project':'smgmaxschlafli'
    , 'dataset_location':'EU'
    , 'dataset_id':"smgmaxschlafli.dim_platform"
    , 'table_id':f"smgmaxschlafli.dim_platform.dim_platform"
    , 'table_scheme':[
        bigquery.SchemaField("platform_id", "INTEGER", mode="REQUIRED"),
        bigquery.SchemaField("platform", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("valid_from", "DATE", mode="REQUIRED"),
        bigquery.SchemaField("valid_to", "DATE", mode="NULLABLE"),
    ]
    , 'csv_location':'GDT – BI Dev Challenge/challenge_A_task_02_dataset/challenge_A_task_02_star_schema_dataset/dim_platform.csv'
}


table_dim_dates = {
      'project':'smgmaxschlafli'
    , 'dataset_location':'EU'
    , 'dataset_id':"smgmaxschlafli.dim_date"
    , 'table_id':f"smgmaxschlafli.dim_date.dim_date"
    , 'table_scheme':[
        bigquery.SchemaField("date_key", "DATE", mode="REQUIRED"),
        bigquery.SchemaField("day", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("week", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("month", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("quarter", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("year", "INTEGER", mode="REQUIRED"),
        bigquery.SchemaField("weekday", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("month_text", "STRING", mode="REQUIRED"), ]
    , 'csv_location':'GDT – BI Dev Challenge/challenge_A_task_02_dataset/challenge_A_task_02_star_schema_dataset/dim_date.csv'
}


table_product_type = {
      'project':'smgmaxschlafli'
    , 'dataset_location':'EU'
    , 'dataset_id':"smgmaxschlafli.dim_product_type"
    , 'table_id':f"smgmaxschlafli.dim_product_type.dim_product_type"
    , 'table_scheme':[
        bigquery.SchemaField("product_type_id", "INTEGER", mode="REQUIRED"),
        bigquery.SchemaField("product_type", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("product_type_tags", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("product_type_tags_weight_kg", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("product_type_tags_color", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("valid_from", "DATE", mode="REQUIRED"),
        bigquery.SchemaField("valid_to", "DATE", mode="NULLABLE"),
    ]
    , 'csv_location':'GDT – BI Dev Challenge/challenge_A_task_02_dataset/challenge_A_task_02_star_schema_dataset/dim_product_type.csv'
}


table_dim_status = {
      'project':'smgmaxschlafli'
    , 'dataset_location':'EU'
    , 'dataset_id':"smgmaxschlafli.dim_status"
    , 'table_id':f"smgmaxschlafli.dim_status.dim_status"
    , 'table_scheme':[
        bigquery.SchemaField("Active", "INTEGER", mode="REQUIRED"),
        bigquery.SchemaField("status", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("valid_from", "DATE", mode="REQUIRED"),
        bigquery.SchemaField("valid_to", "DATE", mode="NULLABLE"),
    ]
    , 'csv_location':'GDT – BI Dev Challenge/challenge_A_task_02_dataset/challenge_A_task_02_star_schema_dataset/dim_status.csv'
}


table_dim_user = {
      'project':'smgmaxschlafli'
    , 'dataset_location':'EU'
    , 'dataset_id':"smgmaxschlafli.dim_user"
    , 'table_id':f"smgmaxschlafli.dim_user.dim_user"
    , 'table_scheme':[
        bigquery.SchemaField("user_id", "INTEGER", mode="REQUIRED"),
        bigquery.SchemaField("location", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("location_city", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("location_country", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("valid_from", "DATE", mode="REQUIRED"),
        bigquery.SchemaField("valid_to", "DATE", mode="NULLABLE"),
    ]
    , 'csv_location':'GDT – BI Dev Challenge/challenge_A_task_02_dataset/challenge_A_task_02_star_schema_dataset/dim_user.csv'
}


tables = [
    table_cln_listings
  , table_fct_listings
  , table_dim_platform
  , table_dim_dates
  , table_product_type
  , table_dim_status
  , table_dim_user
]

In [25]:
for table_params in tables:    
    project = table_params['project']
    dataset_id = table_params['dataset_id']
    dataset = bigquery.Dataset(dataset_id)
    dataset.location = table_params['dataset_location']
    table_schema = table_params['table_scheme']
    table_id = table_params['table_id']
    csv_location = table_params['csv_location']
    
    try:
        dataset = client.create_dataset(dataset, timeout=30)  # Make an API request.
        print(f"Created dataset {dataset_id}")
    except Exception as e:
        print(e)

    # Create Table
    table = bigquery.Table(table_id, schema=table_schema)
    try:
        table = client.create_table(table)  
        print(f"Created table {table_id}")
    except Exception as e:
        print(e)

    # Populate Table
    df = pd.read_csv(csv_location)
    try:
        pd_gbq.to_gbq(df, table_id, project_id=project, api_method='load_csv')
        print(f"Populated table {dataset_id}.{table_id}")
    except Exception as e:
        print(e)

409 POST https://bigquery.googleapis.com/bigquery/v2/projects/smgmaxschlafli/datasets?prettyPrint=false: Already Exists: Dataset smgmaxschlafli:cln_listings
Created table smgmaxschlafli.cln_listings.cln_listings


1it [00:04,  4.54s/it]


Populated table smgmaxschlafli.cln_listings.smgmaxschlafli.cln_listings.cln_listings
Created dataset smgmaxschlafli.fct_listings
Created table smgmaxschlafli.fct_listings.fct_listings


1it [00:03,  3.22s/it]


Populated table smgmaxschlafli.fct_listings.smgmaxschlafli.fct_listings.fct_listings
Created dataset smgmaxschlafli.dim_platform
Created table smgmaxschlafli.dim_platform.dim_platform


1it [00:03,  3.43s/it]


Populated table smgmaxschlafli.dim_platform.smgmaxschlafli.dim_platform.dim_platform
Created dataset smgmaxschlafli.dim_date
Created table smgmaxschlafli.dim_date.dim_date


1it [00:02,  2.13s/it]


Populated table smgmaxschlafli.dim_date.smgmaxschlafli.dim_date.dim_date
Created dataset smgmaxschlafli.dim_product_type
Created table smgmaxschlafli.dim_product_type.dim_product_type


1it [00:02,  2.60s/it]


Populated table smgmaxschlafli.dim_product_type.smgmaxschlafli.dim_product_type.dim_product_type
Created dataset smgmaxschlafli.dim_status
Created table smgmaxschlafli.dim_status.dim_status


1it [00:00,  1.79it/s]


Reason: Provided Schema does not match Table smgmaxschlafli:dim_status.dim_status. Field Active is missing in new schema
Created dataset smgmaxschlafli.dim_user
Created table smgmaxschlafli.dim_user.dim_user


1it [00:02,  2.78s/it]

Populated table smgmaxschlafli.dim_user.smgmaxschlafli.dim_user.dim_user



