In [1]:
import json
import pandas as pd
import re
from google.cloud import bigquery
from google.oauth2 import service_account
import os
from datetime import datetime

In [2]:
# Read the crawl dump. The parsed JSON is iterated as a sequence of pages,
# each of which is iterated as a sequence of car listings.
with open("df_all_brands_data_cat_1.json", mode="r", encoding="utf-8") as json_file:
    data = json.load(json_file)

# Flatten page -> listing into a single list of records, then build the
# DataFrame once (one row per listing).
df_data_all_car_brands = pd.DataFrame(
    [listing for page in data for listing in page]
)

# Step 14: Clean the data
# The patterns are raw strings so that `\(`, `\s` and `\d` reach the regex
# engine verbatim instead of being treated as (invalid) string escapes.
_LEISTUNG_PS_PATTERN = re.compile(r"(?<=\().*(?=\sPS)")
_DIGITS_PATTERN = re.compile(r"\d+")
# `[A-Za-z]`, not `[A-za-z]`: the latter also matches the ASCII characters
# that sit between "Z" and "a" ("[", "\", "]", "^", "_", "`").
_STANDORT_PREFIX_PATTERN = re.compile(r"[A-Za-z]+(?=-)")


def _parse_leistung(x):
    """Return the text between "(" and " PS" as an int, with dots removed.

    Missing values (None/NaN) are passed through unchanged. Raises IndexError
    when the value does not contain the expected "(... PS" fragment.
    """
    if pd.isna(x):
        return x
    return int(_LEISTUNG_PS_PATTERN.findall(x)[0].replace(".", ""))


def _digits_to_int(x):
    """Concatenate every digit run in ``x`` and return it as an int.

    Missing values (None/NaN) are passed through unchanged. Raises ValueError
    when the value contains no digits at all.
    """
    if pd.isna(x):
        return x
    return int("".join(_DIGITS_PATTERN.findall(x)))


def _to_int(x):
    """Convert ``x`` with ``int()``; missing values (None/NaN) pass through."""
    if pd.isna(x):
        return x
    return int(x)


def _parse_standort(x):
    """Return the first run of ASCII letters that is directly followed by "-".

    Missing values (None/NaN) are passed through unchanged. Raises IndexError
    when no such run exists.
    """
    if pd.isna(x):
        return x
    return _STANDORT_PREFIX_PATTERN.findall(x)[0]


# Empty strings become None. The dict form is used because passing
# `value=None` together with a scalar `to_replace` is interpreted as a
# pad/forward-fill by older pandas releases. `replace` returns a new frame,
# so the raw frame is left untouched and this cell can be re-run safely.
df_data_all_car_brands_cleaned = df_data_all_car_brands.replace({"": None}).assign(
    leistung=lambda df: df["leistung"].apply(_parse_leistung),
    preis=lambda df: df["preis"].apply(_digits_to_int),
    kilometer=lambda df: df["kilometer"].apply(_digits_to_int),
    fahrzeughalter=lambda df: df["fahrzeughalter"].apply(_to_int),
    standort=lambda df: df["standort"].apply(_parse_standort),
    # NOTE(review): this is a naive local-time timestamp; if the column is
    # later stored as an absolute (UTC) timestamp the values may be shifted
    # by the machine's UTC offset -- confirm which is intended.
    crawled_timestamp=datetime.now(),
)

In [6]:
# Candidate locations of the BigQuery service-account key file.
# NOTE(review): the variable names look swapped relative to their values:
# `key_path_cwd` is built from the user's home directory and
# `key_path1_home_dir` from the current working directory, so `email_flag`
# follows the variable name rather than the actual location. Left unchanged
# because code outside this view may rely on these names/values -- confirm.
key_path_cwd = f"{os.path.expanduser('~')}/bq_credentials.json"
key_path1_home_dir = f"{os.getcwd()}/bq_credentials.json"

_bq_scopes = ["https://www.googleapis.com/auth/cloud-platform"]
try:
    # First choice: the key file next to the current working directory.
    credentials = service_account.Credentials.from_service_account_file(
        key_path1_home_dir, scopes=_bq_scopes
    )
except FileNotFoundError:
    # Fallback: the key file under the user's home directory. A missing file
    # here is not caught and propagates to the caller.
    credentials = service_account.Credentials.from_service_account_file(
        key_path_cwd, scopes=_bq_scopes
    )
    email_flag = "cwd"
else:
    email_flag = "home_dir"

# Now, instantiate the client and upload the table to BigQuery
client = bigquery.Client(project="web-scraping-371310", credentials=credentials)

# (column name, BigQuery type) pairs, in the order the load schema declares them.
_schema_spec = [
    ("marke", "STRING"),
    ("modell", "STRING"),
    ("variante", "STRING"),
    ("titel", "STRING"),
    ("form", "STRING"),
    ("fahrzeugzustand", "STRING"),
    ("leistung", "FLOAT64"),
    ("getriebe", "STRING"),
    ("farbe", "STRING"),
    ("preis", "INT64"),
    ("kilometer", "FLOAT64"),
    ("erstzulassung", "STRING"),
    ("fahrzeughalter", "FLOAT64"),
    ("standort", "STRING"),
    ("fahrzeugbescheibung", "STRING"),
    ("url_to_crawl", "STRING"),
    ("page_rank", "INT64"),
    ("total_num_pages", "INT64"),
    ("crawled_timestamp", "TIMESTAMP"),
]

# Rows are appended to the destination table rather than replacing it.
job_config = bigquery.LoadJobConfig(
    schema=[bigquery.SchemaField(column, bq_type) for column, bq_type in _schema_spec],
    write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
)

# Upload the table
load_job = client.load_table_from_dataframe(
    dataframe=df_data_all_car_brands_cleaned,
    destination="web-scraping-371310.crawled_datasets.lukas_mobile_de",
    job_config=job_config,
)
# Block until the load job finishes; as the last expression of the cell the
# returned job object is also what the notebook displays.
load_job.result()

LoadJob<project=web-scraping-371310, location=EU, id=1cb80362-4807-43d2-b945-656402726fe5>