Author: Marco Pellegrino\
Year: 2024

This notebook first reshapes the raw web-traffic data into a database-friendly long format (page, date, traffic) and then loads it into a BigQuery table.

In [1]:
import pandas as pd
from google.cloud import bigquery

# Load Data

In [2]:
# Read the raw CSV into a DataFrame
raw_csv_path = '../data/raw/train_1.csv'
df = pd.read_csv(raw_csv_path)

In [8]:
# Inspect the column labels of the loaded DataFrame
df.columns

Index(['page', '2015-07-01', '2015-07-02', '2015-07-03', '2015-07-04',
       '2015-07-05', '2015-07-06', '2015-07-07', '2015-07-08', '2015-07-09',
       ...
       '2016-12-22', '2016-12-23', '2016-12-24', '2016-12-25', '2016-12-26',
       '2016-12-27', '2016-12-28', '2016-12-29', '2016-12-30', '2016-12-31'],
      dtype='object', length=551)

In [23]:
# Preview the first three rows of the DataFrame
df.head(3)

Unnamed: 0,page,2015-07-01,2015-07-02,2015-07-03,2015-07-04,2015-07-05,2015-07-06,2015-07-07,2015-07-08,2015-07-09,...,2016-12-22,2016-12-23,2016-12-24,2016-12-25,2016-12-26,2016-12-27,2016-12-28,2016-12-29,2016-12-30,2016-12-31
0,2NE1_zh.wikipedia.org_all-access_spider,18.0,11.0,5.0,13.0,14.0,9.0,9.0,22.0,26.0,...,32.0,63.0,15.0,26.0,14.0,20.0,22.0,19.0,18.0,20.0
1,2PM_zh.wikipedia.org_all-access_spider,11.0,14.0,15.0,18.0,11.0,13.0,22.0,11.0,10.0,...,17.0,42.0,28.0,15.0,9.0,30.0,52.0,45.0,26.0,20.0
2,3C_zh.wikipedia.org_all-access_spider,1.0,0.0,1.0,1.0,0.0,4.0,0.0,3.0,4.0,...,3.0,1.0,1.0,7.0,4.0,4.0,6.0,3.0,4.0,17.0


# Process Data

Make column names lowercase

In [None]:
# Normalise every column label to lowercase
df.columns = [column_name.lower() for column_name in df.columns]

Reshape the data from wide to long format, so that each row is a single observation and each column is a variable.

In [None]:
# Unpivot from wide to long: 'page' is kept as the identifier, every other
# column label goes into 'date' and its cell value into 'traffic'
df = pd.melt(
    df,
    id_vars=['page'],
    var_name='date',
    value_name='traffic',
)

In [None]:
# Parse the 'date' column into pandas datetimes
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates

# BigQuery

Connect to BigQuery

In [14]:
# Create the BigQuery client bound to the target project
project_id = "web-traffic-time-series"
client = bigquery.Client(project=project_id)

In [18]:
# Define the BigQuery dataset and table name
dataset_id = 'web_traffic_dataset'
table_id = 'web_traffic_tb'
# Build the reference from an explicit DatasetReference scoped to the client's
# project; Client.dataset() is deprecated in google-cloud-bigquery.
table_ref = bigquery.DatasetReference(client.project, dataset_id).table(table_id)

In [27]:
# Table schema as (column name, BigQuery type, mode) triples
column_specs = (
    ("page", "STRING", "REQUIRED"),
    ("date", "DATE", "REQUIRED"),
    ("traffic", "INTEGER", "NULLABLE"),
)
schema = [
    bigquery.SchemaField(column, field_type, mode=mode)
    for column, field_type, mode in column_specs
]

# Create the table with this schema; exists_ok=True makes the call succeed
# when the table is already present
table = client.create_table(
    bigquery.Table(table_ref, schema=schema),
    exists_ok=True,
)

In [28]:
# Load the long-format DataFrame into BigQuery.
# The reshaped data lives in `df` (the melt step reassigns `df`); the name
# `melted_df` is never defined in this notebook and raises a NameError when
# the notebook is run top to bottom on a fresh kernel.
# NOTE(review): no job_config is passed, so the library's default write
# behaviour applies — confirm whether a re-run should append or replace rows.
job = client.load_table_from_dataframe(df, table_ref)

# Wait for the load job to complete
job.result()

print(f"Loaded {job.output_rows} rows into {dataset_id}:{table_id}.")

Loaded 79784650 rows into web_traffic_dataset:web_traffic_tb.
