In [1]:
# this will be used to access the bigquery client
from google.cloud import bigquery
# this will be used to access the service account modules - passwords and access tokens
from google.oauth2 import service_account
# this will be used for returning data, increases the speed
from google.cloud import bigquery_storage
# for os library
import os


#general dataframe usage
import pandas as pd
# required for certain returns
import pandas_gbq
# exception for a short script
from google.api_core.exceptions import NotFound

In [2]:
# Path to the locally stored service-account key file, located in the parent of the
# current working directory. Built with os.path.join/os.pardir so the separator is
# correct on every OS; a literal backslash is only a path separator on Windows.
SERVICE_ACCOUNT_JSON = os.path.join(os.pardir, "portfolio2026-485323-70c4d609b156.json")

# Credentials used to access BigQuery via the service account
credentials = service_account.Credentials.from_service_account_file(SERVICE_ACCOUNT_JSON)

# BigQuery client, bound to the project recorded in the service-account key
client = bigquery.Client(credentials=credentials, project=credentials.project_id)

# BigQuery Storage read client, created with the same credentials
bq_storage_client = bigquery_storage.BigQueryReadClient(credentials=credentials)

# List the datasets in the project to validate the location

In [3]:
# Materialise the dataset listing into a list so it can be truth-tested and iterated
datasets = list(client.list_datasets())

In [4]:
# Report the dataset ids found in the project, or say so when there are none
if not datasets:
    print("No datasets found.")
else:
    print("Datasets in project:")
    for dataset in datasets:
        print(dataset.dataset_id)


def table_exists(client, full_table_id):
    """Report whether a BigQuery table can be fetched.

    Args:
        client: Object exposing ``get_table``; called with ``full_table_id``.
        full_table_id: Identifier of the table to look up.

    Returns:
        True when ``client.get_table`` returns without raising, False when it
        raises ``NotFound``. Any other exception propagates to the caller.
    """
    try:
        client.get_table(full_table_id)
    except NotFound:
        return False
    else:
        return True

Datasets in project:
portfolio1


# Read in files and concatenate into a single data frame

In [5]:
# Read every file found under data_files, tag it with a season year, harmonise the
# column names that differ between files, and stack everything into nba_df.
dataframes = []

row_count = 0

for dirpath, dirs, files in os.walk(r"data_files"):
    # os.walk returns names in arbitrary (filesystem-dependent) order. Year is derived
    # from each file's position, so sort to make the mapping deterministic.
    dirs.sort()
    # NOTE: n restarts at 0 for each directory visited, so the first file (in sorted
    # order) of every directory is labelled 2021.
    for n, file in enumerate(sorted(files)):
        df = (
            pd.read_csv(os.path.join(dirpath, file))
            .drop('Player-additional', axis=1)
            # rename ignores keys that are absent, so one mapping covers both variants
            .rename(columns={'MP▼': 'MP', 'Team': 'Tm'})
        )
        df['Year'] = 2020 + n + 1
        dataframes.append(df)
        row_count += df.shape[0]

if not dataframes:
    # pd.concat would raise a generic "No objects to concatenate"; say what is wrong.
    raise ValueError("No files found under 'data_files'; nothing to concatenate.")

# ignore_index gives the combined frame a unique 0..N-1 index instead of repeating
# each file's own 0..k labels.
nba_df = pd.concat(dataframes, ignore_index=True)

# Sanity check: concatenation must neither drop nor duplicate rows
assert nba_df.shape[0] == row_count, f"Expected {row_count} rows, but got {nba_df.shape[0]}"

In [6]:
# Preview the first rows of the combined frame
nba_df.head()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,TRB,AST,STL,BLK,TOV,PF,PTS,Year,Trp-Dbl,Awards
0,1,Precious Achiuwa,PF,21,MIA,61,4,737,124,228,...,208,29,20,28,43,91,304,2021,,
1,2,Jaylen Adams,PG,24,MIL,7,0,18,1,8,...,3,2,0,0,0,1,2,2021,,
2,3,Steven Adams,C,27,NOP,58,58,1605,189,308,...,514,111,54,38,78,113,438,2021,,
3,4,Bam Adebayo,C,23,MIA,64,64,2143,456,800,...,573,346,75,66,169,145,1197,2021,,
4,5,LaMarcus Aldridge,C,35,TOT,26,23,674,140,296,...,118,49,11,29,27,47,352,2021,,


# Process for Big Query upload

In [7]:
# Normalise column names: lower-case, trim surrounding whitespace, then collapse every
# run of non-word characters into a single underscore.
nba_df.columns = (
    nba_df.columns
    .str.lower()
    .str.strip()
    .str.replace(r"[^\w]+", "_", regex=True)
)

In [8]:
# Write the combined frame to BigQuery

# Destination table: <project>.portfolio1.nba_data
full_table_id = f"{credentials.project_id}.portfolio1.nba_data"

# WRITE_TRUNCATE overwrites the table's contents instead of appending to them
job_config = bigquery.LoadJobConfig(
    autodetect=True,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)

# Tell the reader whether this run creates the table or replaces an existing one
if not table_exists(client, full_table_id):
    print("Creating new table...")
else:
    print(f"Table {full_table_id} already exists.")
    print("Overwriting existing table...")

load_job = client.load_table_from_dataframe(nba_df, full_table_id, job_config=job_config)
# Block until the load job finishes; the returned job is displayed as the cell output
load_job.result()

Creating new table...


LoadJob<project=portfolio2026-485323, location=US, id=38c1b57f-33d0-4a4f-bf0e-f3d5c43e5480>