#Addressing Data Anomalies

In [2]:
from google.colab import auth
auth.authenticate_user()

In [3]:
#Import everything we need
import pandas as pd
import pandas_gbq
from google.cloud import bigquery

In [4]:
#Initialize variables
project_id = "group-5-448704"
raw_dataset = "football_dataset_raw"
staging_dataset = "football_dataset_stg"

##Anomaly Type 5: Changed string to datetime for superbowl_ratings.date

In [5]:
# Query to change datatype
query_5 = """
SELECT
  SAFE_CAST(date AS DATETIME) AS date,
  _data_source,
  _load_time
FROM `{project_id}.{raw_dataset}.superbowl_ratings`
""".format(project_id=project_id, raw_dataset=raw_dataset)

# Execute the query and load the result into a DataFrame
table_5 = pandas_gbq.read_gbq(query_5, project_id=project_id)

# Write the transformed data to the staging table
table_5.to_gbq(f"{staging_dataset}.superbowl_ratings_date", project_id=project_id, if_exists="replace")

Downloading: 100%|[32m██████████[0m|


  table_5.to_gbq(f"{staging_dataset}.superbowl_ratings_date", project_id=project_id, if_exists="replace")
100%|██████████| 1/1 [00:00<00:00, 2372.34it/s]


##Anomaly Type 6: Changed \n to null values for teams_data.team_division

---



In [6]:
# Query to replace /n values
query_6 = """
SELECT
  NULLIF(team_division, '\\n') AS team_division,
  _data_source,
  _load_time
FROM `{project_id}.{raw_dataset}.teams_data`
""".format(project_id=project_id, raw_dataset=raw_dataset)

# Execute the query and load the result into a DataFrame
table_6 = pandas_gbq.read_gbq(query_6, project_id=project_id)

# Write the transformed data to the staging table
table_6.to_gbq(f"{staging_dataset}.teams_data_teams_division_cleaned", project_id=project_id, if_exists="replace")


Downloading: 100%|[32m██████████[0m|


  table_6.to_gbq(f"{staging_dataset}.teams_data_teams_division_cleaned", project_id=project_id, if_exists="replace")
100%|██████████| 1/1 [00:00<00:00, 2117.27it/s]


##Anomaly Type 7: Changed win loss record to separate fields for 2024_weekly_stats.games_w_l_t. Made sure to keep them as INT64

In [None]:
# Query to separate fields
query_7 = """
SELECT
  SAFE_CAST(SPLIT(games_w_l_t, '-')[ORDINAL(1)] AS INT64) AS total_win,
  SAFE_CAST(SPLIT(games_w_l_t, '-')[ORDINAL(2)] AS INT64) AS total_lost,
  SAFE_CAST(SPLIT(games_w_l_t, '-')[ORDINAL(3)] AS INT64) AS total_tie,
  _data_source,
  _load_time
FROM `{project_id}.{raw_dataset}.2024_weekly_stats`
""".format(project_id=project_id, raw_dataset=raw_dataset)

# Execute the query and load the result into a DataFrame
table_7 = pandas_gbq.read_gbq(query_7, project_id=project_id)

# Write the transformed data to the staging table
table_7.to_gbq(f"{staging_dataset}.team_win_loss_tie_record", project_id=project_id, if_exists="replace")




Downloading: 100%|[32m██████████[0m|


  table_7.to_gbq(f"{staging_dataset}.team_win_loss_tie_record", project_id=project_id, if_exists="replace")
100%|██████████| 1/1 [00:00<00:00, 8388.61it/s]


##Removed fields that do not store useful data

In [None]:
# Query to remove unwanted fields and transform data
query_8 = """
SELECT
  stadium_name,
  stadium_location,
  stadium_open,
  stadium_close,
  stadium_type,
  stadium_weather_type,
  stadium_capacity,
  stadium_surface,
  stadium_latitude,
  stadium_longitude,
  _data_source,
  _load_time
FROM `{project_id}.{raw_dataset}.stadiums`
""".format(project_id=project_id, raw_dataset=raw_dataset)

# Execute the query and load the result into a DataFrame
result = pandas_gbq.read_gbq(query_8, project_id=project_id)

# Write the transformed data to the staging table
result.to_gbq(f"{staging_dataset}.new_stadiums", project_id=project_id, if_exists="replace")


Downloading: 100%|[32m██████████[0m|


  result.to_gbq(f"{staging_dataset}.new_stadiums", project_id=project_id, if_exists="replace")
100%|██████████| 1/1 [00:00<00:00, 734.17it/s]


##Renamed inconsistent field names

In [None]:
# Query to rename inconsistent field names
query_9 = """
SELECT
  weight as height,
  SAFE_CAST(college as INT64) as weight,
  age as college,
  _data_source,
  _load_time
FROM `{project_id}.{raw_dataset}.2024_player_predictions`
""".format(project_id=project_id, raw_dataset=raw_dataset)

# Execute the query and load the result into a DataFrame
result = pandas_gbq.read_gbq(query_9, project_id=project_id)

# Write the transformed data to the staging table
result.to_gbq(f"{staging_dataset}.renamed_2024_player_predictions", project_id=project_id, if_exists="replace")

Downloading: 100%|[32m██████████[0m|


  result.to_gbq(f"{staging_dataset}.renamed_2024_player_predictions", project_id=project_id, if_exists="replace")
100%|██████████| 1/1 [00:00<00:00, 1803.23it/s]


In [14]:
client = bigquery.Client(project="group-5-448704")  # Set the correct project ID

source_table_id1 = "group-5-448704.football_dataset_raw.spreadspoke_scores"
source_table_id2 = "group-5-448704.football_dataset_raw.yearly_player_data"
source_table_id3 = "group-5-448704.football_dataset_raw.yearly_team_data"

destination_table_id = "group-5-448704.football_dataset_stg.spreadspoke_scores"
destination_table_id1 = "group-5-448704.football_dataset_stg.yearly_player_data"
destination_table_id2 = "group-5-448704.football_dataset_stg.yearly_team_data"
job1 = client.copy_table(source_table_id1, destination_table_id)
job2 = client.copy_table(source_table_id2, destination_table_id1)
job3 = client.copy_table(source_table_id3, destination_table_id2)


Conflict: 409 Already Exists: Table group-5-448704:football_dataset_stg.yearly_player_data; reason: duplicate, message: Already Exists: Table group-5-448704:football_dataset_stg.yearly_player_data