# Loading data experiments

## Loading data from football-data.co.uk 

In [1]:
import pandas as pd
import requests

url = f"https://www.football-data.co.uk/mmz4281/2425/E0.csv"
response = requests.get(url)
response.raise_for_status()

In [2]:
from io import BytesIO

df = pd.read_csv(BytesIO(response.content))

In [3]:
df.head(5)

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA,BFECAHH,BFECAHA
0,E0,16/08/2024,20:00,Man United,Fulham,1,0,H,0,0,...,1.86,2.07,1.83,2.11,1.88,2.11,1.82,2.05,1.9,2.08
1,E0,17/08/2024,12:30,Ipswich,Liverpool,0,2,A,0,0,...,2.05,1.88,2.04,1.9,2.2,2.0,1.99,1.88,2.04,1.93
2,E0,17/08/2024,15:00,Arsenal,Wolves,2,0,H,1,0,...,2.02,1.91,2.0,1.9,2.05,1.93,1.99,1.87,2.02,1.96
3,E0,17/08/2024,15:00,Everton,Brighton,0,3,A,0,1,...,1.87,2.06,1.86,2.07,1.92,2.1,1.83,2.04,1.88,2.11
4,E0,17/08/2024,15:00,Newcastle,Southampton,1,0,H,1,0,...,1.87,2.06,1.88,2.06,1.89,2.1,1.82,2.05,1.89,2.1


In [None]:
# discover categorical columns
df.select_dtypes(include=['object']).columns.tolist()

['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'FTR', 'HTR', 'Referee']

## Data cleaning

In [5]:
# convert columns to lower snake case
df.columns = df.columns.str.lower().str.replace(" ", "_").str.replace("-", "_")

# set date as datetime
df['date'] = pd.to_datetime(df['date'], format='%d/%m/%Y')

In [6]:
df.dtypes["date"]

dtype('<M8[ns]')

In [7]:
df.head(5)

Unnamed: 0,div,date,time,hometeam,awayteam,fthg,ftag,ftr,hthg,htag,...,b365cahh,b365caha,pcahh,pcaha,maxcahh,maxcaha,avgcahh,avgcaha,bfecahh,bfecaha
0,E0,2024-08-16,20:00,Man United,Fulham,1,0,H,0,0,...,1.86,2.07,1.83,2.11,1.88,2.11,1.82,2.05,1.9,2.08
1,E0,2024-08-17,12:30,Ipswich,Liverpool,0,2,A,0,0,...,2.05,1.88,2.04,1.9,2.2,2.0,1.99,1.88,2.04,1.93
2,E0,2024-08-17,15:00,Arsenal,Wolves,2,0,H,1,0,...,2.02,1.91,2.0,1.9,2.05,1.93,1.99,1.87,2.02,1.96
3,E0,2024-08-17,15:00,Everton,Brighton,0,3,A,0,1,...,1.87,2.06,1.86,2.07,1.92,2.1,1.83,2.04,1.88,2.11
4,E0,2024-08-17,15:00,Newcastle,Southampton,1,0,H,1,0,...,1.87,2.06,1.88,2.06,1.89,2.1,1.82,2.05,1.89,2.1


## Loading epl data to s3

In [8]:
from io import BytesIO

import boto3
from uuid import uuid4

client = boto3.client('s3')

bucket_name = f'test-{uuid4()}'

# Create a new S3 bucket
client.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={
	'LocationConstraint': client.meta.region_name
})

# Upload the DataFrame to S3
file_name = '2425.e0.parquet'

parquet_buffer = BytesIO()
df.to_parquet(parquet_buffer, index=False)
client.put_object(Bucket=bucket_name, Body=parquet_buffer.getvalue(), Key=file_name)

{'ResponseMetadata': {'RequestId': 'RXZA53EN8B6GGTTV',
  'HostId': 'pbm4PXX99LfGisIogVE7nLZaQnHvGKravOHVL/qPT1S+KqWA3n5Il/GAdmBmKhbQj22fu0oGSchgTn+XnBz592vVuRw6BECx',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'pbm4PXX99LfGisIogVE7nLZaQnHvGKravOHVL/qPT1S+KqWA3n5Il/GAdmBmKhbQj22fu0oGSchgTn+XnBz592vVuRw6BECx',
   'x-amz-request-id': 'RXZA53EN8B6GGTTV',
   'date': 'Sat, 26 Jul 2025 20:48:13 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"431dc2ab9731eba634f3aa93219a5cc0"',
   'x-amz-checksum-crc32': 'O/CBGQ==',
   'x-amz-checksum-type': 'FULL_OBJECT',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"431dc2ab9731eba634f3aa93219a5cc0"',
 'ChecksumCRC32': 'O/CBGQ==',
 'ChecksumType': 'FULL_OBJECT',
 'ServerSideEncryption': 'AES256'}

## Loading data to postgres

In [1]:
import sys
sys.path.append('..')

from src.core.config import get_config

config = get_config()

In [24]:
# Dump the configuration to JSON
config.model_dump_json()

'{"app_name":"epl-prediction","environment":"development","postgres_user":"admin","postgres_password":"admin","postgres_server":"localhost","postgres_port":5432,"postgres_db":"monitoring","aws_region":"eu-south-1","s3_bucket_name":null,"model_version":"v1.0","model_path":null,"api_host":"0.0.0.0","api_port":8000}'

In [2]:
from sqlalchemy import create_engine, text

engine = create_engine(config.database_url)

In [30]:
table_name = "test_table"

In [28]:
df.to_sql(
    name=table_name,
    con=engine,
    if_exists='replace',
    index=False,
    method='multi'
)

-1

## Clean up

In [29]:
client.delete_object(Bucket=bucket_name, Key=file_name)
client.delete_bucket(Bucket=bucket_name)

{'ResponseMetadata': {'RequestId': 'WTRBJ80QWAEFD6Y4',
  'HostId': '1l7LjlGoXL0UhaJ1XNv1vXyl/uZbw+tPAw/39eVB7nuT0c+X2mB+v/0IkmG6IK0eYpQiSHXUF7Xyy/j3fo4Uy7ZEj5uxEvFS',
  'HTTPStatusCode': 204,
  'HTTPHeaders': {'x-amz-id-2': '1l7LjlGoXL0UhaJ1XNv1vXyl/uZbw+tPAw/39eVB7nuT0c+X2mB+v/0IkmG6IK0eYpQiSHXUF7Xyy/j3fo4Uy7ZEj5uxEvFS',
   'x-amz-request-id': 'WTRBJ80QWAEFD6Y4',
   'date': 'Sat, 26 Jul 2025 21:34:50 GMT',
   'server': 'AmazonS3'},
  'RetryAttempts': 0}}

In [31]:
with engine.connect() as connection:
    connection.execute(text(f"DROP TABLE {table_name}"))
    connection.commit()  # Explicitly commit the transaction