In [1]:
# packages
import pandas as pd
import camelot
import PyPDF2
import os
import glob


In [2]:
import sqlite3

# Connect to SQLite database
conn = sqlite3.connect('../data/formatted_zone/formatted_zone.db')

In [5]:
data_path="../data/landing/persistent"

# Gets the file with the last date, 
# ex: for filename=cleaned_merged_seasons would return cleaned_merged_seasons_2023-09-22.csv
from datetime import datetime
def get_last_file(filename, fileformat="csv"):
    data_files = os.listdir(data_path)
    filtered_files = [f for f in data_files if f.startswith(filename) and f.endswith("." + fileformat)]
    format_str = "%Y-%m-%d"  # Date format
    most_recent_file = max(
        filtered_files, 
        key=lambda f: datetime.strptime(f[len(filename + "_"):-len("." + fileformat)], format_str)
    )
    return most_recent_file

In [6]:
# LOAD TO THE SQLITE DB
# Ex: data_file="cleaned_merged_seasons" would find the last file (cleaned_merged_seasons_2023-09-22.csv)
#     and load it to a table named "cleaned_merged_seasons"
def load_to_db(data_file,conn=conn,data_path=data_path):
    last_data_file = get_last_file(data_file)
    df = pd.read_csv(os.path.join(data_path, last_data_file))
    df.to_sql(data_file, con=conn, if_exists='replace', index=False)

In [36]:
# DATA LOADING
load_to_db("cleaned_merged_seasons")
load_to_db("football-data_2223")
load_to_db("master_team_list")

  df = pd.read_csv(os.path.join(data_path, last_data_file))


In [39]:
# Sample query
query = f"SELECT * FROM {data_file} LIMIT 5;"  # Adjust number as needed
sample_data = pd.read_sql_query(query, conn)
sample_data

Unnamed: 0,season_x,name,position,team_x,assists,bonus,bps,clean_sheets,creativity,element,...,team_h_score,threat,total_points,transfers_balance,transfers_in,transfers_out,value,was_home,yellow_cards,GW
0,2016-17,Aaron Cresswell,DEF,,0,0,0,0,0.0,454,...,2.0,0.0,0,0,0,0,55,0,0,1
1,2016-17,Aaron Lennon,MID,,0,0,6,0,0.3,142,...,1.0,0.0,1,0,0,0,60,1,0,1
2,2016-17,Aaron Ramsey,MID,,0,0,5,0,4.9,16,...,3.0,23.0,2,0,0,0,80,1,0,1
3,2016-17,Abdoulaye Doucouré,MID,,0,0,0,0,0.0,482,...,1.0,0.0,0,0,0,0,50,0,0,1
4,2016-17,Adam Forshaw,MID,,0,0,3,0,1.3,286,...,1.0,0.0,1,0,0,0,45,1,1,1


In [89]:
# CHECK THE TABLES ON THE DB
c = conn.cursor()
c.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = c.fetchall()
tables

[('cleaned_merged_seasons',), ('football-data_2223',), ('master_team_list',)]

In [22]:
# we need regex because the pdf is not read 100% accurately
import re

In [87]:
# This is the text at the top of the page where the weather tables are. We use it to know which pages we want to read
search_string = 'Selected UK readings at (L) 0000 and (R) 1200 UTC'
pattern = re.compile(r'\s*'.join(re.escape(word) for word in search_string.split())) # the whitespace can be read incorrectly so we allow for optional whitespace
# This is the columns without the date. When we need the date we append it with ['Date'] + columns
columns = ['Station_no', 'Station_name',
           '0000_PRESS', '0000_WDIR', '0000_WSPD', '0000_CLOUD', '0000_TEMP', '0000_TDEW',
           '1200_PRESS', '1200_WDIR', '1200_WSPD', '1200_CLOUD', '1200_TEMP', '1200_TDEW']

met_office_files = glob.glob(os.path.join(data_path, '*.pdf'))

for met_file in met_office_files:
    print(f"Processing {met_file}")
    with open(met_file, 'rb') as pdf_raw:
        pdf = PyPDF2.PdfReader(pdf_raw)

        df = pd.DataFrame(columns=['Date']+columns)
        for i in range(len(pdf.pages)):
            page_text = pdf.pages[i].extract_text()
            if re.search(pattern, page_text):
                table = camelot.read_pdf(met_file, pages=str(i+1))

                t_df = table[0].df
                t_df = t_df.iloc[2:]
                t_df.columns = columns

                # The top of the page is always:  "Daily Weather Summary for Sunday 01 January 2023 \n".
                                
                pre_index = page_text.find('day') + 3 # we cant be sure that the pdf if correctly read 100% so this should be quite generic
                post_index = page_text.find('Selected')
                # Get the date between the two token variables and add to the df
                date = re.sub(r'[^a-zA-Z0-9]', '', page_text[pre_index:post_index]) 
                formatted_date = re.sub(r'(\d{1,2})(\s*)([A-Za-z]+)(\s*)(\d{3,4})', r'\1 \3 \5', date) # make sure there are spaces between dd MM yyyy
                print(f"\t{formatted_date}")
                t_df = t_df.assign(Date = pd.Series([formatted_date for i in range(len(t_df))]).values)
                t_df = t_df[['Date'] + columns] # reorder the columns to match df

                df = pd.concat([df, t_df], ignore_index = True, sort = False)
            # df.to_csv(met_file[:-4]+'.csv')
            df.to_sql("met_office", con=conn, if_exists='append', index=False)

Processing ../data/landing/persistent\Metoffice_01_22_2023-10-13.pdf
	01 January 2022
	02 January 2022
	03 January 2022
	04 January 2022
	05 January 2022
	06 January 2022
	07 January 2022
	08 January 2022
	09 January 2022
	10 January 2022
	11 January 2022
	12 January 2022
	13 January 2022
	14 January 2022
	15 January 2022
	16 January 2022
	17 January 2022
	18 January 2022
	19 January 2022
	20 January 2022
	21 January 2022
	22 January 2022
	23 January 2022
	24 January 2022
	25 January 2022
	26 January 2022
	27 January 2022
	28 January 2022
	29 January 2022
	30 January 2022
	31 January 2022
Processing ../data/landing/persistent\Metoffice_01_23_2023-10-13.pdf
	01 January 2023
	02 January 2023
	03 January 2023
	04 January 2023
	05 January 2023
	06 January 2023
	07 January 2023
	08 January 2023
	09 January 2023
	10 January 2023
	11 January 2023
	12 January 2023
	13 January 2023
	14 January 2023
	15 January 2023
	16 January 2023
	17 January 2023
	18 January 2023
	19 January 2023
	20 January 