In [1]:
# packages
import pandas as pd
import camelot
import PyPDF2
import os
import glob
import re
import sqlite3


In [11]:
def process_met_pdf(file_path):
    # This is the text at the top of the page where the weather tables are. We use it to know which pages we want to read
    search_string = 'Selected UK readings at (L) 0000 and (R) 1200 UTC'
    pattern = re.compile(r'\s*'.join(re.escape(word) for word in search_string.split())) # the whitespace can be read incorrectly so we allow for optional whitespace
    # This is the columns without the date. When we need the date we append it with ['Date'] + columns
    columns = ['Station_no', 'Station_name',
            '0000_PRESS', '0000_WDIR', '0000_WSPD', '0000_CLOUD', '0000_TEMP', '0000_TDEW',
            '1200_PRESS', '1200_WDIR', '1200_WSPD', '1200_CLOUD', '1200_TEMP', '1200_TDEW']

    with open(file_path, 'rb') as pdf_raw:
        pdf = PyPDF2.PdfReader(pdf_raw)

        df = pd.DataFrame(columns=['Date']+columns)
        i = 0
        while i < len(pdf.pages):
            page_text = pdf.pages[i].extract_text()
            if re.search(pattern, page_text):
                table = camelot.read_pdf(file_path, pages=str(i+1))

                t_df = table[0].df
                t_df = t_df.iloc[2:]
                t_df.columns = columns

                # The top of the page is always:  "Daily Weather Summary for Sunday 01 January 2023 \n".
                                
                pre_index = page_text.find('day') + 3 # we cant be sure that the pdf if correctly read 100% so this should be quite generic
                post_index = page_text.find('Selected')
                # Get the date between the two token variables and add to the df
                date = re.sub(r'[^a-zA-Z0-9]', '', page_text[pre_index:post_index]) 
                formatted_date = re.sub(r'(\d{1,2})(\s*)([A-Za-z]+)(\s*)(\d{3,4})', r'\1 \3 \5', date) # make sure there are spaces between dd MM yyyy
                print(f"\t{formatted_date}")
                t_df = t_df.assign(Date = pd.Series([formatted_date for i in range(len(t_df))]).values)
                t_df = t_df[['Date'] + columns] # reorder the columns to match df

                df = pd.concat([df, t_df], ignore_index = True, sort = False)

                # we know there are at least 7 pages between tables
                i += 6
            i += 1
           
        # Return the df
        return df

In [12]:
data_path="../data/landing/persistent/*"
# TODO add some check that the file in persistent has not been converted into a table
# connect to the formatted zone database
conn = sqlite3.connect('../data/formatted_zone/formatted_zone.db')
for file in glob.glob(data_path): 
    table_name = file.split("\\")[-1][:-4]
    print(f"Processing: {table_name}")
    # only move .csv and .pdf files to a table
    if file.split(".")[-1] == "csv": 
        df = pd.read_csv(file)
        df.to_sql(table_name, con=conn, if_exists='replace', index=False)

    if file.split(".")[-1] == "pdf": 
        # this will create a new table per pdf file
        df = process_met_pdf(file)
        df.to_sql(table_name, con=conn, if_exists='replace', index=False)

# close the connection
conn.close()

Processing: cleaned_merged_seasons_2023-10-13


  df = pd.read_csv(file)


Processing: football-data_2223_2023-10-13
Processing: master_team_list_2023-10-13
Processing: Metoffice_01_22_2023-10-13
	01 January 2022
	02 January 2022
	03 January 2022
	04 January 2022
	05 January 2022
	06 January 2022
	07 January 2022
	08 January 2022
	09 January 2022
	10 January 2022
	11 January 2022
	12 January 2022
	13 January 2022
	14 January 2022
	15 January 2022
	16 January 2022
	17 January 2022
	18 January 2022
	19 January 2022
	20 January 2022
	21 January 2022
	22 January 2022
	23 January 2022
	24 January 2022
	25 January 2022
	26 January 2022
	27 January 2022
	28 January 2022
	29 January 2022
	30 January 2022
	31 January 2022
Processing: Metoffice_01_23_2023-10-13
	01 January 2023
	02 January 2023
	03 January 2023
	04 January 2023
	05 January 2023
	06 January 2023
	07 January 2023
	08 January 2023
	09 January 2023
	10 January 2023
	11 January 2023
	12 January 2023
	13 January 2023
	14 January 2023
	15 January 2023
	16 January 2023
	17 January 2023
	18 January 2023
	19 Jan

In [None]:
import pandas as pd

In [40]:
# CHECK THE TABLES ON THE DB
conn = sqlite3.connect('../data/formatted_zone/formatted_zone.db')
c = conn.cursor()
c.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = c.fetchall()
for table in tables: 
    print(table[0])
    df = pd.read_sql_query(f"SELECT * FROM \"{table[0]}\";", conn)
    print(df.describe())
conn.close()

cleaned_merged_seasons_2023-10-13
            assists         bonus           bps  clean_sheets    creativity  \
count  96169.000000  96169.000000  96169.000000  96169.000000  96169.000000   
mean       0.041042      0.108715      6.080275      0.106739      4.734000   
std        0.215378      0.492334      9.850595      0.308783     10.689879   
min        0.000000      0.000000    -21.000000      0.000000      0.000000   
25%        0.000000      0.000000      0.000000      0.000000      0.000000   
50%        0.000000      0.000000      0.000000      0.000000      0.000000   
75%        0.000000      0.000000     10.000000      0.000000      2.400000   
max        4.000000      3.000000    128.000000      1.000000    170.900000   

            element       fixture  goals_conceded  goals_scored     ict_index  \
count  96169.000000  96169.000000    96169.000000  96169.000000  96169.000000   
mean     327.063264    198.413824        0.495887      0.045514      1.715793   
std      19