In [1]:
from azure.identity import DefaultAzureCredential
from azure.storage.blob import BlobServiceClient
from io import BytesIO
import os
import sys
import yaml
import pandas as pd

#root_path = os.path.abspath(os.path.join(os.getcwd(), '.'))
#sys.path.append(root_path)

#from utils.data_collection import new_data_check, preprocess_df, make_train_test

# Don't forget to run ``az login`` in the command prompt and authenticate!

# Load the notebook settings. The path is relative to the kernel's working directory.
# The file is read explicitly as UTF-8 so that parsing does not depend on the
# platform's default text encoding.
with open("../config.yml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Storage and dataset locations, all read from the "data" section of the config.
ACCOUNT_URL = config['data']['account_url']
CONTAINER_NAME = config['data']['azure_container_name']
ONLINE_ADDRESS = config['data']['online_address']
DATA_PATH = config['data']['current_table']
OLD_DATA_PATH = config['data']['path_old_data']

# Credential object handed to the blob service client below.
default_credential = DefaultAzureCredential()

# Create the BlobServiceClient object, then a client scoped to the configured container.
blob_service_client = BlobServiceClient(ACCOUNT_URL,
                                        credential=default_credential)
container_client = blob_service_client.get_container_client(CONTAINER_NAME)

In [2]:
# Collect the name of every blob returned by the container listing.
blob_list = container_client.list_blobs()
files = [blob.name for blob in blob_list]

In [3]:
# Number of blob names collected from the container listing.
len(files)

3

In [4]:
import re

# Any blob whose name contains "oldies/" is treated as an archived snapshot.
oldies_paths = re.compile('oldies/')
# DATA_PATH is a literal blob name, not a pattern: escape it so that characters
# such as "." only match themselves.
data_path = re.compile(re.escape(DATA_PATH))

# One search result (match object or None) per blob name, in the order of `files`.
results_oldies_search = []
results_data_path_search = []
# Blob names that matched the "oldies/" pattern.
older_dataframes_paths = []
for file in files:
    result_oldies = oldies_paths.search(file)
    result_data_path = data_path.search(file)
    if result_oldies is not None:
        older_dataframes_paths.append(file)
    results_oldies_search.append(result_oldies)
    results_data_path_search.append(result_data_path)

In [5]:
# True when exactly one blob name matched the current-table pattern.
sum(match is not None for match in results_data_path_search) == 1

True

In [6]:
# Blob names that matched the "oldies/" pattern.
older_dataframes_paths

['oldies/table_2020-05-01-2023-04-30.parquet']

In [7]:
def _read_parquet_blob(blob_name):
    """Download ``blob_name`` from the container and parse its bytes as parquet."""
    payload = container_client.download_blob(blob_name).readall()
    return pd.read_parquet(BytesIO(payload))


# Archived snapshots, one DataFrame per path found under "oldies/".
older_dataframes = []
for old_pd in older_dataframes_paths:
    df = _read_parquet_blob(old_pd)
    older_dataframes.append(df)

# The table currently referenced by the config.
current_table = _read_parquet_blob(DATA_PATH)


In [8]:
# Shape of the last archived snapshot that was loaded. Indexing the list avoids
# relying on the loop variable `df` left behind by another cell.
older_dataframes[-1].shape

(340174, 39)

In [9]:
# (rows, columns) of the current table, for comparison with the archived snapshot.
current_table.shape

(340174, 39)

In [10]:
# Consistency checks between every archived snapshot and the current table.
# The current table's column set and incident-number values do not change inside
# the loop, so they are computed once up front.
current_columns = set(current_table.columns)
current_incidents = set(current_table['IncidentNumber'])
current_incident_count = len(current_table['IncidentNumber'].unique())

for df in older_dataframes:
    # Every incident in the snapshot must still be present in the current table.
    missing_incidents = set(df['IncidentNumber']) - current_incidents
    assert not missing_incidents, (
        f"{len(missing_incidents)} archived IncidentNumber values are missing "
        "from the current table"
    )

    # Both tables must expose exactly the same columns.
    assert set(df.columns) == current_columns, (
        "column mismatch between snapshot and current table: "
        f"{sorted(set(df.columns) ^ current_columns, key=str)}"
    )

    # The snapshot cannot contain more distinct incidents than the current table.
    snapshot_incident_count = len(df['IncidentNumber'].unique())
    assert snapshot_incident_count <= current_incident_count, (
        f"snapshot has {snapshot_incident_count} distinct incidents, "
        f"current table only {current_incident_count}"
    )

In [11]:
# Display the current table for a visual check of its contents.
current_table

Unnamed: 0,IncidentNumber,DateOfCall,CalYear,TimeOfCall,HourOfCall,IncidentGroup,StopCodeDescription,SpecialServiceType,PropertyCategory,PropertyType,...,FirstPumpArriving_AttendanceTime,FirstPumpArriving_DeployedFromStation,SecondPumpArriving_AttendanceTime,SecondPumpArriving_DeployedFromStation,NumStationsWithPumpsAttending,NumPumpsAttending,PumpCount,PumpHoursRoundUp,Notional Cost (£),NumCalls
0,051451-01052020,01 May 2020,2020,00:03:18,0,False Alarm,False alarm - Good intent,,Dwelling,Self contained Sheltered Housing,...,180.0,Lambeth,192.0,Lambeth,2.0,3.0,3.0,2.0,692.0,1.0
1,051452-01052020,01 May 2020,2020,00:03:32,0,False Alarm,False alarm - Good intent,,Dwelling,House - single occupancy,...,293.0,Stratford,420.0,Leytonstone,2.0,2.0,2.0,1.0,346.0,1.0
2,051453-01052020,01 May 2020,2020,00:05:55,0,Special Service,Special Service,Effecting entry/exit,Dwelling,House - single occupancy,...,393.0,Bromley,,,1.0,1.0,1.0,1.0,346.0,2.0
3,051455-01052020,01 May 2020,2020,00:25:06,0,Special Service,Special Service,Hazardous Materials incident,Dwelling,Purpose Built Flats/Maisonettes - 10 or more s...,...,338.0,Millwall,422.0,Shadwell,2.0,2.0,2.0,2.0,692.0,1.0
4,051456-01052020,01 May 2020,2020,00:28:07,0,False Alarm,AFA,,Dwelling,House - single occupancy,...,233.0,Ealing,364.0,Acton,2.0,2.0,2.0,2.0,692.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340169,060314-30042023,30 Apr 2023,2023,23:42:08,23,Special Service,Special Service,Animal assistance incidents,Road Vehicle,Car,...,523.0,Chiswick,,,1.0,1.0,1.0,1.0,,1.0
340170,060315-30042023,30 Apr 2023,2023,23:44:20,23,Fire,Secondary Fire,,Outdoor Structure,Small refuse/rubbish container,...,1008.0,Kingston,,,1.0,1.0,1.0,1.0,,1.0
340171,060316-30042023,30 Apr 2023,2023,23:44:25,23,False Alarm,AFA,,Dwelling,Purpose Built Flats/Maisonettes - 4 to 9 storeys,...,254.0,Barnet,,,1.0,1.0,1.0,1.0,,1.0
340172,060317-30042023,30 Apr 2023,2023,23:50:13,23,Special Service,Special Service,Assist other agencies,Dwelling,Converted Flat/Maisonettes - 3 or more storeys,...,299.0,Ealing,,,1.0,1.0,1.0,1.0,,1.0


In [12]:
from pathlib import Path

# Day limits enforced on the saved splits.
TEST_MAX_AGE_DAYS = 365      # test rows: at most this many days before the newest call
TRAIN_MIN_SPAN_DAYS = 366    # train rows: oldest row at least this far from the newest
TRAIN_MAX_SPAN_DAYS = 1095   # train rows: oldest row at most 3 * 365 days from the newest

train_path = Path("../data/train.pkl")
test_path = Path("../data/test.pkl")

assert train_path.is_file(), f"train split not found: {train_path}"
assert test_path.is_file(), f"test split not found: {test_path}"

# NOTE(review): unpickling can execute arbitrary code -- only load pickles that
# this project produced itself.
train = pd.read_pickle(train_path)
test = pd.read_pickle(test_path)

# Newest call date in the current table; its DateOfCall column is parsed here,
# while the split columns are used as datetimes directly.
latest_call = pd.to_datetime(current_table['DateOfCall']).max()
test_max_age_days = (latest_call - test['DateOfCall']).dt.days.max()
assert test_max_age_days <= TEST_MAX_AGE_DAYS, (
    f"oldest test row is {test_max_age_days} days before the newest call "
    f"(limit {TEST_MAX_AGE_DAYS})"
)

# Distance in days between the newest and the oldest training row, computed once
# and checked against both bounds.
train_span_days = (train['DateOfCall'].max() - train['DateOfCall']).dt.days.max()
assert train_span_days >= TRAIN_MIN_SPAN_DAYS, (
    f"training data spans only {train_span_days} days "
    f"(minimum {TRAIN_MIN_SPAN_DAYS})"
)
assert train_span_days <= TRAIN_MAX_SPAN_DAYS, (
    f"training data spans {train_span_days} days "
    f"(maximum {TRAIN_MAX_SPAN_DAYS})"
)

In [13]:
# Show the kernel's working directory; the relative paths used in this notebook
# ("../config.yml", "../data/...") are resolved against it.
# NOTE(review): the rendered output is an absolute local path -- consider clearing
# it before sharing the notebook.
os.getcwd()

'c:\\Users\\marcv\\OneDrive\\MLOps\\mlops_fire_fighter\\experiments'

In [14]:
# Latest call date present in the training split.
train["DateOfCall"].max()

Timestamp('2022-12-31 00:00:00')

In [15]:
# Earliest call date present in the training split.
train["DateOfCall"].min()

Timestamp('2020-05-01 00:00:00')