In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Create logger
import logging

# Root-logger settings: destination file, minimum level, and the layout of
# each record (timestamp - level name - message).
log_settings = {
    'format': '%(asctime)s - %(levelname)s - %(message)s',
    'level': logging.INFO,
    'filename': 'missing_dates.log',
}
logging.basicConfig(**log_settings)

# Logger used by the cells below; it is named after the current module.
logger = logging.getLogger(__name__)

# logger.setLevel(logging.CRITICAL)


In [3]:
# Column names referenced by the cells below.
date_feature = 'Date'
loc_feature = 'Location'

# Location of the input CSV.
base_path = '../input/data/'
input_file = 'weatherAUS.csv'
csv_path = os.path.join(base_path, input_file)

# Load the file, parse the date column to datetime64, and order the rows
# chronologically.
df = pd.read_csv(csv_path)
df[date_feature] = pd.to_datetime(df[date_feature])
df = df.sort_values(by=date_feature)

In [4]:
# Earliest recorded date for each location, as a two-column frame
# (location, date) with a fresh integer index.
earliest_by_location = df.groupby(loc_feature)[date_feature].min()
first_date = earliest_by_location.reset_index()
print(first_date)

            Location       Date
0           Adelaide 2008-07-01
1             Albany 2008-12-01
2             Albury 2008-12-01
3       AliceSprings 2008-12-01
4      BadgerysCreek 2009-01-01
5           Ballarat 2008-12-01
6            Bendigo 2008-12-01
7           Brisbane 2008-07-01
8             Cairns 2008-12-01
9           Canberra 2007-11-01
10             Cobar 2009-01-01
11      CoffsHarbour 2009-01-01
12          Dartmoor 2009-01-01
13            Darwin 2008-07-01
14         GoldCoast 2008-12-01
15            Hobart 2008-07-01
16         Katherine 2013-03-01
17        Launceston 2008-12-01
18         Melbourne 2008-07-01
19  MelbourneAirport 2009-01-01
20           Mildura 2009-01-01
21             Moree 2009-01-01
22      MountGambier 2008-12-01
23       MountGinini 2008-12-01
24         Newcastle 2008-12-01
25              Nhil 2013-03-01
26         NorahHead 2009-01-01
27     NorfolkIsland 2009-01-01
28         Nuriootpa 2009-01-01
29        PearceRAAF 2009-01-01
30      

In [5]:
def find_year_full_data(df, loc_feature='Location', date_feature='Date', unit='D', show_log=False):
    """Report, for every calendar year in ``df``, which stations have gaps.

    For each year present in ``df[date_feature]`` the expected dates are built
    with ``pd.date_range`` from 1 January to 31 December at frequency ``unit``
    and compared with the calendar dates recorded for each station.  One
    summary line per year is printed.  Per-station details are sent to the
    module-level ``logger``: INFO when a station is complete, WARNING when
    dates are missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a station column and a datetime64 date column.
    loc_feature : str, default 'Location'
        Name of the station column.
    date_feature : str, default 'Date'
        Name of the datetime column (accessed through ``.dt``).
    unit : str, default 'D'
        Frequency passed to ``pd.date_range`` when building the expected dates.
    show_log : bool, default False
        When true, the per-station messages are also printed.

    Returns
    -------
    None
    """
    observed_dates = df[date_feature]
    observed_years = observed_dates.dt.year
    stations = df[loc_feature].unique()

    # Rows with a missing date have no year (NaN); they cannot define a
    # reporting period, so they are left out of the list of years.
    years = [int(value) for value in observed_years.dropna().unique()]

    # Collect the observed calendar dates once per (station, year) instead of
    # re-filtering the whole frame for every pair inside the loops.
    dates_seen = {
        (station, int(year)): set(group.dt.date)
        for (station, year), group in observed_dates.groupby(
            [df[loc_feature], observed_years], sort=False
        )
    }
    nothing_seen = frozenset()

    for year in years:
        start_date = pd.Timestamp(year=year, month=1, day=1)
        end_date = pd.Timestamp(year=year, month=12, day=31)
        expected_dates = set(pd.date_range(start=start_date, end=end_date, freq=unit).date)

        station_with_missing_data = []
        for station in stations:
            missing_dates = expected_dates - dates_seen.get((station, year), nothing_seen)

            if not missing_dates:
                message = f"All days of the year {year} are present for station {station}."
                # A complete year is a normal outcome, not an error.
                logger.info(message)
            else:
                station_with_missing_data.append(station)
                message = f"{len(missing_dates)} days are missing for station {station} in the year {year}."
                logger.warning(message)

            if show_log:
                print(message)

        if station_with_missing_data:
            print(f"Year {year} has missing data for the following stations: {station_with_missing_data}")
        else:
            print(f"Year {year} has complete data for all stations.")

def select_complete_year(df, year, loc_feature='Location', date_feature='Date'):
    """Return the rows of ``year`` that fall on days reported by every station.

    A day is kept only when the number of distinct stations recorded on that
    day equals the number of distinct stations recorded anywhere in the
    selected year.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a station column and a datetime64 date column.  It is
        not modified.
    year : int or str
        Calendar year to select, e.g. ``2015`` or ``'2015'``.
    loc_feature : str, default 'Location'
        Name of the station column.
    date_feature : str, default 'Date'
        Name of the datetime column (accessed through ``.dt``).

    Returns
    -------
    pandas.DataFrame
        The matching rows, keeping their original index, plus a ``'year'``
        column holding the year as a string.

    Raises
    ------
    ValueError
        If ``year`` cannot be converted to an integer.
    """
    target_year = int(year)

    # Compare years numerically: string comparison breaks for integer input
    # and when missing dates turn the year values into floats ('2015.0').
    in_target_year = df[date_feature].dt.year == target_year

    # ``assign`` works on a new frame, so the caller's ``df`` gains no column.
    df_filtered = df.loc[in_target_year].assign(year=str(target_year))
    if df_filtered.empty:
        return df_filtered

    # Computed once rather than once per day.
    station_count = df_filtered[loc_feature].nunique()
    stations_per_day = (
        df_filtered.groupby(df_filtered[date_feature].dt.normalize())[loc_feature]
        .transform('nunique')
    )

    complete_data = df_filtered[stations_per_day == station_count]
    return complete_data

In [6]:
# Print, year by year, which stations have gaps in their daily records.
find_year_full_data(df, loc_feature=loc_feature, date_feature=date_feature)

Year 2007 has missing data for the following stations: ['Canberra', 'Sydney', 'Adelaide', 'Hobart', 'Melbourne', 'Brisbane', 'Perth', 'Darwin', 'Albany', 'Newcastle', 'GoldCoast', 'AliceSprings', 'Albury', 'Cairns', 'MountGambier', 'Penrith', 'Ballarat', 'Bendigo', 'Wollongong', 'MountGinini', 'Townsville', 'Tuggeranong', 'Launceston', 'Moree', 'Walpole', 'BadgerysCreek', 'Williamtown', 'MelbourneAirport', 'PerthAirport', 'CoffsHarbour', 'Woomera', 'PearceRAAF', 'Nuriootpa', 'Cobar', 'Sale', 'SydneyAirport', 'WaggaWagga', 'Richmond', 'NorahHead', 'Mildura', 'NorfolkIsland', 'Witchcliffe', 'Dartmoor', 'Watsonia', 'Portland', 'SalmonGums', 'Uluru', 'Nhil', 'Katherine']
Year 2008 has missing data for the following stations: ['Sydney', 'Adelaide', 'Hobart', 'Melbourne', 'Brisbane', 'Perth', 'Darwin', 'Albany', 'Newcastle', 'GoldCoast', 'AliceSprings', 'Albury', 'Cairns', 'MountGambier', 'Penrith', 'Ballarat', 'Bendigo', 'Wollongong', 'MountGinini', 'Townsville', 'Tuggeranong', 'Launceston'