# Investigate missing Melbourne sensor footfall data 

#### Code initialisation

In [36]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
from datetime import date, datetime

In [1]:
def check_sensor_data_completeness(sensor_number):
    
    # Get data for just one sensor
    one_sensor =  location_counts[location_counts.sensor_id == sensor_number]

    # Go for only years after 2010 where messed up data is
    one_sensor = one_sensor[one_sensor.year >2010]
    one_sensor = one_sensor.sort_values(by=['datetime'])
    
    # Save
    yearly_missing_vals = {}
    
    # Loop through each year, find the number of hourly values there are and compare this against
    # the number of values there should be in that year (calculated using daterange and the first and last
    # hour in the year)
    for year in np.sort(one_sensor.year.unique()):
        one_year = one_sensor[one_sensor.year==year]
        vals_this_year = len(one_year)
        expected_vals_this_year = len(pd.date_range(date(year, 1, 1), datetime(year, 12, 31, 23), freq = 'H'))
        yearly_missing_vals[year] = expected_vals_this_year - vals_this_year
    
    # Convert to dataframe
    yearly_missing_vals_df = pd.DataFrame.from_dict(yearly_missing_vals, orient='index', columns = [sensor_number])
    
    return yearly_missing_vals_df

def count_number_missing_blocks(sensor_number):
    this_sensor = []
    for year in location_counts.year.unique()[2:]:
        # Get data for just one sensor, in just one year (where there was just one missing value)
        one_sensor = location_counts[location_counts.sensor_id == sensor_number].copy()
        one_sensor_one_yr = one_sensor[one_sensor.year == year].copy()
        one_sensor_one_yr.reset_index(inplace=True)

        if len(one_sensor_one_yr) == 0:
            this_sensor.append(np.nan)
        else:
            # Create a new column listing the timedifference between each row and the previous row
            one_sensor_one_yr['timediff'] = one_sensor_one_yr['datetime'].diff().apply(lambda x: x/np.timedelta64(1, 'm')).fillna(0).astype('int64')

            # Set this value for the first row using a timestamp for the first Jan
            first_jan = datetime(year = year, month = 1, day=1, hour=0, minute=0, second=0)
            # If the first row is the first of Jan, then set the timediff to be 60 so this doesnt flag as a mising block of data
            if one_sensor_one_yr['datetime'].loc[0] == first_jan:
                one_sensor_one_yr.at[one_sensor_one_yr.index[0], 'timediff'] = 60
            # If it's not the fist_jan, then set it to how many hours are between that timestamp and the first Jan
            else:
                one_sensor_one_yr.at[one_sensor_one_yr.index[0], 'timediff'] = (one_sensor_one_yr['datetime'].loc[0]-first_jan).days*24                      

            # Find the number rows where the time difference with the previous row doesn't equal 60 
            number_blocks_of_rows = len(one_sensor_one_yr.loc[one_sensor_one_yr['timediff'] != 60])

            # Check the last row is the 23rd hour of 31st December
            # If its not, then add 1 to the number of missing rows of data
            last_dec = datetime(year = year, month = 12, day=31, hour=23, minute=0, second=0)  
            if one_sensor_one_yr['datetime'].loc[len(one_sensor_one_yr)-1] != last_dec:
                number_blocks_of_rows+=1

            # Add the number missing rows to the list for this sensor    
            this_sensor.append(number_blocks_of_rows)
            
    return this_sensor

#### Import pedestrian count data

In [3]:
location_counts = pd.read_csv("../Cleaned_data/allsensors.csv")

NameError: name 'pd' is not defined

# Investigate missing data

#### List the number of missing hours in each year of data for each sensor

In [53]:
# Inititalise dataframe to store the number of missing values in each year across all sensors
# and the number of blocks in which the missing values are located
missing_vals_per_year_all_sensors = pd.DataFrame(None)
missing_vals_block_sizes = pd.DataFrame(None)

# Get a list of sorted sensor numbers
sensor_numbers_sorted =  sorted(location_counts['sensor_id'].unique().tolist())

# Loop through each sensor
for sensor_number in sensor_numbers_sorted:
    # Return dataframes containing the number of missing values/blocks of missing vals in each year
    missing_vals_per_year_this_sensor = check_sensor_data_completeness(sensor_number)
    missing_vals_block_sizes_this_sensor =  count_number_missing_blocks(sensor_number)
    
    # Add to dataframes containing values for all sensors
    missing_vals_block_sizes[sensor_number] = missing_vals_block_sizes_this_sensor
    missing_vals_per_year_all_sensors[sensor_number] = missing_vals_per_year_this_sensor
    
    # Set index to year names
    missing_vals_block_sizes.index = location_counts.year.unique()
    missing_vals_per_year_all_sensors.index = location_counts.year.unique()

ValueError: Length mismatch: Expected axis has 10 elements, new values have 12 elements

### Find the number of sensors with no data in each year

In [None]:
missing_vals_per_year_all_sensors.isna().sum(axis = 1)

for sensor_number in sensor_numbers_sorted:
# missing_vals_per_year_all_sensors.loc[:,1]  
for sensor_number in sensor_numbers_sorted:
    if (missing_vals_per_year_all_sensors[sensor_number] == 0).all() == True:
        print("Full data all years")

### Find sensors which don't have any years with absoloutly no data

In [None]:
# missing_vals_per_year_all_sensors[missing_vals_per_year_all_sensors.columns[~missing_vals_per_year_all_sensors.isnull().any()]]

#### Check that where there are 0s in the  missing_blocks_sizes_df that there is also a 0 in the dataframe with the number of missing vals

In [None]:
# Set values over or equal to 1 in both dataframes
missing_block_or_not = missing_vals_block_sizes.copy()
missing_block_or_not[missing_block_or_not >= 1] = 1
missing_vals_or_not = missing_vals_per_year_all_sensors.copy()
missing_vals_or_not[missing_vals_or_not >= 1] = 1

# Check if they are the same
equality  = missing_block_or_not.eq(missing_vals_or_not)|(missing_block_or_not.isna()&missing_vals_or_not.isna())
print(np.all(equality))

In [None]:
missing_vals_per_year_all_sensors

In [None]:
missing_vals_block_sizes

### Find rows where next date is not one day later

In [None]:
# Get data for just one sensor
one_sensor =  location_counts[location_counts.sensor_id ==2].copy()
one_sensor_one_year = one_sensor[one_sensor.year==2013].copy()
one_sensor_one_year.reset_index(inplace = True, drop = True)

one_sensor_one_year['timediff'] = one_sensor_one_year['datetime'].diff().apply(lambda x: x/np.timedelta64(1, 'm')).fillna(0).astype('int64')
one_sensor_one_year[one_sensor_one_year['timediff'] != 60]

idx_of_row = one_sensor_one_year[one_sensor_one_year['timediff'] != 60].index
df = pd.DataFrame(None)
for i in range(0,len(idx_of_row)):
    idx = idx_of_row[i]
    if idx == 0:
        rows = one_sensor_one_year.loc[[idx,idx+1]]
    else:
        rows = one_sensor_one_year.loc[[idx-1,idx,idx+1]]
    df = df.append(rows)
df

### Specific datetime with glitch in lots of sensors

In [None]:
for sensor_num in sensor_numbers_sorted:

    one_sensor =  location_counts[location_counts.sensor_id ==sensor_num].copy()
    one_sensor_one_year = one_sensor[one_sensor.year==year].copy()
    one_sensor_one_year.reset_index(inplace = True, drop = True)
    one_sensor_one_year
    one_sensor_one_year['timediff'] = one_sensor_one_year['datetime'].diff().apply(lambda x: x/np.timedelta64(1, 'm')).fillna(0).astype('int64')
    first_jan = datetime(year = year, month = 1, day=1, hour=0, minute=0, second=0)
    # If the first row is the first of Jan, then set the timediff to be 60 so this doesnt flag as a mising block of data
    if one_sensor_one_year['datetime'].loc[0] == first_jan:
        one_sensor_one_year.at[one_sensor_one_year.index[0], 'timediff'] = 60    
    missing_vals_near = one_sensor_one_year[one_sensor_one_year['timediff'] != 60]
    missing_vals_near    

    print(missing_vals_near['datetime'])

### Looking at trends in sensor counts
Look to see which sensors have valid records for a full calendar year at a time. Some sensors were added at later years, and some stop working at points throughout the year