In [9]:
import geopandas as gpd
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from pykrige.ok import OrdinaryKriging
from pykrige.rk import RegressionKriging
import sys

# Put the project's modelling folder at the front of the import path so the
# helper module inside it can be imported from this notebook.
sys.path.insert(0, "../MelbourneAnalysis/3. Modelling")
# NOTE(review): a star import hides where names come from. `remove_outliers`
# (called further down) is not defined in this notebook, so it is presumably
# provided by Functions — confirm, and consider importing it explicitly.
from Functions import *

In [15]:
# Buffer size that selects which "features near sensors" CSV gets loaded later
# (the value is formatted into that file's name). Presumably a distance in
# metres, going by the `_m` suffix — TODO confirm.
buffer_size_m = 400

### Read in sensor data (keep only data from 2011 to 2019 inclusive)

In [3]:
# Load the cleaned sensor readings from disk
sensors = pd.read_csv('../MelbourneAnalysis/Cleaned_data/SensorData/allsensors.csv')
# Parse the timestamp strings into real datetime values
sensors['datetime'] = pd.to_datetime(sensors['datetime'], format='%Y-%m-%d %H:%M:%S')
# Keep only rows whose year is after 2010 and before 2020
sensors = sensors.loc[(sensors['year'] > 2010) & (sensors['year'] < 2020)]

### Remove outliers

In [10]:
# Run the project's outlier filter; its second return value is kept as `outliers`
sensors, outliers = remove_outliers(sensors)
# Discard the columns that the rest of this notebook does not use
sensors = sensors.drop(columns=['Latitude', 'Longitude', 'location', 'mdate'])
# Preview the first rows of the filtered table
sensors.head()

I found 60787 outliers from 2712150 days in total. Removing them leaves us with 2651363 events


Unnamed: 0,sensor_id,datetime,year,month,day,time,hourly_counts
0,8,2011-01-01,2011,1,Saturday,0,325
1,2,2011-01-01,2011,1,Saturday,0,985
2,3,2011-01-01,2011,1,Saturday,0,2547
3,15,2011-01-01,2011,1,Saturday,0,1581
4,11,2011-01-01,2011,1,Saturday,0,2767


### Find the mean daily count for each sensor (across the whole time period)

In [11]:
def _mean_daily_count(readings):
    """Return the mean of the per-day `hourly_counts` totals for one sensor's rows.

    `readings` is the slice of `sensors` that belongs to a single sensor_id.
    """
    # Collapse rows that share a timestamp into one summed count
    per_timestamp = readings.groupby('datetime', as_index=False)['hourly_counts'].sum()
    # Make sure the grouping key is datetime-typed before binning by calendar day
    per_timestamp['datetime'] = pd.to_datetime(per_timestamp['datetime'])
    # Total the counts within each calendar day
    per_day = per_timestamp.groupby(by=pd.Grouper(freq='D', key='datetime')).sum()
    # NOTE(review): a daily Grouper appears to emit every day between the first
    # and last reading, so days with no rows would enter the mean as zero
    # totals — confirm this is the intended definition of the daily mean.
    return per_day['hourly_counts'].mean()


# One mean per sensor, in the order the sensor ids first appear in `sensors`
daily_means = [
    _mean_daily_count(sensors[sensors['sensor_id'] == sensor_id])
    for sensor_id in sensors['sensor_id'].unique()
]

In [12]:
# Pair every sensor id with its mean daily count (both are in first-appearance order)
sensor_daily_means = pd.DataFrame(
    {'sensor_id': sensors['sensor_id'].unique()}
).assign(DailyMean=daily_means)
# Show the resulting table
sensor_daily_means

Unnamed: 0,sensor_id,DailyMean
0,8,3850.860663
1,2,19708.038333
2,3,27519.236994
3,15,18026.084880
4,11,2195.141162
...,...,...
57,58,19270.831510
58,59,6752.428571
59,60,17129.875000
60,61,12812.440217


### Add the other variables

In [16]:
# Load the three per-sensor tables that get attached to the daily means
betweenness = pd.read_csv('../MelbourneAnalysis/Cleaned_data/SpatialFeatures/sensors_betweenness.csv')
nearby_feature_counts = pd.read_csv(
    '../MelbourneAnalysis/Cleaned_data/FeaturesNearSensors/num_features_near_sensors_{}.csv'.format(buffer_size_m),
    index_col=0,
)
# Only the coordinate columns are taken from the locations file
sensor_coords = pd.read_csv("../MelbourneAnalysis/Data/FootfallData/melbourne_locations.csv",
                            index_col=0)[['Latitude', 'Longitude']]
# (A feature-subtype table, feature_subtypes_near_sensors_<buffer>.csv, was left
# disabled in the original cell and is still not merged here.)

# Left-join each table on sensor_id so every sensor with a daily mean is kept
sensor_daily_means_with_features = (
    sensor_daily_means
    .merge(betweenness, how='left', on='sensor_id')
    .merge(nearby_feature_counts, how='left', on='sensor_id')
    .merge(sensor_coords, how='left', on='sensor_id')
)

In [17]:
# Year-specific building columns to discard. The 2019 pair is not listed, so it
# is the only year that remains in the table.
floor_columns_to_drop = [
    'avg_n_floors_2010', 'avg_n_floors_2011', 'avg_n_floors_2012',
    'avg_n_floors_2013', 'avg_n_floors_2014', 'avg_n_floors_2015',
    'avg_n_floors_2016', 'avg_n_floors_2017', 'avg_n_floors_2018',
    'avg_n_floors_2020',
]
building_columns_to_drop = [
    'buildings_2010', 'buildings_2011', 'buildings_2012', 'buildings_2013',
    'buildings_2014', 'buildings_2015', 'buildings_2016', 'buildings_2017',
    'buildings_2018', 'buildings_2020',
]
sensor_daily_means_with_features = sensor_daily_means_with_features.drop(
    columns=floor_columns_to_drop + building_columns_to_drop
)
# Preview the trimmed table
sensor_daily_means_with_features.head()

Unnamed: 0,sensor_id,DailyMean,betweenness,lights,street_inf,bikes,landmarks,memorials,trees,transport_stops,bus-stops,tram-stops,metro-stations,taxi-ranks,big-car-parks,buildings_2019,avg_n_floors_2019,Latitude,Longitude
0,8,3850.860663,0.024056,2335,793,3,6,2,2175,2,0,2,0,0,4,29,11.172414,-37.822935,144.947175
1,2,19708.038333,0.041181,551,1557,2,16,20,454,1,1,0,0,18,7,373,5.86059,-37.813807,144.965167
2,3,27519.236994,0.0666,498,1331,3,16,10,455,4,0,3,1,11,9,346,6.106936,-37.811015,144.964295
3,15,18026.08488,0.01496,501,1290,3,15,11,469,4,0,3,1,11,7,323,6.517028,-37.810644,144.964471
4,11,2195.141162,7.4e-05,1205,556,1,6,1,778,2,0,2,0,1,10,36,7.166667,-37.81565,144.939707


In [18]:
# Write the combined per-sensor table to disk, leaving the row index out of the file
output_csv = "Data/sensors_daily_means_with_features.csv"
sensor_daily_means_with_features.to_csv(output_csv, index=False)