In [1]:
from data_handling import *
from datetime import datetime
import numpy as np

In [2]:
station_path = 'Train/Train/station_201_deploy.csv'

In [3]:
station_data = load_data(station_path)

Dropping features that are redundant for **_individual_** stations (will become important later)

In [4]:
redundant_features = ['station', 'latitude', 'longitude', 'numDocks', 'year']
station_data = station_data.drop(columns=redundant_features)

- Checking for null values in timestamp column
- Ordering values by timestamp

Note: can already see the data includes days from September

In [5]:
station_data = station_data.sort_values(by=['timestamp'])

Displaying station_data again to check the ordering is correct...

In [6]:
station_data.head()

Unnamed: 0,timestamp,month,day,hour,weekday,weekhour,isHoliday,windMaxSpeed.m.s,windMeanSpeed.m.s,windDirection.grades,temperature.C,relHumidity.HR,airPressure.mb,precipitation.l.m2,bikes_3h_ago,full_profile_3h_diff_bikes,full_profile_bikes,short_profile_3h_diff_bikes,short_profile_bikes,bikes
0,1412114000.0,10,1,0,Wednesday,49,0,11.3,3.2,67.5,21.3,85.0,855.3,0.0,,,,,,1.0
1,1412118000.0,10,1,1,Wednesday,50,0,1.6,0.0,157.5,21.1,86.0,1000.6,0.0,,,,,,0.0
2,1412122000.0,10,1,2,Wednesday,51,0,1.6,0.0,112.5,20.9,86.0,880.6,0.0,,,,,,0.0
3,1412125000.0,10,1,3,Wednesday,52,0,0.0,0.0,146.3,20.4,88.0,859.8,0.0,1.0,,,,,0.0
4,1412129000.0,10,1,4,Wednesday,53,0,6.4,3.2,157.5,20.3,87.0,898.1,0.0,0.0,,,,,0.0


In [7]:
assert(are_nulls(station_data, 'timestamp')==False)

In [8]:
def get_timestamp(df, index):
    '''
    Input: dataframe, row index
    Output: day and hour from corresponding timestamp as ints
    '''
    ts = df['timestamp'].iloc[index]
    date = datetime.utcfromtimestamp(ts)
    return date.day, date.hour

In [9]:
assert(are_nulls(station_data, 'temperature.C'))

In [10]:
def get_value(df, feature, index):
    return df[feature].iloc[index]

In [21]:
def next_nonnull(df, feature, index, direction):
    '''
    Input: dataframe, feature (i.e. column), index and direction (forward/backwards)
    Output: next non null feature value
    '''
    if(direction=='forward'):
        for i in range(index + 1, len(df.index)):
            if(np.isnan(get_value(df, feature, i))==False):
                return float(get_value(df, feature, i))
    elif(direction=='backward'):
        for i in range((index-1), 0, -1):
            if(np.isnan(get_value(df, feature, i))==False):
                return float(get_value(df, feature, i))
    else:
        raise Exception("Error: Couldn't find a non-null value")


In [22]:
print(next_nonnull(station_data, 'temperature.C', 1, 'backward'))

None


In [13]:
print(next_nonnull(station_data, 'temperature.C', 1, 'forward'))

20.9


In [353]:
# Still more edge cases to deal with here...

def average_datapoints(df, feature, index, final):
    prev = index - 1
    foll = index + 1
    # check the nan is not the first datapoint
    if(index == 0):
        # if it is, take next non-null value as the nan replacement
        average = next_nonnull(df, feature, foll, 'forward')
    # check the nan is not the final datapoint
    elif(index == final):
        # if it is, take the next non-null value as the nan reaplacement
        average = next_nonnull(df, feature, prev, 'backward')
    else:
        previous = next_nonnull(df, feature, prev, 'backward')
        following = next_nonnull(df, feature, foll, 'forward')
        average = (previous + following) / 2
        
    return average

In [370]:
total_datapoints = len(station_data.index)
value = average_datapoints(station_data, 'temperature.C', 1, total_datapoints)
value

20.75

Some very shoddy testing...

In [354]:
assert(station_data['temperature.C'].loc[4]==20.3)
assert(station_data['temperature.C'].loc[6]==19.6)
assert(average_datapoints(station_data,'temperature.C', 5, len(station_data.index)) == 19.9)

In [355]:
total_datapoints = len(station_data.index)
# get all locations of null values for a given feature
null_locs = get_null_locs(station_data, 'temperature.C')
for index in null_locs:
    feature_value = average_datapoints(station_data, 'temperature.C', index, total_datapoints)
    station_data['temperature.C'].loc[index] = feature_value

In [356]:
station_data['temperature.C'].loc[603]

18.450000000000003

In [357]:
station_data['temperature.C'].loc[602]

18.6

In [308]:
(18.6 + 19.0) / 2

18.8