In [None]:
# Relative path of the station CSV that is passed to `load_data` below.
STATION_PATH = 'Train/Train/station_201_deploy.csv'

In [None]:
# `pd` is used further down (to rebuild DataFrames after imputation) but was
# never imported in this notebook; import it explicitly instead of relying on
# it leaking out of the wildcard import. It is placed before the wildcard so
# any binding the wildcard provides still takes precedence, as before.
import pandas as pd

# NOTE(review): the wildcard import hides where names come from; presumably
# `load_data` and `split_data` are the helpers taken from this module —
# consider importing them by name once that is confirmed.
from data_handling import *

# Load the raw station data from the CSV path configured above.
station_data = load_data(STATION_PATH)

In [None]:
from datetime import datetime, timezone

# Value at column position 4 of the row labelled 0, as an integer timestamp.
# `.iloc` makes the positional lookup explicit: a bare integer key on a
# Series indexed by column names is deprecated in recent pandas.
ts = int(station_data.loc[0].iloc[4])

# if you encounter a "year is out of range" error the timestamp
# may be in milliseconds, try `ts /= 1000` in that case
print('Unix Timestamp:', ts)
# `datetime.utcfromtimestamp` is deprecated since Python 3.12; the
# timezone-aware call below formats to the same UTC wall-clock string.
print('Timestamp conversion:', datetime.fromtimestamp(ts, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S'))

In [None]:
# Preview the first rows of the loaded station data.
station_data.head()

In [None]:
# Preview the last rows of the loaded station data.
station_data.tail()

In [None]:
# Column dtypes and non-null counts; used to spot columns with missing values.
station_data.info()

___
**RUNNING TODO: Take care of the null values**
___


In [None]:
# Value frequencies for four columns, displayed together as one tuple.
tuple(
    station_data[column].value_counts()
    for column in ('station', 'longitude', 'latitude', 'numDocks')
)

___
**RUNNING TODO: Consider which values are repeated and separate categorical data**
___

In [None]:
# Summary statistics for the columns of the station data.
station_data.describe()

In [None]:
# Partition the station data into a training set and a dev set.
# NOTE(review): `split_data` is not defined in the visible cells — confirm it
# is seeded so the same split is produced after a kernel restart.
train, dev = split_data(station_data)

In [None]:
# (rows, columns) of each partition, to sanity-check the split sizes.
print(train.shape, dev.shape)

## Linear Regression - Practise Fitting Model

First, make a copy of the training data but drop all columns/features except the 'temperature.C' feature. This should produce a 1-dimensional training set, as below:

In [326]:
# Single-feature training frame. `.copy()` makes this an independent frame
# (the same way the label frames are built) instead of a selection that may
# still be tied to `train`.
train_copy = train[['temperature.C']].copy()
train_copy

Unnamed: 0,temperature.C
0,21.3
1,21.1
3,20.4
4,20.3
6,19.6
...,...
740,20.2
741,20.2
742,20.0
743,19.3


Next, let's check for any null values.

In [343]:
def find_null(train_copy, feature):
    """Print whether a column holds missing values, and how many.

    Args:
        train_copy: DataFrame to inspect.
        feature: Name of the column to check.

    Returns:
        None. A two-line report is written to stdout.
    """
    # Build the null mask once and derive both figures from it.
    null_mask = train_copy[feature].isnull()
    report = 'Null value: {boolean}\nNull value count: {count}'
    print(report.format(boolean=null_mask.values.any(), count=null_mask.sum()))

In [344]:
# Check the training temperature feature for missing values.
find_null(train_copy, 'temperature.C')

Null value: False
Null value count: 0


In [345]:
# Display the temperature column of the training features.
train_copy['temperature.C']

0      21.3
1      21.1
2      20.4
3      20.3
4      19.6
       ... 
592    20.2
593    20.2
594    20.0
595    19.3
596    19.1
Name: temperature.C, Length: 597, dtype: float64

Let's fit a simple imputer to fill in any missing values (note: this could be the wrong thing to do; it may be better to remove the affected data points instead...)

In [335]:
from sklearn.impute import SimpleImputer

# Learn the median of the temperature column; `fit` returns the imputer
# itself, so construction and fitting are chained.
imputer = SimpleImputer(strategy="median").fit(train_copy[['temperature.C']])
# Show the learned fill value.
imputer.statistics_

array([21.3])

In [336]:
# Apply the fitted imputer and rebuild the DataFrame from the transformed
# values. The original index is passed back in explicitly; without it the
# rows are silently relabelled 0..n-1 and no longer match the row labels
# in `train`.
X = imputer.transform(train_copy[['temperature.C']])
train_copy = pd.DataFrame(X, columns=train_copy.columns, index=train_copy.index)

In [346]:
# Re-check after imputation: no nulls should remain in the feature.
find_null(train_copy, 'temperature.C')

Null value: False
Null value count: 0


Yay! This seems to have worked...

Now, let's store our labels in a new dataframe

In [338]:
# Target values: the `bikes` column as its own one-column DataFrame
# (an explicit copy of the selection from `train`).
train_copy_labels = train[['bikes']].copy()
train_copy_labels

Unnamed: 0,bikes
0,1.0
1,0.0
3,0.0
4,0.0
6,0.0
...,...
740,0.0
741,2.0
742,7.0
743,9.0


In [348]:
# Check the training labels for missing values.
find_null(train_copy_labels, 'bikes')

Null value: True
Null value count: 1


In [351]:
from sklearn.impute import SimpleImputer
# NOTE(review): this rebinds `imputer`, replacing the median imputer that was
# fitted on the temperature feature earlier in the notebook.
# NOTE(review): filling a missing *target* with the most frequent value
# invents a label; consider dropping that row from both the features and the
# labels instead — confirm which is intended.
imputer = SimpleImputer(strategy="most_frequent")
imputer.fit(train_copy_labels[['bikes']])
# Show the learned fill value (the most frequent `bikes` value).
imputer.statistics_

array([0.])

In [352]:
# Apply the fitted imputer to the labels and rebuild the DataFrame from the
# transformed values. The original index is passed back in explicitly;
# without it the rows are silently relabelled 0..n-1 and no longer match the
# row labels in `train`.
X = imputer.transform(train_copy_labels[['bikes']])
train_copy_labels = pd.DataFrame(X, columns=train_copy_labels.columns, index=train_copy_labels.index)

In [353]:
# Re-check after imputation: the labels should now be free of nulls.
find_null(train_copy_labels, 'bikes')

Null value: False
Null value count: 0


Ok, now we have **_imputed_** the labels too and are ready to fit our model.

But first, let's check that the training set and the labels are the same size

In [358]:
# Features and labels must describe the same rows. Row counts are compared
# rather than full shapes, so the check keeps working once the feature frame
# has more than one column.
assert len(train_copy) == len(train_copy_labels), (
    f'row mismatch: {len(train_copy)} feature rows vs {len(train_copy_labels)} label rows'
)

In [359]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression of the `bikes` labels on the temperature feature.
reg = LinearRegression()
# The labels are passed as a one-column DataFrame, so `predict` later returns
# a 2-D array with one column (see the recorded prediction outputs).
reg.fit(train_copy, train_copy_labels)

LinearRegression()

In [371]:
# Spot-check a single temperature. The value is wrapped in a DataFrame that
# carries the training column names, which avoids scikit-learn's
# "X does not have valid feature names" warning for a model fitted on a
# DataFrame; the predicted value is unchanged.
reg.predict(pd.DataFrame([[20]], columns=train_copy.columns))

array([[4.51839904]])

Now, let's prepare the dev set so we can evaluate performance.

In [372]:
# Single-feature dev frame. `.copy()` makes this an independent frame (the
# same way the label frames are built) instead of a selection that may still
# be tied to `dev`.
dev_copy = dev[['temperature.C']].copy()
dev_copy

Unnamed: 0,temperature.C
2,20.9
5,20.1
12,22.8
16,23.3
23,22.5
...,...
714,21.9
717,20.1
726,17.9
728,18.3


In [373]:
# Check the dev temperature feature for missing values.
find_null(dev_copy, 'temperature.C')

Null value: False
Null value count: 0


In [375]:
# Dev targets: the `bikes` column as its own one-column DataFrame
# (an explicit copy of the selection from `dev`).
dev_copy_labels = dev[['bikes']].copy()
dev_copy_labels

Unnamed: 0,bikes
2,0.0
5,0.0
12,7.0
16,10.0
23,3.0
...,...
714,12.0
717,3.0
726,0.0
728,1.0


In [377]:
# Check the dev labels for missing values.
find_null(dev_copy_labels, 'bikes')

Null value: False
Null value count: 0


In [388]:
# Pull out the true bike counts as a Series and score the dev temperatures
# with the fitted model; the two statements are independent of each other.
ground_truth = dev_copy_labels.loc[:, 'bikes']
preds = reg.predict(dev_copy)
print(ground_truth)

2       0.0
5       0.0
12      7.0
16     10.0
23      3.0
       ... 
714    12.0
717     3.0
726     0.0
728     1.0
735     1.0
Name: bikes, Length: 148, dtype: float64


Let's visually compare our predictions and ground truths

In [400]:
# First ten dev temperatures (the model inputs), for side-by-side comparison.
dev_copy['temperature.C'][:10]

2     20.9
5     20.1
12    22.8
16    23.3
23    22.5
25    21.6
30    19.6
35    22.7
36    23.3
42    22.6
Name: temperature.C, dtype: float64

In [401]:
# First ten predictions, in the same row order as the dev inputs.
preds[:10]

array([[4.41665703],
       [4.50709437],
       [4.20186833],
       [4.14534499],
       [4.23578234],
       [4.33752435],
       [4.56361771],
       [4.213173  ],
       [4.14534499],
       [4.22447767]])

In [396]:
# First ten true bike counts for the same dev rows.
ground_truth[:10]

2      0.0
5      0.0
12     7.0
16    10.0
23     3.0
25     0.0
30     0.0
35     8.0
36     5.0
42     1.0
Name: bikes, dtype: float64

In [None]:
from sklearn.metrics import mean_absolute_error

# One prediction per dev label is required before scoring.
assert len(preds) == len(ground_truth)
# Mean absolute error between the true bike counts and the predictions.
error = mean_absolute_error(y_true=ground_truth, y_pred=preds)
error

Let's manually feed one of the dev inputs back into the model to check that the prediction is consistent with the batch results above (this is only a loose sanity check; you'll need to be more rigorous than this going forward...)

In [411]:
# Re-predict one dev temperature on its own to compare with the batch result.
# The value is wrapped in a DataFrame that carries the training column names,
# which avoids scikit-learn's "X does not have valid feature names" warning
# for a model fitted on a DataFrame; the predicted value is unchanged.
reg.predict(pd.DataFrame([[20.1]], columns=train_copy.columns))

array([[4.50709437]])