In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor

In [3]:
df = pd.read_csv('2018_features.csv', index_col='timestamp', parse_dates=True)

In [52]:
df = df[df.index < '2018-06-28']

In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 12658972 entries, 2018-01-01 00:00:02 to 2018-06-27 23:58:02
Data columns (total 9 columns):
station_id     int64
bikes          int64
spaces         int64
total_docks    int64
bike_lag       int64
hour           int64
day_of_week    int64
month          int64
deposit        int64
dtypes: int64(9)
memory usage: 965.8 MB


In [48]:
validation_set = df[df.index > '2018-06-28']

In [54]:
validation_set.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 54936 entries, 2018-06-28 00:00:03 to 2018-06-28 16:46:02
Data columns (total 9 columns):
station_id     54936 non-null int64
bikes          54936 non-null int64
spaces         54936 non-null int64
total_docks    54936 non-null int64
bike_lag       54936 non-null int64
hour           54936 non-null int64
day_of_week    54936 non-null int64
month          54936 non-null int64
deposit        54936 non-null int64
dtypes: int64(9)
memory usage: 4.2 MB


In [89]:
validation_set.to_csv('2017-06-25.csv')

In [90]:
confirm = validation_set['deposit']

In [91]:
validation_to_set = validation_set.drop('deposit', axis=1)

In [55]:
X = df.drop('deposit', axis=1)
y = df['deposit']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [60]:
etr = ExtraTreesRegressor(n_estimators=50, n_jobs=-1, random_state=0)

In [61]:
model = etr.fit(X_train, y_train)

In [62]:
predictions = model.predict(X_test)

In [63]:
print(predictions)

[ 0. -1.  0. ...  0.  0.  0.]


In [64]:
etr.score(predictions, y_test)

ValueError: Expected 2D array, got 1D array instead:
array=[ 0. -1.  0. ...  0.  0.  0.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
importances = etr.feature_importances_
std = np.std([tree.feature_importances_ for tree in model.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1][:20]

In [None]:
print(etr.feature_importances_)

In [None]:
X_test.columns

In [None]:


plt.figure(figsize=(14,10))
plt.title("Feature importances")
plt.bar(range(len(indices)), importances[indices],
       color="b", yerr=std[indices], align="center")
plt.xticks(range(len(indices)), indices)
plt.xlim([-1, len(indices)])
plt.show()



In [33]:
from sklearn.model_selection import  cross_val_score

In [34]:
results = cross_val_score(etr, X_train, y_train, cv=5, scoring='r2')

In [35]:
print("ExtraTrees score: %.4f (%.4f)" % (results.mean(), results.std()))

ExtraTrees score: 0.9997 (0.0002)


In [22]:
etr.predict(validate_model)

array([-4.,  0., -1.])

In [39]:
df.cov()

Unnamed: 0,station_id,bikes,spaces,total_docks,bike_lag,hour,day_of_week,month,deposit
station_id,915.724505,42.961003,79.11805,122.855322,42.961085,-0.019525,0.030966,2.049063,-8e-06
bikes,42.961003,125.941056,-103.880218,22.133717,125.741405,-1.596171,0.503723,-0.799205,0.199643
spaces,79.11805,-103.880218,138.540129,34.093692,-103.680775,1.575717,-0.477138,1.069336,-0.199435
total_docks,122.855322,22.133717,34.093692,56.485592,22.13354,-0.006431,0.007785,0.274432,0.000178
bike_lag,42.961085,125.741405,-103.680775,22.13354,125.94106,-1.618719,0.5037,-0.799111,-0.199637
hour,-0.019525,-1.596171,1.575717,-0.006431,-1.618719,47.91,0.09816,-0.032127,0.022565
day_of_week,0.030966,0.503723,-0.477138,0.007785,0.5037,0.09816,4.010077,0.056088,2.7e-05
month,2.049063,-0.799205,1.069336,0.274432,-0.799111,-0.032127,0.056088,2.97975,-9.1e-05
deposit,-8e-06,0.199643,-0.199435,0.000178,-0.199637,0.022565,2.7e-05,-9.1e-05,0.39928


In [41]:
df.corr()

Unnamed: 0,station_id,bikes,spaces,total_docks,bike_lag,hour,day_of_week,month,deposit
station_id,1.0,0.126505,0.222129,0.540185,0.126505,-9.3e-05,0.000511,0.039227,-4.151649e-07
bikes,0.1265052,1.0,-0.786432,0.262423,0.998415,-0.020549,0.022415,-0.041256,0.02815349
spaces,0.2221291,-0.786432,1.0,0.385405,-0.784922,0.019341,-0.020243,0.05263,-0.02681481
total_docks,0.5401852,0.262423,0.385405,1.0,0.262421,-0.000124,0.000517,0.021153,3.75482e-05
bike_lag,0.1265054,0.998415,-0.784922,0.262421,1.0,-0.020839,0.022414,-0.041251,-0.02815259
hour,-9.321655e-05,-0.020549,0.019341,-0.000124,-0.020839,1.0,0.007082,-0.002689,0.005159149
day_of_week,0.0005110012,0.022415,-0.020243,0.000517,0.022414,0.007082,1.0,0.016226,2.162989e-05
month,0.03922681,-0.041256,0.05263,0.021153,-0.041251,-0.002689,0.016226,1.0,-8.310203e-05
deposit,-4.151649e-07,0.028153,-0.026815,3.8e-05,-0.028153,0.005159,2.2e-05,-8.3e-05,1.0


In [42]:
df.mad()

station_id     26.151989
bikes           9.515925
spaces          9.921847
total_docks     6.342524
bike_lag        9.515925
hour            5.999742
day_of_week     1.725882
month           1.520204
deposit         0.195054
dtype: float64

In [92]:
well = model.predict(validation_to_set)

In [76]:
type(confirm)

pandas.core.series.Series

In [83]:
type(well)

numpy.ndarray

In [94]:
well = pd.DataFrame(well, dtype='str', index=well)

In [107]:
well.to_csv('well.csv')

In [101]:
vs = pd.read_csv('b4bedtest.csv', index_col='timestamp', parse_dates=True)

In [102]:
confirm = vs['deposit']

In [103]:
validation_to_set = vs.drop('deposit', axis=1)

In [104]:
predictions = model.predict(validation_to_set)

In [106]:
well = pd.DataFrame(predictions, dtype='str', index=predictions)