In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_log_error

### Reading data

In [2]:
# Load the training data; the first CSV column becomes the (date-parsed) index.
df = pd.read_csv(
    './data/train.csv',
    index_col=0,
    parse_dates=True,
    sep=',',
)
df.head()

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


### Train/test split

In [3]:
# Features are every column except the three rental counts; each count column
# is kept as its own target series.
target_columns = ['registered', 'casual', 'count']
X = df.drop(columns=target_columns)
y1 = df.loc[:, 'registered']
y2 = df.loc[:, 'casual']
y = df.loc[:, 'count']

In [4]:
# Same seed and test fraction for every target, so the three splits select
# the same rows and differ only in which target series they carry.
split_options = {'test_size': 0.2, 'random_state': 0}
X1_train, X1_test, y1_train, y1_test = train_test_split(X, y1, **split_options)
X2_train, X2_test, y2_train, y2_test = train_test_split(X, y2, **split_options)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

### Feature engineering

In [5]:
def _ordinal_cut(values, bins):
    """Bin ``values`` with ``pd.cut`` and label the resulting bins 1..n.

    ``bins`` is either a bin count or an explicit, increasing sequence of
    bin edges.
    """
    try:
        n_bins, explicit_edges = len(bins) - 1, True
    except TypeError:
        # A plain number has no len(): treat it as a bin count.
        n_bins, explicit_edges = int(bins), False
    return pd.cut(
        values,
        bins,
        precision=1,
        labels=list(range(1, n_bins + 1)),
        # With explicit edges a value equal to the first edge would otherwise
        # fall outside the first (left-open) interval and become NaN.
        include_lowest=explicit_edges,
    )


def feature_eng(df, temp_bins=4, humidity_bins=12, windspeed_bins=5):
    """Return a copy of ``df`` with calendar and binned-weather features added.

    Added columns:
        hour, month       -- taken from the frame's datetime index
        temp_binned       -- ``temp`` cut into ``temp_bins`` bins
        humidity_binned   -- ``humidity`` cut into ``humidity_bins`` bins
        windspeed_binned  -- ``windspeed`` cut into ``windspeed_bins`` bins

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a DatetimeIndex and ``temp``, ``humidity`` and
        ``windspeed`` columns. It is not modified.
    temp_bins, humidity_bins, windspeed_bins : int or sequence of numbers
        Bin count, or explicit bin edges. The defaults (4, 12, 5) reproduce
        the original hard-coded behaviour.

    Notes
    -----
    When a bin *count* is given, ``pd.cut`` derives the edges from the
    min/max of the frame being processed. Two frames with different value
    ranges (e.g. a train split and a test split) are then binned with
    different edges. Pass the same explicit edges for every frame to get
    consistent bins.

    Returns
    -------
    pandas.DataFrame
        The augmented copy; bin labels are the integers 1..n.
    """
    df = df.copy()
    df['hour'] = df.index.hour
    df['month'] = df.index.month
    df['temp_binned'] = _ordinal_cut(df['temp'], temp_bins)
    df['humidity_binned'] = _ordinal_cut(df['humidity'], humidity_bins)
    df['windspeed_binned'] = _ordinal_cut(df['windspeed'], windspeed_bins)
    return df

In [6]:
# Add the engineered columns to the train and test frame of every target.
X1_train_fe, X1_test_fe = feature_eng(X1_train), feature_eng(X1_test)
X2_train_fe, X2_test_fe = feature_eng(X2_train), feature_eng(X2_test)
X_train_fe, X_test_fe = feature_eng(X_train), feature_eng(X_test)

### Polynomial Features

In [7]:
def poly(X_train_fe):
    """Return ``X_train_fe`` extended with pairwise weather interaction terms.

    A fresh ``PolynomialFeatures(interaction_only=True)`` is fitted on the
    ``humidity``, ``temp`` and ``windspeed`` columns (in that order). Its
    output columns -- named ``'1'``, ``'x0'``, ``'x1'``, ``'x2'``,
    ``'x0 x1'``, ``'x0 x2'``, ``'x1 x2'`` -- are appended to the frame.

    Parameters
    ----------
    X_train_fe : pandas.DataFrame
        Frame indexed by ``datetime`` that contains the three weather
        columns. The frame is NOT modified, so calling this function
        repeatedly on the same input yields the same result.

    Returns
    -------
    pandas.DataFrame
        New frame, indexed by ``datetime``, with the extra columns joined on.
    """
    # reset_index() without inplace leaves the caller's frame untouched while
    # giving us a positional index that lines up with the transformer output.
    frame = X_train_fe.reset_index()
    pt = PolynomialFeatures(interaction_only=True)
    # Fit on a bare array so the generated names stay positional ('x0', ...)
    # instead of echoing the input column names, which would clash in join().
    weather = frame[['humidity', 'temp', 'windspeed']].to_numpy()
    p_features = pt.fit_transform(weather)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on old releases that lack it.
    if hasattr(pt, 'get_feature_names_out'):
        names = list(pt.get_feature_names_out())
    else:
        names = list(pt.get_feature_names())
    df_poly = pd.DataFrame(p_features, columns=names, index=frame.index)
    return frame.join(df_poly).set_index('datetime')

In [8]:
# Append the interaction terms to the train and test frame of every target.
X1_train_fep, X1_test_fep = poly(X1_train_fe), poly(X1_test_fe)
X2_train_fep, X2_test_fep = poly(X2_train_fe), poly(X2_test_fe)
X_train_fep, X_test_fep = poly(X_train_fe), poly(X_test_fe)

### Selecting features

In [9]:
# List every column available after feature engineering and poly(), as a
# reference for choosing which ones select() drops.
X1_train_fep.columns

Index(['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp',
       'humidity', 'windspeed', 'hour', 'month', 'temp_binned',
       'humidity_binned', 'windspeed_binned', '1', 'x0', 'x1', 'x2', 'x0 x1',
       'x0 x2', 'x1 x2'],
      dtype='object')

In [10]:
def select(df):
    """Return a copy of ``df`` without the columns excluded from modelling.

    Dropped: the raw ``holiday``, ``season``, ``humidity``, ``windspeed``,
    ``temp`` and ``atemp`` columns, plus the bias (``'1'``) and single-variable
    (``'x0'``, ``'x1'``, ``'x2'``) columns added by ``poly``. Every other
    column is kept in its original order.

    Raises ``KeyError`` if any of the listed columns is missing from ``df``.
    The input frame is left unchanged.
    """
    unwanted = [
        'holiday', 'season',
        'humidity', 'windspeed', 'temp', 'atemp',
        '1', 'x0', 'x1', 'x2',
    ]
    return df.drop(columns=unwanted)

In [11]:
# Reduce every train and test frame to the columns the models are fitted on.
X1_train_feps, X1_test_feps = select(X1_train_fep), select(X1_test_fep)
X2_train_feps, X2_test_feps = select(X2_train_fep), select(X2_test_fep)
X_train_feps, X_test_feps = select(X_train_fep), select(X_test_fep)

### Random Forest Regressor

In [12]:
# One forest per target: rf1 -> registered, rf2 -> casual, rf3 -> total count.
# random_state is fixed (matching the seed used for train_test_split) so that
# Restart & Run All reproduces the same forests, scores and submission.
rf1 = RandomForestRegressor(max_depth=8, n_estimators=300, random_state=0)
rf2 = RandomForestRegressor(max_depth=8, n_estimators=300, random_state=0)
rf3 = RandomForestRegressor(max_depth=8, n_estimators=300, random_state=0)

In [13]:
# Fit each forest on the selected training features and its own target.
rf1.fit(X=X1_train_feps, y=y1_train)
rf2.fit(X=X2_train_feps, y=y2_train)
rf3.fit(X=X_train_feps, y=y_train)

RandomForestRegressor(max_depth=8, n_estimators=300)

In [14]:
# Training-split score of the 'registered' model.
rf1.score(X=X1_train_feps, y=y1_train)

0.8358168102622233

In [15]:
# Held-out score of the 'registered' model.
rf1.score(X=X1_test_feps, y=y1_test)

0.8232077830374682

In [16]:
# Training-split score of the 'casual' model.
rf2.score(X=X2_train_feps, y=y2_train)

0.8473700832878018

In [17]:
# Held-out score of the 'casual' model.
rf2.score(X=X2_test_feps, y=y2_test)

0.827575070039604

In [18]:
# Training-split score of the total-'count' model.
rf3.score(X=X_train_feps, y=y_train)

0.8201510163673891

In [19]:
# Held-out score of the total-'count' model.
rf3.score(X=X_test_feps, y=y_test)

0.8051836725391428

In [20]:
# In-sample 'registered' predictions (made on the training features).
y1_pred = rf1.predict(X=X1_train_feps)

In [21]:
# In-sample 'casual' predictions (made on the training features).
y2_pred = rf2.predict(X=X2_train_feps)

In [22]:
# Predicted total = predicted registered + predicted casual, element-wise.
ypred = np.add(y1_pred, y2_pred)

In [23]:
# Display the combined (registered + casual) predictions.
ypred

array([407.913758  , 635.8995764 ,  84.34958251, ..., 481.87294427,
       184.80624305, 444.99432728])

In [24]:
# RMSLE of the combined predictions against the total count.
# NOTE(review): ypred is built from predictions on the *training* features and
# is compared with y_train, so this is an in-sample error, not a held-out one.
np.sqrt(mean_squared_log_error(y_true=y_train, y_pred=ypred))

0.4147567133560897

### Kaggle

In [25]:
# Load the Kaggle test set the same way as the training data. The separator
# must be passed by keyword: positional arguments after the path are
# deprecated in pandas 1.x and rejected by pandas 2.x.
X_kaggle = pd.read_csv('./data/test.csv', sep=',', parse_dates=True, index_col=0)

In [26]:
# Run the Kaggle frame through the same three preparation steps as the
# training data, then predict the total as registered + casual.
X_kaggle_fe = feature_eng(X_kaggle)
X_kaggle_fep = poly(X_kaggle_fe)
X_kaggle_feps = select(X_kaggle_fep)
y_kaggle = np.add(rf1.predict(X_kaggle_feps), rf2.predict(X_kaggle_feps))
y_kaggle

array([ 15.41074174,   6.0794099 ,   3.61105667, ..., 165.45652399,
       123.94309148,  88.96199684])

In [27]:
# Assemble the two-column submission (timestamp, predicted count) and write it
# without the positional row index.
submission_columns = {
    'datetime': X_kaggle_feps.index,
    'count': y_kaggle,
}
output = pd.DataFrame(submission_columns)
output.to_csv('./data/bikes.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
