In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

In [2]:
# Read the bike-share records from the CSV file in the working directory.
bike_data = pd.read_csv(filepath_or_buffer="bikeshare.csv")

In [3]:
bike_data.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [4]:
bike_data.columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count'],
      dtype='object')

In [5]:
bike_data.shape

(10886, 12)

In [6]:
bike_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
datetime      10886 non-null object
season        10886 non-null int64
holiday       10886 non-null int64
workingday    10886 non-null int64
weather       10886 non-null int64
temp          10886 non-null float64
atemp         10886 non-null float64
humidity      10886 non-null int64
windspeed     10886 non-null float64
casual        10886 non-null int64
registered    10886 non-null int64
count         10886 non-null int64
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.7+ KB


In [7]:
bike_data.isna().sum()

datetime      0
season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      0
windspeed     0
casual        0
registered    0
count         0
dtype: int64

In [8]:
bike_data.windspeed.sum()

139334.2184

In [9]:
bike_data.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [10]:
# Parse the timestamp and take the calendar year as an integer, instead of
# slicing characters out of the raw string (which yields a string column and
# depends on the exact text layout of `datetime`).
bike_data['year'] = pd.to_datetime(bike_data['datetime']).dt.year

In [11]:
bike_data.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,2011
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2011
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2011
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,2011
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,2011


In [12]:
# Parse the timestamp and take the month number as an integer, instead of
# slicing characters 5-6 out of the raw string (which yields values such as "01").
bike_data['month'] = pd.to_datetime(bike_data['datetime']).dt.month

In [13]:
bike_data.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,2011,1
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2011,1
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2011,1
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,2011,1
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,2011,1


In [14]:
# Parse the timestamp and take the day of the month as an integer, instead of
# slicing characters 8-9 out of the raw string.
bike_data["day"] = pd.to_datetime(bike_data["datetime"]).dt.day

In [15]:
bike_data.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,day
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,2011,1,1
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2011,1,1
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2011,1,1
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,2011,1,1
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,2011,1,1


In [16]:
# Parse the timestamp and take the hour of the day as an integer, instead of
# slicing characters 11-12 out of the raw string.
bike_data["hour"] = pd.to_datetime(bike_data["datetime"]).dt.hour

In [17]:
# Replace wind-speed readings of exactly 0.0 with the mean of the non-zero
# readings. The mean is computed from the data rather than dividing the column
# sum by a hard-coded count of zero rows, and the result is assigned back
# explicitly: an in-place replace on a column selection is not guaranteed to
# write through to the parent frame.
nonzero_windspeed_mean = bike_data.loc[bike_data['windspeed'] != 0, 'windspeed'].mean()
bike_data['windspeed'] = bike_data['windspeed'].replace(0.0, nonzero_windspeed_mean)

In [18]:
bike_data.windspeed.value_counts()

14.554917    1313
8.998100     1120
11.001400    1057
12.998000    1042
7.001500     1034
15.001300     961
6.003200      872
16.997900     824
19.001200     676
19.999500     492
22.002800     372
23.999400     274
26.002700     235
27.999300     187
30.002600     111
31.000900      89
32.997500      80
35.000800      58
39.000700      27
36.997400      22
43.000600      12
40.997300      11
43.998900       8
46.002200       3
56.996900       2
47.998800       2
50.002100       1
51.998700       1
Name: windspeed, dtype: int64

In [19]:
# Remove the raw timestamp column from the frame in place.
bike_data.drop(columns=['datetime'], inplace=True)

In [20]:
bike_data.head()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,day,hour
0,1,0,0,1,9.84,14.395,81,14.554917,3,13,16,2011,1,1,0
1,1,0,0,1,9.02,13.635,80,14.554917,8,32,40,2011,1,1,1
2,1,0,0,1,9.02,13.635,80,14.554917,5,27,32,2011,1,1,2
3,1,0,0,1,9.84,14.395,75,14.554917,3,10,13,2011,1,1,3
4,1,0,0,1,9.84,14.395,75,14.554917,0,1,1,2011,1,1,4


In [21]:
bike_data.isna().sum() 

season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      0
windspeed     0
casual        0
registered    0
count         0
year          0
month         0
day           0
hour          0
dtype: int64

In [22]:
bike_data.duplicated().sum()

0

In [23]:
bike_data.workingday.unique()

array([0, 1])

In [24]:
bike_data.holiday.unique()

array([0, 1])

In [25]:
bike_data['count'].value_counts()

5      169
4      149
3      144
6      135
2      132
      ... 
667      1
603      1
587      1
970      1
843      1
Name: count, Length: 822, dtype: int64

In [26]:
bike_data.head()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,day,hour
0,1,0,0,1,9.84,14.395,81,14.554917,3,13,16,2011,1,1,0
1,1,0,0,1,9.02,13.635,80,14.554917,8,32,40,2011,1,1,1
2,1,0,0,1,9.02,13.635,80,14.554917,5,27,32,2011,1,1,2
3,1,0,0,1,9.84,14.395,75,14.554917,3,10,13,2011,1,1,3
4,1,0,0,1,9.84,14.395,75,14.554917,0,1,1,2011,1,1,4


In [27]:
bike_data['workingday'].unique()

array([0, 1])

In [28]:
bike_data.windspeed.unique()

array([14.55491679,  6.0032    , 16.9979    , 19.0012    , 19.9995    ,
       12.998     , 15.0013    ,  8.9981    , 11.0014    , 22.0028    ,
       30.0026    , 23.9994    , 27.9993    , 26.0027    ,  7.0015    ,
       32.9975    , 36.9974    , 31.0009    , 35.0008    , 39.0007    ,
       43.9989    , 40.9973    , 51.9987    , 46.0022    , 50.0021    ,
       43.0006    , 56.9969    , 47.9988    ])

In [39]:
bike_data.isna().sum()

season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      0
windspeed     0
casual        0
registered    0
count         0
year          0
month         0
day           0
hour          0
dtype: int64

In [31]:
bike_data.shape

(10886, 15)

In [37]:
# NOTE: plain assignment - df_d is a second name for the same DataFrame object
# as bike_data, not a copy.
df_d = bike_data

In [40]:
# Feature matrix: every column except the target `count`.
# NOTE(review): `casual` and `registered` are still included here; the fit
# below assigns both a coefficient of 1.0 with near-zero RMSE, which suggests
# count = casual + registered (target leakage) - confirm.
X = df_d.drop(columns=['count'])

In [41]:
X.shape

(10886, 14)

In [34]:
# Target: the `count` column, selected by name and kept as a one-column
# DataFrame. Selecting by position (the fifth column from the end) would
# silently pick a different column whenever a column is added or removed.
Y = bike_data[['count']]

In [35]:
Y.shape

(10886, 1)

In [45]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=30,
)

In [46]:
train_x.shape

(8164, 14)

In [47]:
test_x.shape

(2722, 14)

In [32]:
# Create an unfitted LinearRegression estimator with its default settings.
linear = LinearRegression()

In [48]:
# Fit the estimator on the training split.
linear.fit(X=train_x, y=train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [53]:
linear.coef_

array([[-1.88490364e-14, -5.41788836e-14,  7.90072218e-14,
        -1.27546086e-14,  2.40085729e-15,  1.63757896e-15,
         8.32667268e-17,  4.85722573e-16,  1.00000000e+00,
         1.00000000e+00, -2.48226055e-15,  9.71445147e-16,
         1.67400815e-16,  5.15212872e-16]])

In [55]:
# Predictions for the rows the model was fitted on.
lin_train_predict = linear.predict(X=train_x)

In [56]:
# Predictions for the held-out rows.
linear_test_predict = linear.predict(X=test_x)

In [57]:
# Training RMSE. Arguments are passed in the documented (y_true, y_pred) order.
np.sqrt(mean_squared_error(train_y, lin_train_predict))

9.375443779011675e-13

In [58]:
# Test RMSE. Arguments are passed in the documented (y_true, y_pred) order.
np.sqrt(mean_squared_error(test_y, linear_test_predict))

9.463399217668e-13

In [59]:
X.head()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,year,month,day,hour
0,1,0,0,1,9.84,14.395,81,14.554917,3,13,2011,1,1,0
1,1,0,0,1,9.02,13.635,80,14.554917,8,32,2011,1,1,1
2,1,0,0,1,9.02,13.635,80,14.554917,5,27,2011,1,1,2
3,1,0,0,1,9.84,14.395,75,14.554917,3,10,2011,1,1,3
4,1,0,0,1,9.84,14.395,75,14.554917,0,1,2011,1,1,4


In [60]:
# Rebuild the feature matrix without the target and without the
# `casual`, `registered` and `year` columns.
X = df_d.drop(columns=['count', 'casual', 'registered', 'year'])

In [62]:
X.head()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,month,day,hour
0,1,0,0,1,9.84,14.395,81,14.554917,1,1,0
1,1,0,0,1,9.02,13.635,80,14.554917,1,1,1
2,1,0,0,1,9.02,13.635,80,14.554917,1,1,2
3,1,0,0,1,9.84,14.395,75,14.554917,1,1,3
4,1,0,0,1,9.84,14.395,75,14.554917,1,1,4


In [63]:
# Re-split with the reduced feature matrix: 30% held out, fixed seed for repeatability.
train_x, test_x, train_y, test_y = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=25,
)

In [64]:
# Rebind `linear` to a new, unfitted LinearRegression so the next fit does not
# reuse the estimator object fitted earlier in the notebook.
linear = LinearRegression()

In [65]:
# Fit the new estimator on the reduced-feature training split.
linear.fit(X=train_x, y=train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [78]:
linear.coef_

array([[-15.69575662,  -2.22724887,  -4.24954426,  -1.95383096,
          2.15692906,   4.38761392,  -2.25334205,   0.1332569 ,
         12.10804046,   0.37785803,   7.63892015]])

In [80]:
train_x.columns

Index(['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp',
       'humidity', 'windspeed', 'month', 'day', 'hour'],
      dtype='object')

In [69]:
# Predictions for the rows the model was fitted on.
lin_train_predict = linear.predict(X=train_x)

In [70]:
# Predictions for the held-out rows.
lin_test_predict = linear.predict(X=test_x)

In [71]:
# Training RMSE. Arguments are passed in the documented (y_true, y_pred) order.
np.sqrt(mean_squared_error(train_y, lin_train_predict))

145.4548250934682

In [74]:
# Test RMSE. Arguments are passed in the documented (y_true, y_pred) order.
np.sqrt(mean_squared_error(test_y, lin_test_predict))

152.0325416286868

In [83]:
def model_fit_poly(model, train_train_X, train_test_X, train_train_y, train_test_y,
                   train_X, train_y, test_X, filename):
    """Fit ``model`` on one split and print its RMSE on that split and on a second one.

    Parameters
    ----------
    model
        Object exposing ``fit(X, y)`` and ``predict(X)``; ``fit`` is called on
        the object that is passed in.
    train_train_X, train_train_y
        Features and targets the model is fitted on; also used for the
        "Train RMSE" figure.
    train_test_X, train_test_y
        Features and targets used only for the "Test RMSE" figure.
    train_X, train_y, test_X, filename
        Accepted so existing calls keep working, but not used by the function.

    Returns
    -------
    None
        The two RMSE values are printed, not returned.
    """
    model.fit(train_train_X, train_train_y)

    train_predict = model.predict(train_train_X)
    test_predict = model.predict(train_test_X)

    # mean_squared_error is documented as (y_true, y_pred); the value is the
    # same either way for MSE, but the documented order is used here.
    train_rmse = np.sqrt(mean_squared_error(train_train_y, train_predict))
    test_rmse = np.sqrt(mean_squared_error(train_test_y, test_predict))

    print("Train RMSE : ", train_rmse)
    print("Test RMSE : ", test_rmse)

In [85]:
# Expand the features into degree-2 polynomial terms.
# The transformer is fitted once, on the training split only; the test split
# and the full feature matrix then go through transform(), so no preprocessing
# step is ever fitted on held-out data.
poly_reg = PolynomialFeatures(degree=2)
x_train_poly = poly_reg.fit_transform(train_x)
x_test_poly = poly_reg.transform(test_x)

X_poly = poly_reg.transform(X)

# Unfitted linear regression estimator for the expanded features.
pol_reg = LinearRegression()

In [87]:
# Fit the linear model on the polynomial features and print train/test RMSE.
model_fit_poly(
    model=pol_reg,
    train_train_X=x_train_poly,
    train_test_X=x_test_poly,
    train_train_y=train_y,
    train_test_y=test_y,
    train_X=X_poly,
    train_y=Y,
    test_X=x_test_poly,
    filename="linear_poly.csv",
)

Train RMSE :  126.4616417492773
Test RMSE :  134.53872866079197


In [None]:
# Scratch cell: it held only the bare name `x`, which is never defined in this
# notebook and would raise NameError under Restart & Run All.