In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import skew
from sklearn.linear_model import LassoCV
from sklearn.cross_validation import cross_val_score
from sklearn.feature_selection import SelectFromModel
%matplotlib inline

In [59]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.dtypes

ID                                int64
Park_ID                           int64
Date                             object
Direction_Of_Wind               float64
Average_Breeze_Speed            float64
Max_Breeze_Speed                float64
Min_Breeze_Speed                float64
Var1                            float64
Average_Atmospheric_Pressure    float64
Max_Atmospheric_Pressure        float64
Min_Atmospheric_Pressure        float64
Min_Ambient_Pollution           float64
Max_Ambient_Pollution           float64
Average_Moisture_In_Park        float64
Max_Moisture_In_Park            float64
Min_Moisture_In_Park            float64
Location_Type                     int64
Footfall                          int64
dtype: object




In [60]:
all_data = pd.concat((train.loc[:, 'Park_ID':'Location_Type'],
                      test.loc[:, 'Park_ID':'Location_Type']))

In [61]:
all_data['Day'] = all_data.Date.str.split('-').str[0].astype(int)
all_data['Month'] = all_data.Date.str.split('-').str[1].astype(int)
all_data['Year'] = all_data.Date.str.split('-').str[2].astype(int)
all_data = all_data.drop('Date', axis = 1)

train['Day'] = train.Date.str.split('-').str[0].astype(int)
train['Month'] = train.Date.str.split('-').str[1].astype(int)
train['Year'] = train.Date.str.split('-').str[2].astype(int)
train = train.drop('Date', axis = 1)


'''
for i, j in enumerate(li):
    li[i]= int("".join(reversed(j)))
all_data['Date'] = pd.Series(li)

li = train.Date.str.split('-').tolist()
for i, j in enumerate(li):
    li[i]= int("".join(reversed(j)))
train['Date'] = pd.Series(li)
'''
all_data.describe()

Unnamed: 0,Park_ID,Direction_Of_Wind,Average_Breeze_Speed,Max_Breeze_Speed,Min_Breeze_Speed,Var1,Average_Atmospheric_Pressure,Max_Atmospheric_Pressure,Min_Atmospheric_Pressure,Min_Ambient_Pollution,Max_Ambient_Pollution,Average_Moisture_In_Park,Max_Moisture_In_Park,Min_Moisture_In_Park,Location_Type,Day,Month,Year
count,153959.0,148535.0,148535.0,148530.0,148532.0,142757.0,100591.0,100591.0,100591.0,112659.0,112659.0,153880.0,153880.0,153880.0,153959.0,153959.0,153959.0,153959.0
mean,25.623088,179.510661,34.295282,51.77507,17.306102,18.680243,8331.78675,8356.396705,8305.855196,165.885442,307.784127,247.52511,284.184124,201.367338,2.620958,15.727486,6.608721,1997.766633
std,8.100157,85.107906,17.479606,22.150405,14.450117,37.983644,79.980197,75.206795,86.042255,90.865036,36.867967,28.412464,15.588646,45.533684,0.963888,8.798959,3.462217,4.420105
min,12.0,1.0,3.04,7.6,0.0,0.0,7982.0,8006.0,7890.0,4.0,8.0,90.0,129.0,48.0,1.0,1.0,1.0,1990.0
25%,18.0,,,,,,,,,,,,,,2.0,8.0,4.0,1994.0
50%,26.0,,,,,,,,,,,,,,3.0,16.0,7.0,1998.0
75%,33.0,,,,,,,,,,,,,,3.0,23.0,10.0,2002.0
max,39.0,360.0,154.28,220.4,129.2,1181.09,8588.0,8601.0,8571.0,348.0,356.0,300.0,300.0,300.0,4.0,31.0,12.0,2005.0


In [66]:
for c in train.columns:
    print c,':',skew(train[c].dropna())

ID : 0.0137381300174
Park_ID : -0.0230211623087
Direction_Of_Wind : -0.3388085786
Average_Breeze_Speed : 1.29359512201
Max_Breeze_Speed : 1.10897260967
Min_Breeze_Speed : 1.55376410602
Var1 : 4.33791017285
Average_Atmospheric_Pressure : -0.28550255553
Max_Atmospheric_Pressure : -0.209862019156
Min_Atmospheric_Pressure : -0.344110614821
Min_Ambient_Pollution : -0.29505495956
Max_Ambient_Pollution : -1.8838785083
Average_Moisture_In_Park : -0.801725261022
Max_Moisture_In_Park : -2.07353862117
Min_Moisture_In_Park : -0.34311366811
Location_Type : -0.165165356354
Footfall : -0.195627064184
Day : 0.00673207126918
Month : -0.0552472945254
Year : -0.00533454546831


In [67]:
# For right skewed data
skewed_feats = train.apply(lambda x: skew(x.dropna()))
skewed_feats = skewed_feats[skewed_feats > 0.8]
skewed_feats = skewed_feats.index

all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

# For left skewed data
skewed_feats_left = train.apply(lambda x: skew(x.dropna()))
skewed_feats_left = skewed_feats_left[skewed_feats_left < -0.8]
skewed_feats_left = skewed_feats_left.index

all_data[skewed_feats_left] = np.power(all_data[skewed_feats_left], 3)

In [68]:
all_data = all_data.fillna(all_data.median())
X_train = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]
y = train.Footfall

In [71]:
def rmse_cv(model):
    rmse = np.sqrt(-cross_val_score(model, X_train, y,
                   scoring="mean_squared_error", cv=5))
    return rmse

In [72]:
from sklearn.linear_model import LinearRegression, Ridge
print rmse_cv(LassoCV(alphas=[1, 0.1, 0.001, 0.0005], tol = 0.0005))
print rmse_cv(LinearRegression())
print rmse_cv(Ridge(10))

[ 204.5385512   200.91258685  202.54268193  202.35400445  199.61264418]
[ 204.53867791  200.91264532  202.36137507  202.08913721  199.36928278]
[ 204.5366634   200.91176626  202.36289577  202.09009177  199.36993901]


In [74]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
print rmse_cv(GradientBoostingRegressor())

[  92.74486961   98.69241277  101.7008459   104.52038268  105.85062503]


In [21]:
model = LassoCV(alphas=[1, 0.1, 0.001, 0.0005])
model.fit(X_train.drop['Max_Breeze_Speed', 'Min_Breeze_Speed', 'Max_Atmospheric_Pressure', 'Min_Atmospheric_Pressure', 'Max_Moisture_In_Park', 'Min_Moisture_In_Park'], y)
preds = model.predict(X_test.drop['Max_Breeze_Speed', 'Min_Breeze_Speed', 'Max_Atmospheric_Pressure', 'Min_Atmospheric_Pressure', 'Max_Moisture_In_Park', 'Min_Moisture_In_Park'])
solution = pd.DataFrame({"ID": test.ID, "Footfall": preds})
solution.to_csv("Try2.csv", index=False)

In [22]:
model.coef_

array([ -1.71548036e+00,  -3.19198764e-04,   3.26111571e-01,
        -1.78835384e+01,  -0.00000000e+00,   2.38118879e+00,
         1.84701714e+01,  -8.20287457e-02,  -1.77138793e+00,
         1.42367539e+00,   3.74999867e-02,   1.71055573e-06,
        -1.98599273e-05,   1.54304831e-05,  -9.14551001e-01,
        -4.04628492e+00])

In [23]:
X_train.columns

Index([u'Park_ID', u'Date', u'Direction_Of_Wind', u'Average_Breeze_Speed',
       u'Max_Breeze_Speed', u'Min_Breeze_Speed', u'Var1',
       u'Average_Atmospheric_Pressure', u'Max_Atmospheric_Pressure',
       u'Min_Atmospheric_Pressure', u'Min_Ambient_Pollution',
       u'Max_Ambient_Pollution', u'Average_Moisture_In_Park',
       u'Max_Moisture_In_Park', u'Min_Moisture_In_Park', u'Location_Type'],
      dtype='object')

In [24]:
for c in train.columns:
    print c,':',skew(train[c].dropna())

ID : 0.0137381300174
Park_ID : -0.0230211623087
Date : -0.00301648880143
Direction_Of_Wind : -0.3388085786
Average_Breeze_Speed : 1.29359512201
Max_Breeze_Speed : 1.10897260967
Min_Breeze_Speed : 1.55376410602
Var1 : 4.33791017285
Average_Atmospheric_Pressure : -0.28550255553
Max_Atmospheric_Pressure : -0.209862019156
Min_Atmospheric_Pressure : -0.344110614821
Min_Ambient_Pollution : -0.29505495956
Max_Ambient_Pollution : -1.8838785083
Average_Moisture_In_Park : -0.801725261022
Max_Moisture_In_Park : -2.07353862117
Min_Moisture_In_Park : -0.34311366811
Location_Type : -0.165165356354
Footfall : -0.195627064184


In [27]:
skew(train.Average_Breeze_Speed[train.Average_Breeze_Speed < np.log1p(90)])

0.9288204177297155

In [43]:
train.corr()

Unnamed: 0,ID,Park_ID,Date,Direction_Of_Wind,Average_Breeze_Speed,Max_Breeze_Speed,Min_Breeze_Speed,Var1,Average_Atmospheric_Pressure,Max_Atmospheric_Pressure,Min_Atmospheric_Pressure,Min_Ambient_Pollution,Max_Ambient_Pollution,Average_Moisture_In_Park,Max_Moisture_In_Park,Min_Moisture_In_Park,Location_Type,Footfall
ID,1.0,0.010789,0.152488,0.001035,-0.017661,-0.020491,-0.017533,-0.013773,0.010534,0.000171,0.019438,0.029344,-0.009936,-0.046056,0.007321,-0.049499,-0.018461,0.006953
Park_ID,0.010789,1.0,0.046481,-0.007235,-0.242771,-0.232213,-0.190381,-0.009543,0.013312,0.014137,0.015264,-0.029079,0.038066,-0.051564,0.080039,-0.114482,0.010486,0.006496
Date,0.152488,0.046481,1.0,0.020965,0.004692,0.008327,0.00207,-0.014855,0.032995,0.036398,0.029917,0.03359,-0.012311,-0.018838,-0.019301,-0.020637,-0.015647,0.022375
Direction_Of_Wind,0.001035,-0.007235,0.020965,1.0,0.119363,0.167734,0.036905,0.143466,-0.109152,-0.092698,-0.128751,0.042805,0.125886,0.133733,0.113443,0.110614,-0.001954,0.098339
Average_Breeze_Speed,-0.017661,-0.242771,0.004692,0.119363,1.0,0.931235,0.862712,0.232106,-0.323561,-0.272376,-0.357527,0.30323,0.047205,0.000602,-0.297054,0.176572,-0.067796,-0.076245
Max_Breeze_Speed,-0.020491,-0.232213,0.008327,0.167734,0.931235,1.0,0.706611,0.27198,-0.345207,-0.286496,-0.389323,0.272069,0.109865,-0.012528,-0.239845,0.122878,-0.072095,-0.035737
Min_Breeze_Speed,-0.017533,-0.190381,0.00207,0.036905,0.862712,0.706611,1.0,0.14863,-0.243886,-0.212342,-0.260759,0.269533,-0.044604,0.038293,-0.304091,0.215301,-0.044469,-0.115359
Var1,-0.013773,-0.009543,-0.014855,0.143466,0.232106,0.27198,0.14863,1.0,-0.369468,-0.326227,-0.382773,-0.077123,-0.002905,0.235081,0.142728,0.203766,-0.002993,0.060239
Average_Atmospheric_Pressure,0.010534,0.013312,0.032995,-0.109152,-0.323561,-0.345207,-0.243886,-0.369468,1.0,0.974414,0.975362,-0.036265,-0.050144,-0.155738,-0.05584,-0.171204,0.00181,-0.056244
Max_Atmospheric_Pressure,0.000171,0.014137,0.036398,-0.092698,-0.272376,-0.286496,-0.212342,-0.326227,0.974414,1.0,0.913611,-0.042163,-0.057577,-0.124458,-0.042621,-0.137534,0.002499,-0.1098


In [52]:
pd.read_csv('train.csv').Date.str.split('-').str[0]

0         01
1         02
2         03
3         04
4         05
5         06
6         07
7         08
8         09
9         10
10        11
11        12
12        13
13        14
14        15
15        16
16        17
17        18
18        19
19        20
20        21
21        22
22        23
23        24
24        25
25        26
26        27
27        28
28        29
29        30
          ..
114509    02
114510    03
114511    04
114512    05
114513    06
114514    07
114515    08
114516    09
114517    10
114518    11
114519    12
114520    13
114521    14
114522    15
114523    16
114524    17
114525    18
114526    19
114527    20
114528    21
114529    22
114530    23
114531    24
114532    25
114533    26
114534    27
114535    28
114536    29
114537    30
114538    31
Name: Date, dtype: object