### Data Fields

- datetime - hourly date + timestamp  
- season -  1 = spring, 2 = summer, 3 = fall, 4 = winter 
- holiday - whether the day is considered a holiday
- workingday - whether the day is neither a weekend nor holiday
- weather - 1: Clear, Few clouds, Partly cloudy, Partly cloudy 
- 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist 
- 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds 
- 4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog 
- temp - temperature in Celsius
- atemp - "feels like" temperature in Celsius
- humidity - relative humidity
- windspeed - wind speed
- casual - number of non-registered user rentals initiated
- registered - number of registered user rentals initiated
- count - number of total rentals

(ten years of data, first 20 days a month is train, rest is test)

In [1]:
import numpy as np
import scipy 
import pandas as pd
from sklearn import tree, ensemble, linear_model, svm
import math

In [2]:
#A function to calculate Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y, y_pred):
    assert len(y) == len(y_pred)
    terms_to_sum = [(math.log(y_pred[i] + 1) - math.log(y[i] + 1)) ** 2.0 for i,pred in enumerate(y_pred)]
    return (sum(terms_to_sum) * (1.0/len(y))) ** 0.5

In [3]:
def normalize(x):
    return (x - x.mean()) / (x.max() - x.min()) + 1 

# generate dummy numericals for the categorical data
def dummy(x, labels):
    y = pd.get_dummies(x)
    y.colums = labels
    y = y.astype(int)
    return y

In [4]:
def clean(dset):
    # extract useful data from the datetime column
    dset.datetime = pd.to_datetime(dset.datetime)
    dset["hour"] = dset.datetime.dt.hour
    dset["year"] = dset.datetime.dt.year
    # we dont need datetime anymore
    dset = dset.drop('datetime', 1)
    
    weather_labels = ["Clear", "Mist", "LightSnow", "HeavyRain"]
    season_labels =["spring", "summer", "fall", "winter"]
    
    weather = dummy(dset.weather, weather_labels)
    season = dummy(dset.season, season_labels)
    
    weather_season = weather.merge(season, left_index=True, right_index=True)
    dset = dset.merge(weather_season, left_index=True, right_index=True)
    
    dset = dset.drop('weather', 1)
    dset = dset.drop('season', 1)
    
    #drop atemp because its too close to temp
    dset = dset.drop('atemp', 1)
  
    # normalize numerical data
    #dset.temp = normalize(dset.temp)
    #dset.atemp = normalize(dset.atemp)
    #dset.windspeed = normalize(dset.windspeed)
    #dset.humidity = normalize(dset.humidity)
    #dset.hour = normalize(dset.hour)
    #dset.year = normalize(dset.year)
    return dset
    

In [5]:
train_t = pd.read_csv("data/original/train.csv")
test_t = pd.read_csv("data/original/test.csv")

In [6]:
np.shape(train_t)

(10886, 12)

In [7]:
train_c = clean(train_t)
test_c = clean(test_t)

# drop for now, they dont seem to help
train_c = train_c.drop('casual', 1)
train_c = train_c.drop('registered', 1)

train_labels = train_c["count"]
#train = train.drop('count', 1)

In [113]:
train_c

Unnamed: 0,holiday,workingday,temp,humidity,windspeed,count,hour,year,1_x,2_x,3_x,4_x,1_y,2_y,3_y,4_y
0,0,0,9.84,81,0.0000,16,0,2011,1,0,0,0,1,0,0,0
1,0,0,9.02,80,0.0000,40,1,2011,1,0,0,0,1,0,0,0
2,0,0,9.02,80,0.0000,32,2,2011,1,0,0,0,1,0,0,0
3,0,0,9.84,75,0.0000,13,3,2011,1,0,0,0,1,0,0,0
4,0,0,9.84,75,0.0000,1,4,2011,1,0,0,0,1,0,0,0
5,0,0,9.84,75,6.0032,1,5,2011,0,1,0,0,1,0,0,0
6,0,0,9.02,80,0.0000,2,6,2011,1,0,0,0,1,0,0,0
7,0,0,8.20,86,0.0000,3,7,2011,1,0,0,0,1,0,0,0
8,0,0,9.84,75,0.0000,8,8,2011,1,0,0,0,1,0,0,0
9,0,0,13.12,76,0.0000,14,9,2011,1,0,0,0,1,0,0,0


In [8]:
# backup
train_c.to_csv("data/clean/train.csv",index=False,header=False,float_format='%.3f')
test_c.to_csv("data/clean/test.csv",index=False,header=False,float_format='%.3f')


In [9]:
# convert pandas df to numpy matrix
train_m = train_c.as_matrix()
train_labels_m = train_labels.as_matrix()
test = test_c.as_matrix()

In [117]:
train_m[1,:]

array([  0.00000000e+00,   0.00000000e+00,   9.02000000e+00,
         8.00000000e+01,   0.00000000e+00,   4.00000000e+01,
         1.00000000e+00,   2.01100000e+03,   1.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00])

In [10]:
# split data into train and validation
np.random.shuffle(train_m)
valid = train_m[:2000,:]
train = train_m[2000:,:]

In [11]:
train_lab = train[:,6]
valid_lab = valid[:,6]

train = np.delete(train,6,1)
valid = np.delete(valid,6,1)


In [12]:
# first try - decision tree
clf = tree.DecisionTreeRegressor()
clf.fit(train, train_lab)
predicts = clf.predict(valid)

In [124]:
np.shape(train)

(8886, 15)

In [13]:
from sklearn.externals.six import StringIO
from IPython.display import Image
import pydotplus
dot_data = StringIO()  
tree.export_graphviz(clf, out_file=dot_data)  
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_pdf("plew.pdf")

KeyboardInterrupt: 

In [127]:
Image(graph.create_png()) 


KeyboardInterrupt: 

In [14]:
rmsle(valid_lab,predicts)
# rmsle on validation set:
# 0.7428518021853199

0.7107283333028062

In [78]:
r_forest = ensemble.RandomForestClassifier()
clf.fit(train, train_lab)
predicts_for = clf.predict(valid)

In [79]:
rmsle(valid_lab,predicts_for)
# rmsle on validation set:
# 0.7428518021853199

0.7493429008475483

In [85]:
regression = linear_model.LinearRegression()
regression.fit(train,train_lab)
reg_pred = regression.predict(valid)
rmsle(valid_lab, reg_pred)

0.7273298632149857

In [87]:
svr = svm.SVR()
svr.fit(train,train_lab)
svr_pred = svr.predict(valid)
rmsle(valid_lab, svr_pred)

0.7489313958689289

In [15]:
predicts = clf.predict(test)


In [27]:
predicts
test_t.datetime

p = pd.DataFrame(predicts)
x = test_t.datetime.to_frame()

res = p.merge(x, left_index=True, right_index=True)
res.to_csv("data/clean/result.csv",index=False,header=["count","datetime"],float_format='%.3f')


In [28]:
res

Unnamed: 0,0,datetime
0,3.0,2011-01-20 00:00:00
1,4.0,2011-01-20 01:00:00
2,4.0,2011-01-20 02:00:00
3,4.0,2011-01-20 03:00:00
4,3.0,2011-01-20 04:00:00
5,4.0,2011-01-20 05:00:00
6,0.0,2011-01-20 06:00:00
7,1.0,2011-01-20 07:00:00
8,1.0,2011-01-20 08:00:00
9,0.0,2011-01-20 09:00:00
