### Data Fields

- datetime - hourly date + timestamp  
- season -  1 = spring, 2 = summer, 3 = fall, 4 = winter 
- holiday - whether the day is considered a holiday
- workingday - whether the day is neither a weekend nor holiday
- weather - 1: Clear, Few clouds, Partly cloudy, Partly cloudy 
- 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist 
- 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds 
- 4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog 
- temp - temperature in Celsius
- atemp - "feels like" temperature in Celsius
- humidity - relative humidity
- windspeed - wind speed
- casual - number of non-registered user rentals initiated
- registered - number of registered user rentals initiated
- count - number of total rentals

(ten years of data, first 20 days a month is train, rest is test)

In [1]:
import numpy as np
import scipy 
import pandas as pd
from sklearn import tree
import math

In [None]:
#A function to calculate Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y, y_pred):
    assert len(y) == len(y_pred)
    terms_to_sum = [(math.log(y_pred[i] + 1) - math.log(y[i] + 1)) ** 2.0 for i,pred in enumerate(y_pred)]
    return (sum(terms_to_sum) * (1.0/len(y))) ** 0.5

In [2]:
def normalize(x):
    return (x - x.mean()) / (x.max() - x.min()) + 1 

# generate dummy numericals for the categorical data
def dummy(x, labels):
    y = pd.get_dummies(x)
    y.colums = labels
    y = y.astype(int)
    return y

In [3]:
def clean(dset):
    # extract useful data from the datetime column
    dset.datetime = pd.to_datetime(dset.datetime)
    dset["hour"] = dset.datetime.dt.hour
    dset["year"] = dset.datetime.dt.year
    # we dont need datetime anymore
    dset = dset.drop('datetime', 1)
    
    weather_labels = ["Clear", "Mist", "LightSnow", "HeavyRain"]
    season_labels =["spring", "summer", "fall", "winter"]
    
    weather = dummy(dset.weather, weather_labels)
    season = dummy(dset.season, season_labels)
    
    weather_season = weather.merge(season, left_index=True, right_index=True)
    dset = dset.merge(weather_season, left_index=True, right_index=True)
    
    dset = dset.drop('weather', 1)
    dset = dset.drop('season', 1)
  
    # normalize numerical data
    #dset.temp = normalize(dset.temp)
    #dset.atemp = normalize(dset.atemp)
    #dset.windspeed = normalize(dset.windspeed)
    #dset.humidity = normalize(dset.humidity)
    #dset.hour = normalize(dset.hour)
    #dset.year = normalize(dset.year)
    return dset
    

In [28]:
train_t = pd.read_csv("data/original/train.csv")
test_t = pd.read_csv("data/original/test.csv")

In [29]:
np.shape(train_t)

(10886, 12)

In [30]:
train_c = clean(train_t)
test_c = clean(test_t)

# drop for now, they dont seem to help
train_c = train_c.drop('casual', 1)
train_c = train_c.drop('registered', 1)

train_labels = train_c["count"]
#train = train.drop('count', 1)

In [31]:
# backup
train_c.to_csv("data/clean/train.csv",index=False,header=False,float_format='%.3f')
test_c.to_csv("data/clean/test.csv",index=False,header=False,float_format='%.3f')


In [32]:
# convert pandas df to numpy matrix
train_m = train_c.as_matrix()
train_labels_m = train_labels.as_matrix()
test = test_c.as_matrix()

In [42]:
# split data into train and validation
np.random.shuffle(train_m)
valid = train_m[:2000,:]
train = train_m[2000:,:]

In [55]:
train_lab = train[:,7]
valid_lab = valid[:,7]

train = np.delete(train,7,1)
valid = np.delete(valid,7,1)


In [56]:
# first try - decision tree
clf = tree.DecisionTreeClassifier()
clf.fit(train, train_lab)
predicts = clf.predict(valid)

In [57]:
rmsle(valid_lab,predicts)
# rmsle on validation set:
# 0.7428518021853199

0.7428518021853199