### Data Fields

- datetime - hourly date + timestamp  
- season -  1 = spring, 2 = summer, 3 = fall, 4 = winter 
- holiday - whether the day is considered a holiday
- workingday - whether the day is neither a weekend nor holiday
- weather - 1: Clear, Few clouds, Partly cloudy, Partly cloudy 
- 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist 
- 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds 
- 4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog 
- temp - temperature in Celsius
- atemp - "feels like" temperature in Celsius
- humidity - relative humidity
- windspeed - wind speed
- casual - number of non-registered user rentals initiated
- registered - number of registered user rentals initiated
- count - number of total rentals

(ten years of data, first 20 days a month is train, rest is test)

In [2]:
import numpy as np
import pandas as pd

# generate dummy numericals for the categorical data
def dummy(x, labels):
    y = pd.get_dummies(x)
    y.colums = labels
    y = y.astype(int)
    return y

def preprocess(filepath):
    # extract useful data from the datetime column
    dset = pd.read_csv("data/original/train.csv")
    
    elems = filepath.split("/")
    filename = elems[-1]  
    
    dset.datetime = pd.to_datetime(dset.datetime)
    dset["hour"] = dset.datetime.dt.hour
    dset["year"] = dset.datetime.dt.year
    # we dont need datetime anymore
    dset = dset.drop('datetime', 1)
    
    weather_labels = ["Clear", "Mist", "LightSnow", "HeavyRain"]
    season_labels =["spring", "summer", "fall", "winter"]
    
    weather = dummy(dset.weather, weather_labels)
    season = dummy(dset.season, season_labels)
    
    weather_season = weather.merge(season, left_index=True, right_index=True)
    dset = dset.merge(weather_season, left_index=True, right_index=True)
    
    dset = dset.drop('weather', 1)
    dset = dset.drop('season', 1)
    
    #drop atemp because its too close to temp
    dset = dset.drop('atemp', 1)
    
    # drop casual, and registered since they are not in the test set anyways
    dset = dset.drop('casual', 1)
    dset = dset.drop('registered', 1)

    # save the cleaned up data
    dset.to_csv("data/clean/%s" % filename,index=False,float_format='%.3f')
    
    ## not needed per se, but useful for last data preview
    return dset

train = preprocess("data/original/train.csv")
test = preprocess("data/original/test.csv")

In [3]:
train

Unnamed: 0,holiday,workingday,temp,humidity,windspeed,count,year,1_x,2_x,3_x,4_x,1_y,2_y,3_y,4_y
0,0,0,9.84,81,0.0000,16,2011,1,0,0,0,1,0,0,0
1,0,0,9.02,80,0.0000,40,2011,1,0,0,0,1,0,0,0
2,0,0,9.02,80,0.0000,32,2011,1,0,0,0,1,0,0,0
3,0,0,9.84,75,0.0000,13,2011,1,0,0,0,1,0,0,0
4,0,0,9.84,75,0.0000,1,2011,1,0,0,0,1,0,0,0
5,0,0,9.84,75,6.0032,1,2011,0,1,0,0,1,0,0,0
6,0,0,9.02,80,0.0000,2,2011,1,0,0,0,1,0,0,0
7,0,0,8.20,86,0.0000,3,2011,1,0,0,0,1,0,0,0
8,0,0,9.84,75,0.0000,8,2011,1,0,0,0,1,0,0,0
9,0,0,13.12,76,0.0000,14,2011,1,0,0,0,1,0,0,0
