### Data Fields

- datetime - hourly date + timestamp  
- season -  1 = spring, 2 = summer, 3 = fall, 4 = winter 
- holiday - whether the day is considered a holiday
- workingday - whether the day is neither a weekend nor holiday
- weather - categorical weather code:
  - 1: Clear, Few clouds, Partly cloudy
  - 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
  - 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
  - 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog
- temp - temperature in Celsius
- atemp - "feels like" temperature in Celsius
- humidity - relative humidity
- windspeed - wind speed
- casual - number of non-registered user rentals initiated
- registered - number of registered user rentals initiated
- count - number of total rentals

(ten years of data; the first 20 days of each month form the training set, the remaining days form the test set)

In [3]:
import numpy as np
import scipy 
import pandas as pd

In [8]:
def normalize(x):
    """Mean-centre ``x``, scale it by its value range, then shift it up by one.

    Args:
        x: numeric pandas Series or NumPy array (anything exposing
            ``mean``, ``max`` and ``min`` and supporting arithmetic).

    Returns:
        An object of the same kind as ``x`` holding
        ``(x - mean) / (max - min) + 1`` for every element.
    """
    centred = x - x.mean()
    value_range = x.max() - x.min()
    return centred / value_range + 1

def dummy(x, labels):
    """Generate dummy (one-hot) numerical columns for categorical data.

    Args:
        x: pandas Series holding the categorical codes.
        labels: names for the indicator columns, one per distinct value of
            ``x``, given in the sorted order of those values.

    Returns:
        DataFrame with one integer 0/1 indicator column per label, sharing
        the index of ``x``.

    Raises:
        ValueError: if the number of labels does not match the number of
            distinct values found in ``x``.
    """
    y = pd.get_dummies(x)
    if len(labels) != y.shape[1]:
        raise ValueError(
            "expected %d labels for the distinct values in x, got %d"
            % (y.shape[1], len(labels))
        )
    # The attribute must be spelled `columns`; a misspelled name only sets a
    # stray attribute on the frame and leaves the indicator columns unnamed.
    y.columns = labels
    # get_dummies yields bool/uint8 depending on the pandas version; force
    # plain integers so downstream arithmetic behaves uniformly.
    return y.astype(int)

In [31]:
def clean(dset):
    """Turn a raw rentals frame into a purely numerical feature frame.

    Steps:
      1. Derive ``hour`` and ``year`` from ``datetime`` and drop ``datetime``.
      2. Replace the ``weather`` and ``season`` codes with indicator columns
         produced by ``dummy``.
      3. Rescale ``temp``, ``atemp``, ``windspeed``, ``humidity``, ``hour``
         and ``year`` with ``normalize``.

    Args:
        dset: DataFrame containing at least the columns ``datetime``,
            ``weather``, ``season``, ``temp``, ``atemp``, ``windspeed`` and
            ``humidity``. It is not modified.

    Returns:
        A new DataFrame with the transformed columns; all other columns are
        passed through unchanged.
    """
    # Build a new frame instead of assigning into the caller's one, so the
    # raw data stays intact and clean() can be re-run on it.
    stamps = pd.to_datetime(dset["datetime"])
    # `columns=` is used because passing the axis positionally to drop() is
    # rejected by pandas 2.x.
    dset = dset.drop(columns="datetime").assign(
        hour=stamps.dt.hour,
        year=stamps.dt.year,
    )

    weather_labels = ["Clear", "Mist", "LightSnow", "HeavyRain"]
    season_labels = ["spring", "summer", "fall", "winter"]

    weather = dummy(dset.weather, weather_labels)
    season = dummy(dset.season, season_labels)

    # All frames share the original row index, so index-on-index merges
    # simply place the indicator columns next to the remaining features.
    weather_season = weather.merge(season, left_index=True, right_index=True)
    dset = dset.drop(columns=["weather", "season"]).merge(
        weather_season, left_index=True, right_index=True
    )

    # Normalize numerical data.
    # NOTE(review): a column with a single distinct value (e.g. one year
    # only) has zero range and would come out of normalize() as NaN.
    for column in ("temp", "atemp", "windspeed", "humidity", "hour", "year"):
        dset[column] = normalize(dset[column])
    return dset
    

In [15]:
# Load the raw train and test splits exactly as provided.
train_t, test_t = (
    pd.read_csv("data/original/train.csv"),
    pd.read_csv("data/original/test.csv"),
)

In [21]:
# Inspect the dimensions (rows, columns) of the two raw frames.
# NOTE(review): a notebook cell echoes only its last expression, so
# presumably just the shape of test_t is displayed and the result of the
# first call is discarded - wrap both in print() if both are wanted.
np.shape(train_t)
np.shape(test_t)

(6493, 9)

In [32]:
# Run the same cleaning pipeline over both splits (train first, then test).
train, test = clean(train_t), clean(test_t)

In [34]:
# Persist the cleaned frames to disk as a backup.
train.to_csv(path_or_buf="data/clean/train.csv")
test.to_csv(path_or_buf="data/clean/test.csv")
