In [119]:
# General libraries.
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from datetime import datetime
import calendar

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn import linear_model

In [197]:
# Note -- I experimented with both np arrays and numpy. I ultimately went with pandas, but left in the
# np generatin in case anyone else wanted to play around

train_csv = "/Users/petergrabowski/Desktop/train.csv" # you will need to edit this directory
train_types = (datetime, int, bool, bool, int, float, float, int, float, int, int, int)
train = np.genfromtxt(train_csv,delimiter=',',dtype=train_types, names=True)

train_pd = pd.read_csv(train_csv)

test_csv = "/Users/petergrabowski/Desktop/test.csv" # you will need to edit this directory
test_types = (datetime, int, bool, bool, int, float, float, int, float)
test = np.genfromtxt(test_csv,delimiter=',',dtype=test_types,names=True)
test_pd = pd.read_csv(test_csv)

In [198]:
print train.dtype.names
print train[0]
print
print test.dtype.names
print test[0]

('datetime', 'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count')
('2011-01-01 00:00:00', 1, False, False, 1, 9.84, 14.395, 81, 0.0, 3, 13, 16)

('datetime', 'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed')
('2011-01-20 00:00:00', 1, False, False, 1, 10.66, 11.365, 56, 26.0027)


In [199]:
# feature engineering

day_map = {0: 'Monday', 
           1: 'Tuesday', 
           2 : 'Wednesday', 
           3 : 'Thursday', 
           4 : 'Friday', 
           5 : "Saturday", 
           6 : "Sunday"}

month_map = {1: "January",
             2 : "February",
             3 : "March",
             4 : "April",
             5 : "May",
             6 : "June",
             7 : "July",
             8 : "August",
             9 : "September",
             10 : "October",
             11 : "November",
             12 : "December"}

season_map = {1: "Spring", 
              2 : "Summer", 
              3 : "Fall", 
              4 :"Winter" }

weather_map = {1: " Clear + Few clouds + Partly cloudy + Partly cloudy",
               2 : " Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist ", 
               3 : " Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds", 
               4 :" Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog " }

def getDayNameFromTimestamp(x):
    datestr =  str(x).split()[0]
    weekday_num = datetime.strptime(datestr,"%Y-%m-%d").weekday()
    return day_map[weekday_num]

def getDateFromTimestamp(x):
    return str(x).split()[0]
    
def getHourFromTimestamp(x):
    return str(x).split()[1].split(":")[0]

def getWeekdayNameFromTimestamp(x):
    datestr =  str(x).split()[0]
    day_num = datetime.strptime(datestr,"%Y-%m-%d").weekday()
    return day_map[day_num]

def getMonthNameFromTimestamp(x):
    datestr =  str(x).split()[0]
    month_num = datetime.strptime(datestr,"%Y-%m-%d").month
    return month_map[month_num]
    
    
train_pd['date'] = train_pd['datetime'].apply(lambda x: getDateFromTimestamp(x))
train_pd['hour'] = train_pd['datetime'].apply(lambda x: getHourFromTimestamp(x))
train_pd["weekday"] = train_pd['datetime'].apply(lambda x : getDayNameFromTimestamp(x))
train_pd["month"] =  train_pd['datetime'].apply(lambda x : getMonthNameFromTimestamp(x))
train_pd["season_str"] = train_pd['season'].map(season_map)
train_pd["weather_str"] = train_pd['weather'].map(weather_map)
train_pd["is_nice_weather"] = train_pd['weather'].apply(lambda x : x <= 2)

In [200]:
# more feature engineering -- make columns be categorical
categoricalColumns = ["hour",
                      "weekday",
                      "month",
                      "season", 
                      "season_str",
                      "weather", 
                      "weather_str",
                      "holiday",
                      "workingday",
                      "is_nice_weather"]

for column in categoricalColumns:
    train_pd[column] = train_pd[column].astype("category")  

In [None]:
# TODO -- detect missing data

In [None]:
# TODO -- assess distribution/skewness

In [None]:
# TODO -- visualize counts over time by hour of day, day of week, month, season

In [None]:
# TODO -- model predictions, iterations, and improvements below

In [196]:
# initial dummy guess
import csv

def writeToCSV(file_name, header=None):
    with open(file_name,'w') as f:
        writer=csv.writer(f)
        if header:
            writer.writerow(header) # write header row

        for item in submission:
            writer.writerow(item)

lm = linear_model.LinearRegression()

train_temps = train['temp'].reshape(-1, 1)
train_counts = train['count'].reshape(-1, 1)

lm.fit(train_temps, train_counts)

test_temps = test['temp'].reshape(-1, 1)
test_dates = test['datetime']

preds = lm.predict(test_temps)

submission = zip(test_dates, ( x[0] for x in preds))

file_name = '/Users/petergrabowski/Desktop/preds.csv'
header = ["datetime", "count"]

writeToCSV(file_name, header)

In [None]:
# TODO -- other regressor

In [None]:
# TODO -- other regressor

In [None]:
# TODO -- random forest regressor