In [27]:
#Mary Makarious

# Importing necessary packages
import pandas
import numpy
import math
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model
from sklearn import svm
from sklearn import cross_validation
from datetime import datetime


#Reading training data
train_set = pandas.read_csv("train.csv")
#Reading testing data
test_set = pandas.read_csv("test.csv")

#Printing data read
print("Training data: \n", train_set.head(), "\n")
print("Testing data: \n", test_set.head(), "\n")

Training data: 
               datetime  season  holiday  workingday  weather  temp   atemp  \
0  2011-01-01 00:00:00       1        0           0        1  9.84  14.395   
1  2011-01-01 01:00:00       1        0           0        1  9.02  13.635   
2  2011-01-01 02:00:00       1        0           0        1  9.02  13.635   
3  2011-01-01 03:00:00       1        0           0        1  9.84  14.395   
4  2011-01-01 04:00:00       1        0           0        1  9.84  14.395   

   humidity  windspeed  casual  registered  count  
0        81          0       3          13     16  
1        80          0       8          32     40  
2        80          0       5          27     32  
3        75          0       3          10     13  
4        75          0       0           1      1   

Testing data: 
               datetime  season  holiday  workingday  weather   temp   atemp  \
0  2011-01-20 00:00:00       1        0           1        1  10.66  11.365   
1  2011-01-20 01:00:00    

In [43]:
#Get month of formatted string
def get_month(date_string):
    #Documentation:
    #https://docs.python.org/2/library/datetime.html#strftime-strptime-behavior
    return datetime.strptime(date_string, "%Y-%m-%d %H:%M:%S").date().month

#Get hour of formatted string
def get_hour(date_string):
    #Documentation:
    #https://docs.python.org/2/library/datetime.html#strftime-strptime-behavior
    return datetime.strptime(date_string, "%Y-%m-%d %H:%M:%S").hour

#Adding a new feature by applying the function get_month to each
#datetime entry of the .csv file
train_set['month'] = train_set['datetime'].map(get_month)
test_set['month'] = test_set['datetime'].map(get_month)

#Adding a new feature by applying the function get_hour to each
#datetime entry of the .csv file
train_set['hour'] = train_set['datetime'].map(get_hour)
test_set['hour'] = test_set['datetime'].map(get_hour)


#Printing new data
print("Training data: \n", train_set.head(), "\n")
print("Testing data: \n", test_set.head(), "\n")

Training data: 
               datetime  season  holiday  workingday  weather  temp   atemp  \
0  2011-01-01 00:00:00       1        0           0        1  9.84  14.395   
1  2011-01-01 01:00:00       1        0           0        1  9.02  13.635   
2  2011-01-01 02:00:00       1        0           0        1  9.02  13.635   
3  2011-01-01 03:00:00       1        0           0        1  9.84  14.395   
4  2011-01-01 04:00:00       1        0           0        1  9.84  14.395   

   humidity  windspeed  casual  registered  count  month  hour  minute  
0        81          0       3          13     16      1     0       0  
1        80          0       8          32     40      1     1       0  
2        80          0       5          27     32      1     2       0  
3        75          0       3          10     13      1     3       0  
4        75          0       0           1      1      1     4       0   

Testing data: 
               datetime  season  holiday  workingday  weath

In [53]:
def cross_val(model, features, count):
    #Creating KFold cross validation object. Divides data in n_folds groups, learns with (n_folds - 1) and applies to the 
    #remaining one
    #Documentation:
    #http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.KFold.html
    kf = cross_validation.KFold(n = len(features), n_folds = 15, indices = None, shuffle = True, random_state = 5)
    #Applies 
    #Documentation:
    #http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.cross_val_score.html
    scoreArray = cross_validation.cross_val_score(model, features, count, scoring='mean_squared_error', cv = kf, n_jobs = 1)
    #Get the mean of the scores
    score = numpy.mean(scoreArray)
    #Return the Root of the absolute value of the score
    return math.sqrt(math.fabs(score))

#Preparing training data
#These are the features that will be used to train the classifier
#After adding new features, we will have to manually add their name to this list
features = ['season','holiday','workingday','weather','temp','atemp','humidity','windspeed', 'month','hour','casual','registered']
#Getting the desired features from the training set
features_train = train_set[features]
#Getting the labels of the data contained in the training set
count_train = train_set['count']

#Feeding classifier with training data
#Can be changed to other classifiers, such as LogisticRegression,
#KNearestNeighbors etc.

#Try changing it! The output may take a while to appear
#model = KNeighborsClassifier(n_neighbors=10, weights = 'uniform')
#model = DecisionTreeClassifier(random_state = 0)
#model = DecisionTreeRegressor(random_state=0)
model = RandomForestRegressor(n_estimators = 50)
#model = RandomForestClassifier(n_estimators = 30)
model.fit(features_train, count_train)

#Running cross validation to evaluate performance of our model
#Smaller values are desired
error = cross_val(model, features_train, count_train)

#Printing Root Mean Squared Error
print("Root Mean Squared error: ", error)

Root Mean Squared error:  2.936593179123362


In [35]:
#Now we will feed the classifier with the test data, so that we can
#classify it
#Getting desired features from the testing set
features_test = test_set[features]

#Predicting it
predicted = model.predict(features_test)

#Printing predicted values
print("Predicted: \n", predicted)

Predicted: 
 [ 237.8          58.11666667   58.11666667 ...,  175.86666667  113.06
   83.16111111]


In [36]:
#Creating new DataFrame that will be used to write the results on a .csv
#file
#One column named 'count' that has the predicted list as its rows
data = {'count' : predicted}
#DataFrame will have the datetimes as the index of the predicted values
data_frame = pandas.DataFrame(data = data, index = test_set['datetime'])

#Writing to .csv file
data_frame.to_csv("bike_sharing.csv")