In [1]:
import pandas as pd
import numpy as np
from sklearn import neighbors
import matplotlib.pyplot as ptl
from pandas.tseries.holiday import USFederalHolidayCalendar as calendarUSFH


# Cargar set de datos

In [2]:
# Load the trip records; later cells use its start_date, duration and start-station columns.
trip = pd.read_csv('data/trip_train.csv')

In [3]:
# Load the weather observations (joined to the trips by date and zip_code further down).
weather = pd.read_csv('data/weather.csv')

In [4]:
# Load the station catalogue; only its id, name and city columns are used below.
station = pd.read_csv('data/station.csv')

# Modificando set de datos

In [5]:
# Narrow the station table to the three columns needed to attach a zip code to each station.
stationReduced = station.loc[:,['id','name','city']]

def choseZipCode(city):
    """Return the single zip code this notebook associates with ``city``.

    Recognised cities are San Francisco, San Jose, Redwood City, Palo Alto
    and Mountain View. Any other value yields ``None``.
    """
    zipCodeByCity = (
        ('San Francisco', 94107),
        ('San Jose', 95113),
        ('Redwood City', 94063),
        ('Palo Alto', 94301),
        ('Mountain View', 94041),
    )
    # Compare with == (rather than hashing) so non-string inputs such as NaN
    # simply fall through to None.
    for knownCity, zipCode in zipCodeByCity:
        if city == knownCity:
            return zipCode
    return None

# Tag every station with the zip code of its city (None for unrecognised cities).
stationReduced['zip_code'] = stationReduced['city'].apply(choseZipCode)

# Keep only the station name and zip code, exposing the name under 'station',
# the column the trips are joined on later.
stationReduced = (
    stationReduced
    .drop(['id', 'city'], axis=1)
    .rename(columns={'name': 'station'})
)
# Give the trips the same join key.
trip = trip.rename(columns={'start_station_name': 'station'})

In [6]:
# Discard the trip columns that no later cell reads.
# Re-running this cell alone fails because the columns are already gone.
unusedTripColumns = ['zip_code', 'end_date', 'end_station_name', 'end_station_id', 'bike_id']
trip = trip.drop(unusedTripColumns, axis=1)

In [7]:
def _precipitationToFloat(rawValue):
    """Map the marker 'T' to 0.005 and convert every other value with float()."""
    # 'T' presumably denotes trace precipitation -- confirm against the data source.
    if rawValue == 'T':
        return 0.005
    return float(rawValue)

weather['precipitation_inches'] = weather['precipitation_inches'].apply(_precipitationToFloat)

In [8]:

# Drop the observations without a precipitation reading, then zero-fill every
# remaining gap (the text column 'events' included).
hasPrecipitation = weather.precipitation_inches.notnull()
weather = weather[hasPrecipitation].fillna(0)


def _normalizeEvent(event):
    """Label zero-filled events as 'Normal' and fold lowercase 'rain' into 'Rain'."""
    if event == 0:
        return 'Normal'
    if event == 'rain':
        return 'Rain'
    return event


weather['events'] = weather['events'].apply(_normalizeEvent)
# Reduce each date to a plain datetime.date so it matches the trip dates on merge.
weather['date'] = weather['date'].apply(lambda rawDate: pd.to_datetime(rawDate).date())

In [9]:
def splitDateAndTime(dateAndTime):
    """Turn a '<date> <time>' string into the concatenation of splitDate and splitTime.

    The input is split on single spaces; the first field goes to splitDate
    and the second to splitTime. Any further fields are ignored.
    """
    fields = dateAndTime.split(' ')
    datePart = splitDate(fields[0])
    timePart = splitTime(fields[1])
    return str(datePart) + timePart

def splitDate(date):
    """Convert an 'M/D/YYYY' date string into a fixed-width 'YYYYMMDD' string.

    Month and day are parsed as integers and written back with exactly two
    digits, so '8/29/2013' and '08/29/2013' both give '20130829'. The year
    is used as written.

    Raises:
        IndexError: if the string has fewer than three '/'-separated fields.
        ValueError: if the month or day field is not numeric.
    """
    fields = date.split('/')
    year = fields[2]
    # '%02d' pads single digits without double-padding values that already
    # carry a leading zero (prepending '0' to '08' would give '008').
    month = '%02d' % int(fields[0])
    day = '%02d' % int(fields[1])
    return str(year) + month + day

def splitTime(time):
    """Convert an 'H:MM' time string into a fixed-width 'HHMM' string.

    Hour and minute are parsed as integers and written back with exactly two
    digits each, so '9:05' and '09:05' both give '0905'. Fields after the
    minute (e.g. seconds) are ignored.

    Raises:
        IndexError: if the string has no ':' separator.
        ValueError: if the hour or minute field is not numeric.
    """
    fields = time.split(':')
    # Fixed width matters: callers slice the combined date/time string by position.
    hour = '%02d' % int(fields[0])
    minute = '%02d' % int(fields[1])
    return hour + minute


In [10]:
def _startTimeAsInt(rawStart):
    """Return characters 8-11 of the normalised start string as an integer."""
    # Presumably the HHMM part of YYYYMMDDHHMM (the sample output shows 931).
    return int(splitDateAndTime(rawStart)[8:12])

trip['time'] = trip['start_date'].apply(_startTimeAsInt)

In [11]:
def _parseStart(rawStart):
    """Parse the raw start string into a pandas timestamp via its normalised form."""
    return pd.to_datetime(splitDateAndTime(rawStart))

# Replace the raw text with timestamps so the calendar features below can be read off.
trip.loc[:,'start_date'] = trip['start_date'].apply(_parseStart)

In [12]:
# Calendar feature: the timestamp's dayofweek attribute.
startStamps = trip['start_date']
trip.loc[:,'day_of_week'] = startStamps.apply(lambda stamp: stamp.dayofweek)

In [13]:
# Calendar feature: month of the trip start.
startStamps = trip['start_date']
trip.loc[:,'month'] = startStamps.apply(lambda stamp: stamp.month)

In [14]:
# Calendar feature: year of the trip start.
startStamps = trip['start_date']
trip.loc[:,'year'] = startStamps.apply(lambda stamp: stamp.year)

In [15]:
# Calendar feature: ordinal day within the year of the trip start.
startStamps = trip['start_date']
trip.loc[:,'dayofyear'] = startStamps.apply(lambda stamp: stamp.dayofyear)

In [16]:
def _toCalendarDate(stamp):
    """Strip the time of day, leaving a plain datetime.date."""
    return pd.to_datetime(stamp).date()

# The time-based features are extracted above; from here on only the calendar
# date is needed as a merge key.
trip.loc[:,'start_date'] = trip['start_date'].apply(_toCalendarDate)

In [17]:
# Build a (start_date, holiday) lookup table from the US federal holiday
# calendar for 2013-01-01 through 2015-12-31.
cal = calendarUSFH()
holidays = cal.holidays(
    start=pd.to_datetime('20130101'),
    end=pd.to_datetime('20151231'),
    return_name=True,
)
# reset_index moves the holiday dates out of the index into a column named
# 'index', which is then renamed to the key shared with the trips.
holiday_festive_day = (
    pd.DataFrame(holidays, columns=['holiday'])
    .reset_index()
    .rename(columns={'index': 'start_date'})
)

In [18]:
def _holidayToCalendarDate(stamp):
    """Reduce a holiday timestamp to a plain datetime.date."""
    return pd.to_datetime(stamp).date()

# Use the same key type as trip['start_date'] so the merge can match rows.
holiday_festive_day.loc[:,'start_date'] = holiday_festive_day['start_date'].apply(_holidayToCalendarDate)

In [19]:
# Right join: every trip is kept, and 'holiday' is filled only where the start
# date is a federal holiday (NaN otherwise).
trip = holiday_festive_day.merge(trip, on=['start_date'], how='right')

In [20]:
# First step of the holiday flag. At this point the column is True where
# there is NO holiday name; the following cell negates it.
holidayNameMissing = pd.isnull(trip['holiday'])
trip.loc[:,'is_holiday'] = holidayNameMissing

In [21]:
def _negate(flag):
    """Logical NOT of a single value."""
    return not flag

# Flip the flag so True means the trip started on a holiday.
# Not idempotent: re-running this cell inverts the column again.
trip.loc[:,'is_holiday'] = trip['is_holiday'].apply(_negate)

In [22]:
def _encodeSubscription(kind):
    """Encode 'Subscriber' as 0 and every other value as 1."""
    if kind == 'Subscriber':
        return 0
    return 1

# Not idempotent: on a second run every value is already numeric and maps to 1.
trip.loc[:,'subscription_type'] = trip['subscription_type'].apply(_encodeSubscription)

In [23]:
# Spot-check one random row of the enriched trip table (no seed, so the row varies per run).
trip.sample()

Unnamed: 0,start_date,holiday,id,duration,station,start_station_id,subscription_type,time,day_of_week,month,year,dayofyear,is_holiday
5331,2014-10-13,Columbus Day,495582,1323,San Francisco Caltrain 2 (330 Townsend),69,0,931,0,10,2014,286,True


In [24]:
# Attach each station's zip code to its trips. The right join keeps every
# station row, so a station without trips yields a row of missing trip fields.
tripWithZipCode = trip.merge(stationReduced, on=['station'], how='right')

In [25]:

# Spot-check one random row after the station/zip-code join.
tripWithZipCode.sample()

Unnamed: 0,start_date,holiday,id,duration,station,start_station_id,subscription_type,time,day_of_week,month,year,dayofyear,is_holiday,zip_code
527589,2015-04-08,,717360,168,San Antonio Shopping Center,31,0,1801,2,4,2015,98,False,94041


In [26]:
# Remove the holiday name first (it is NaN for most trips and is_holiday already
# carries the information), then drop any row that still has a missing value.
withoutHolidayName = tripWithZipCode.drop(['holiday'], axis=1)
tripWithZipCode = withoutHolidayName.dropna()

In [27]:
# Align the key name with the weather table, which calls this column 'date'.
tripWithZipCode = tripWithZipCode.rename(columns={'start_date': 'date'})

In [28]:
# Bring in the weather observed on the trip's date in the station's zip code.
# Left join: trips without a matching observation keep NaN weather fields.
tripsWithWeather = tripWithZipCode.merge(weather, on=['date', 'zip_code'], how='left')

In [29]:
# Shorten the start-station identifier's column name.
tripsWithWeather = tripsWithWeather.rename(columns={'start_station_id': 'station_id'})

In [30]:
# Discard every row that still has at least one missing value after the weather join.
tripsWithWeather = tripsWithWeather.dropna(axis=0, how='any')

In [31]:
# Keep only trips whose duration is below the cut-off.
# NOTE(review): the unit of 'duration' is presumably seconds -- confirm.
maxDuration = 1000
isShortTrip = tripsWithWeather['duration'] < maxDuration
tripsWithWeather = tripsWithWeather[isShortTrip]

In [32]:
# Row and column count of the final modelling table.
tripsWithWeather.shape

(469476, 35)

In [33]:
# Hold out the durations of the last 50000 rows as the test target.
testRowCount = 50000
test_target = tripsWithWeather['duration'][-testRowCount:]

In [34]:
# Training target: the durations of every row before the cut.
# NOTE(review): the cut is total - 50001 while the test slice takes the last
# 50000 rows, so one row falls into neither set. Kept as is because the
# feature split in a later cell uses the same bound.
trainRowCount = tripsWithWeather.shape[0] - 50001
train_target = tripsWithWeather['duration'][:trainRowCount]

In [35]:
# Feature matrix: drop the target ('duration') together with the date and the
# text columns 'station' and 'events'.
nonFeatureColumns = ['duration', 'date', 'station', 'events']
tripsWithWeatherReduced = tripsWithWeather.drop(nonFeatureColumns, axis=1)

In [36]:
from sklearn.decomposition import IncrementalPCA

In [37]:
# Dimensionality reduction: keep 8 principal components of the feature matrix.
pca = IncrementalPCA(n_components=8)

In [38]:
# Fit the PCA and project the features in one step.
# NOTE(review): the fit uses every row, including the last 50000 that are later
# held out as the test set.
tripTransform = pca.fit_transform(tripsWithWeatherReduced)

In [39]:
# Confirm the projection kept every row and produced 8 columns.
tripTransform.shape

(469476, 8)

In [40]:
# Wrap the projected array in a DataFrame so it can be sliced with .iloc below.
tripTransformDF = pd.DataFrame(tripTransform)

In [41]:
# Sanity check: same dimensions as the projected array.
tripTransformDF.shape

(469476, 8)

In [42]:
# Training features: the same leading rows that train_target covers.
# NOTE(review): the bound is total - 50001, one row short of where the test
# slice (last 50000 rows) begins.
trainRowCount = tripsWithWeather.shape[0] - 50001
train = tripTransformDF.iloc[:trainRowCount]

In [43]:
# Test features: the last 50000 projected rows, matching test_target.
testRowCount = 50000
test = tripTransformDF.iloc[-testRowCount:]

In [44]:
# Dimensions of the training feature matrix.
train.shape

(419475, 8)

In [45]:
# Must report the same number of rows as train.shape, otherwise fit() will fail.
train_target.shape

(419475,)

# Machine Learning

In [46]:
from sklearn import neighbors

In [47]:
# k-nearest-neighbours regressor: 12 neighbours, Minkowski power p=2, n_jobs=-1.
knn = neighbors.KNeighborsRegressor(n_neighbors=12,n_jobs=-1,p=2 )

In [48]:
# Fit on the PCA-projected training rows against their trip durations.
knn.fit(train,train_target)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=-1, n_neighbors=12, p=2,
          weights='uniform')

In [49]:
# Predict a duration for every held-out row.
test_prediction = knn.predict(test)

In [50]:
# Score the model on the held-out rows. For scikit-learn regressors score()
# is documented as R^2; the value recorded below is negative.
knn.score(test,test_target)

-0.091218711077894588

In [51]:
# Collect the predictions in a one-column frame. The column labels are passed
# as a list: a set literal ({"prediction"}) is unordered and is rejected by
# current pandas versions.
predictionDF = pd.DataFrame(test_prediction, columns=["prediction"])

In [52]:
# Put the true durations next to the predictions. .values drops the original
# index so the rows line up by position with predictionDF's own index.
predictionDF.loc[:,'spectate'] = test_target.values

In [53]:

# Spot-check five random prediction / actual pairs.
predictionDF.sample(5)

Unnamed: 0,prediction,spectate
19301,595.25,361
22933,473.583333,224
32448,493.583333,234
14054,656.083333,694
35829,446.083333,911


In [54]:
def predictError(predict,spectate):
    """Return the percentage gap between a prediction and the observed value.

    The gap is measured relative to the larger of the two numbers:
    ``(larger - smaller) / larger * 100``.

    Args:
        predict: predicted value.
        spectate: observed value.

    Returns:
        The relative error as a percentage; ``0.0`` when both values are equal
        (including the case where both are zero).
    """
    # Equal values are a perfect prediction. Handling them first also avoids
    # the 0/0 division the formula would otherwise hit when both are zero.
    if predict == spectate:
        return 0.0
    # float() forces true division, so integer inputs are not truncated
    # under Python 2.
    if spectate > predict:
        return ((spectate - predict) / float(spectate)) * 100
    return ((predict - spectate) / float(predict)) * 100

In [55]:
# Row-wise percentage error. The two inputs are read by column label rather
# than by position (x[0], x[1]): integer keys on a label-indexed row depend on
# column order and on a positional fallback that pandas has deprecated.
predictionDF['errorValue'] = predictionDF.apply(
    lambda row: predictError(row['prediction'], row['spectate']),
    axis=1,
)

In [56]:
# Spot-check five random rows, now including the percentage error.
predictionDF.sample(5)

Unnamed: 0,prediction,spectate,errorValue
41279,493.916667,298,39.665936
27461,500.75,286,42.885671
5403,422.5,742,43.059299
17891,550.666667,647,14.889232
25195,469.083333,751,37.538837


In [57]:
# Report the mean percentage error. print is called with one pre-joined string
# so the cell runs on both Python 2 and Python 3 and emits the same text as
# the former space-separated print statement.
meanError = float(predictionDF['errorValue'].mean())
print(' '.join(['El error promedio que comete KNN al predecir es de', str(meanError), '%']))

El error promedio que comete KNN al predecir es de 34.1927472075 %


In [58]:
# Share of predictions whose percentage error is below 15. print is called
# with one pre-joined string so the cell runs on both Python 2 and Python 3
# and emits the same text as the former space-separated print statement.
withinThresholdCount = predictionDF[predictionDF['errorValue'] < 15].shape[0]
shareWithinThreshold = float(withinThresholdCount) / float(predictionDF.shape[0]) * 100
print(' '.join(["El algoritmo KNN predice un ", str(shareWithinThreshold), "% de los valores, con un error menor al 15%"]))


El algoritmo KNN predice un  18.85 % de los valores, con un error menor al 15%
