In [1]:
import pandas as pd
import numpy as np
from sklearn import neighbors
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from pandas.tseries.holiday import USFederalHolidayCalendar as calendarUSFH


# Cargar set de datos

In [2]:
# Complete trip table; used further down to look up the true durations of the test trips.
trip_full = pd.read_csv('data/trip.csv')

In [3]:
# Training trips (the `duration` target is taken from this frame later on).
trip = pd.read_csv('data/trip_train.csv')

In [4]:
# Test trips.
trip_test = pd.read_csv('data/trip_test.csv')

# Weather observations; joined to the trips on date and zip code further down.
weather = pd.read_csv('data/weather.csv')

In [5]:
# Station table; only its id, name and city columns are used below.
station = pd.read_csv('data/station.csv')

# Modificando set de datos

In [6]:
trip_test['start_station_name'] = trip_test['start_station_name']\
    .apply(lambda x : 'Washington at Kearney' if x == 'Washington at Kearny' else x  )

In [7]:
trip_test['start_station_name'] = trip_test['start_station_name']\
    .apply(lambda x : 'Post at Kearney' if x == 'Post at Kearny' else x  )

In [8]:
# Two hand-written station -> zip code rows (presumably stations missing from
# the station table; they are appended to the station lookup afterwards).
otherStation = pd.DataFrame(
    {
        'station': ['Broadway at Main', 'San Jose Government Center'],
        'zip_code': [94107, 95113],
    },
    columns=['station', 'zip_code'],
)

In [9]:
# Keep only the station columns needed to attach a zip code to each station name.
stationReduced = station.loc[
    :, ['id', 'name', 'city']
]

def choseZipCode(city):
    """Return the zip code assigned to ``city``.

    Only the five cities listed in the table are recognised; any other value
    yields ``None``.
    """
    zip_by_city = (
        ('San Francisco', 94107),
        ('San Jose', 95113),
        ('Redwood City', 94063),
        ('Palo Alto', 94301),
        ('Mountain View', 94041),
    )
    for known_city, zip_code in zip_by_city:
        if city == known_city:
            return zip_code
    return None

# Derive each station's zip code from its city.
stationReduced['zip_code'] = stationReduced['city'].apply(choseZipCode)

# Leave just the station name (renamed to `station`) and its zip code.
stationReduced = (
    stationReduced
    .drop(labels=['id', 'city'], axis=1)
    .rename(columns={'name': 'station'})
)

In [10]:
# Add the hand-written stations to the lookup table.
# pd.concat is used because DataFrame.append is deprecated and was removed in
# pandas 2.0; with ignore_index=True it produces the same row-wise result.
stationReduced = pd.concat([stationReduced, otherStation], ignore_index=True)

In [11]:
trip.rename(columns={'start_station_name':'station'},inplace=True)

In [12]:
trip_test.rename(columns={'start_station_name':'station'},inplace=True)

In [13]:
trip_test = trip_test.drop(labels=['zip_code','end_date','end_station_name','end_station_id'],axis=1)

In [14]:
trip = trip.drop(labels=['zip_code','end_date','end_station_name','end_station_id'],axis=1)

# The marker 'T' is encoded as 0.005 inches (presumably "trace" precipitation);
# every other value is parsed as a float.
weather['precipitation_inches'] = weather['precipitation_inches'].apply(
    lambda amount: float(amount) if amount != 'T' else 0.005
)


# Discard rows without a precipitation value, then zero-fill every remaining gap.
weather = weather[weather['precipitation_inches'].notnull()]

weather = weather.fillna(0)

# After the zero-fill a missing event is 0: label it 'Normal'; fold 'rain' into 'Rain'.
weather['events'] = weather['events'].apply(lambda event: event if event != 0 else 'Normal')
weather['events'] = weather['events'].apply(lambda event: event if event != 'rain' else 'Rain')
weather['date'] = weather['date'].apply(lambda raw: pd.to_datetime(raw).date())

weather.events.unique()

# One 0/1 indicator column per event label, then drop the text column.
for event_label in ('Normal', 'Fog', 'Rain', 'Fog-Rain', 'Rain-Thunderstorm'):
    weather[event_label] = weather['events'].apply(
        lambda event, label=event_label: 1 if event == label else 0
    )

weather = weather.drop(labels='events', axis=1)

In [15]:
def splitDateAndTime(dateAndTime):
    """Convert a '<date> <time>' string into one compact string.

    The text is split on single spaces; the first piece is passed to
    ``splitDate`` and the second to ``splitTime``, and the two results are
    concatenated in that order. Any further pieces are ignored.
    """
    pieces = dateAndTime.split(' ')
    date_digits = str(splitDate(pieces[0]))
    time_digits = splitTime(pieces[1])
    return date_digits + time_digits

def splitDate(date):
    """Turn a 'month/day/year' string into year + 2-digit month + 2-digit day.

    The month and day are parsed as integers and zero-padded to two digits, so
    both '8/9/2013' and '08/09/2013' give '20130809'. (The previous version
    prepended a '0' to the raw text whenever the number was below 10, which
    turned an already padded '08' into '008'.)

    Raises ValueError if the month or day is not an integer and IndexError if
    fewer than three '/'-separated parts are present.
    """
    parts = date.split('/')
    year, month, day = parts[2], parts[0], parts[1]
    return str(year) + ('%02d' % int(month)) + ('%02d' % int(day))

def splitTime(time):
    """Turn an 'hour:minute' string into a 2-digit hour followed by the minute text.

    The hour is parsed as an integer and zero-padded to two digits, so '9:05'
    and '09:05' both give '0905'. (The previous version prepended a '0' to the
    raw text whenever the hour was below 10, which turned an already padded
    '09' into '009'.) The minute part is appended unchanged; anything after a
    second ':' is ignored.

    Raises ValueError if the hour is not an integer and IndexError if there is
    no ':' in the input.
    """
    parts = time.split(':')
    return ('%02d' % int(parts[0])) + str(parts[1])

def hour_to_minute(date) :
    """Return the number of minutes since midnight for ``date``.

    ``date`` only needs ``hour`` and ``minute`` attributes; both are coerced
    with ``int`` before being combined.
    """
    hours, minutes = int(date.hour), int(date.minute)
    return hours * 60 + minutes

In [16]:
trip.loc[:,'start_date'] = trip['start_date'].apply(lambda x : pd.to_datetime(splitDateAndTime(x)))

In [17]:
trip_test.loc[:,'start_date'] = trip_test['start_date'].apply(lambda x : pd.to_datetime(splitDateAndTime(x)))

In [18]:
trip['time'] = trip['start_date'].apply(lambda x : hour_to_minute(x)) 

In [19]:
trip_test['time'] = trip_test['start_date'].apply(lambda x : hour_to_minute(x)) 

In [20]:
trip.sample()

Unnamed: 0,id,duration,start_date,station,start_station_id,bike_id,subscription_type,time
55189,17933,701,2013-09-11 15:10:00,Washington at Kearney,46,260,Customer,910


In [21]:
trip.loc[:,'day_of_week'] = trip['start_date'].apply(lambda x : x.dayofweek )

In [22]:
trip_test.loc[:,'day_of_week'] = trip_test['start_date'].apply(lambda x : x.dayofweek )

In [23]:
trip.loc[:,'month'] = trip['start_date'].apply(lambda x : x.month )

In [24]:
trip_test.loc[:,'month'] = trip_test['start_date'].apply(lambda x : x.month )

trip.loc[:,'year'] = trip['start_date'].apply(lambda x : x.year )

trip_test.loc[:,'year'] = trip_test['start_date'].apply(lambda x : x.year )

trip.loc[:,'dayofyear'] = trip['start_date'].apply(lambda x : x.dayofyear )

trip_test.loc[:,'dayofyear'] = trip_test['start_date'].apply(lambda x : x.dayofyear )

In [25]:
trip.loc[:,'start_date'] = trip['start_date'].apply(lambda x : pd.to_datetime(x).date())

In [26]:
trip_test.loc[:,'start_date'] = trip_test['start_date'].apply(lambda x : pd.to_datetime(x).date())

In [27]:
cal = calendarUSFH()
holidays = cal.holidays(return_name=True,start=pd.to_datetime('20130101'), end=pd.to_datetime('20151231'))
holiday_festive_day = pd.DataFrame(holidays,columns=['holiday']).reset_index()
holiday_festive_day.rename(columns={'index':'start_date'},inplace=True)

In [28]:
holiday_festive_day.loc[:,'start_date'] = holiday_festive_day['start_date'].apply(lambda x : pd.to_datetime(x).date())

In [29]:
trip = pd.merge(holiday_festive_day,trip,on=['start_date'],how='right')

In [30]:
trip_test = pd.merge(holiday_festive_day,trip_test,on=['start_date'],how='right')

In [31]:
trip_test.shape

(119998, 10)

In [32]:
trip.loc[:,'is_holiday'] = trip.holiday.isnull()


In [33]:
trip_test.loc[:,'is_holiday'] = trip_test.holiday.isnull()

In [34]:
trip.loc[:,'is_holiday'] = trip['is_holiday'].apply(lambda x : not x )


In [35]:
trip_test.loc[:,'is_holiday'] = trip_test['is_holiday'].apply(lambda x : not x )

In [36]:
trip.loc[:,'subscription_type'] = trip['subscription_type'].apply(lambda x : 0 if ('Subscriber' == x) else 1 )

In [37]:
trip_test.loc[:,'subscription_type'] = trip_test['subscription_type'].apply(lambda x : 0 if ('Subscriber' == x) else 1 )

In [38]:
trip.sample()

Unnamed: 0,start_date,holiday,id,duration,station,start_station_id,bike_id,subscription_type,time,day_of_week,month,is_holiday
23048,2015-07-27,,864043,327,Steuart at Market,74,507,0,1060,0,7,False


In [39]:
trip_test.sample()

Unnamed: 0,start_date,holiday,id,station,start_station_id,bike_id,subscription_type,time,day_of_week,month,is_holiday
3823,2014-08-22,,420429,Embarcadero at Vallejo,48,514,1,825,4,8,False


In [40]:
trip_test.shape

(119998, 11)

In [41]:
tripWithZipCode = pd.merge(trip,stationReduced,on=['station'],how='left')

In [42]:
tripWithZipCode_test = pd.merge(trip_test,stationReduced,on=['station'],how='left')

In [43]:
tripWithZipCode_test.shape

(119998, 12)

In [44]:
tripWithZipCode.sample()

Unnamed: 0,start_date,holiday,id,duration,station,start_station_id,bike_id,subscription_type,time,day_of_week,month,is_holiday,zip_code
126063,2014-05-12,,281048,581,San Francisco Caltrain (Townsend at 4th),70,532,0,1054,0,5,False,94107.0


In [45]:
tripWithZipCode_test.sample()

Unnamed: 0,start_date,holiday,id,station,start_station_id,bike_id,subscription_type,time,day_of_week,month,is_holiday,zip_code
87422,2015-01-14,,604282,2nd at Townsend,61,672,0,1035,2,1,False,94107


In [46]:
tripWithZipCode = tripWithZipCode.drop(labels=['holiday'],axis=1).dropna()

In [47]:
tripWithZipCode_test = tripWithZipCode_test.drop(labels=['holiday'],axis=1)

In [48]:
tripWithZipCode_test.shape

(119998, 11)

In [49]:
tripWithZipCode.rename(columns={'start_date':'date'},inplace=True)

In [50]:
tripWithZipCode_test.rename(columns={'start_date':'date'},inplace=True)

In [51]:
tripWithZipCode_test.shape

(119998, 11)

# Attach the weather of the trip's date and zip code; left joins keep every trip.
tripsWithWeather = tripWithZipCode.merge(weather, how='left', on=['date', 'zip_code'])

tripsWithWeather_test = tripWithZipCode_test.merge(weather, how='left', on=['date', 'zip_code'])

# Weather frame without its join keys; its column names drive the fill loop below.
reduceWeather = weather.drop(labels=['date', 'zip_code'], axis=1)

# Fill missing weather values in the test frame with the mean of their column.
# The mean is computed once per column and applied with fillna; the previous
# lambda recomputed the full column mean for every missing row.
for value in reduceWeather.columns :
    weather_column = tripsWithWeather_test[value]
    # Columns without gaps are left as they are (the mean is not needed).
    if weather_column.isnull().any():
        tripsWithWeather_test[value] = weather_column.fillna(weather_column.mean())

In [52]:
tripWithZipCode.rename(columns={'start_station_id':'station_id'},inplace=True)

tripsWithWeather.rename(columns={'start_station_id':'station_id'},inplace=True)

In [53]:
tripWithZipCode_test.rename(columns={'start_station_id':'station_id'},inplace=True)

tripsWithWeather_test.rename(columns={'start_station_id':'station_id'},inplace=True)

In [54]:
tripWithZipCode_test.shape

(119998, 11)

# Training rows must be complete ...
tripsWithWeather = tripsWithWeather.dropna()

# ... and have a duration below 1000.
short_trip_mask = tripsWithWeather['duration'] < 1000
tripsWithWeather = tripsWithWeather[short_trip_mask]

tripsWithWeather.shape

tripsWithWeather_test.shape

In [55]:
# Trip ids of the test rows. `test_id` is assigned twice; the second assignment
# (from the weather-joined test frame) is the one that remains bound.
test_id = tripWithZipCode_test['id']

test_id = tripsWithWeather_test['id']

# Hold-out target: the trailing [-200000:] slice of the weather-joined training durations.
test_target = tripsWithWeather['duration'][-200000:]

In [56]:
# Training target. Again assigned twice; the weather-joined variant wins.
train_target = tripWithZipCode['duration']

train_target = tripsWithWeather['duration']

In [57]:
# Training feature frames: the target plus the date, station name and id columns are removed.
tripWithZipCodeReduced = tripWithZipCode.drop(labels=['duration','date','station','id'],axis=1)

tripsWithWeatherReduced = tripsWithWeather.drop(labels=['duration','date','station','id'],axis=1)

In [58]:
# Test feature frames: the same columns are removed except `duration`, which is not dropped here.
tripWithZipCodeReduced_test = tripWithZipCode_test.drop(labels=['date','station','id'],axis=1)

tripsWithWeatherReduced_test = tripsWithWeather_test.drop(labels=['date','station','id'],axis=1)

In [59]:
# Variant without weather: start from the zip-code frame ...
train = tripWithZipCodeReduced

In [60]:
# ... and keep only these six feature columns.
train = train[['station_id','bike_id','subscription_type','time','day_of_week','month']]

In [61]:
train.sample()

Unnamed: 0,station_id,bike_id,subscription_type,time,day_of_week,month
94369,69,276,0,510,1,10


# Rebinds `train` to the weather-joined feature frame, replacing the six-column
# selection made in the previous cells.
train = tripsWithWeatherReduced

In [62]:
# Variant without weather for the test set ...
test = tripWithZipCodeReduced_test

In [63]:
# ... restricted to the same six feature columns.
test = test[['station_id','bike_id','subscription_type','time','day_of_week','month']]

# Rebinds `test` to the weather-joined test features; this assignment is the one left in effect here.
test = tripsWithWeatherReduced_test

In [64]:
# True durations of the test trips, looked up in `trip_full` by trip id and
# returned in the row order of `trip_test`, so that row i lines up with the
# prediction made for test row i.
# The previous isin() filter returned the rows in `trip_full` order, and its
# `id` column was the row index of `trip_full`, not the trip id.
# An id that is absent from `trip_full` yields a missing duration instead of
# silently dropping the row.
# NOTE(review): assumes trip ids are unique in trip_full — the lookup raises otherwise.
test_true_duration = pd.DataFrame(
    {
        'id': trip_test['id'].values,
        'duration': trip_test['id'].map(trip_full.set_index('id')['duration']).values,
    },
    columns=['id', 'duration'],
)

# Hold-out alternative: the trailing [-200000:] slice of the training features.
test = tripsWithWeatherReduced[-200000:]

# Reducción de dimensiones

from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=3)

# Fit the 3-component projection on the training features only.
# (The earlier fits on the zip-code-only frames were removed: their results
# were overwritten immediately by the weather-joined ones.)
tripTransform = svd.fit_transform(tripsWithWeatherReduced)

# Project the test features with the SAME fitted SVD. Calling fit_transform on
# the test frame re-fitted the decomposition, so train and test ended up in
# different component spaces and a model trained on one could not be applied
# meaningfully to the other.
# NOTE(review): requires the test frame to have the same columns, in the same
# order, as the training frame — confirm.
tripTransform_test = svd.transform(tripsWithWeatherReduced_test)

%matplotlib inline

# Use matplotlib's default style with wide, short figures.
plt.style.use('default')
plt.rcParams['figure.figsize'] = (12, 4)

# Explained variance of the fitted SVD components: all of them, then at most the first six.
explained_variance = pd.DataFrame(svd.explained_variance_)
explained_variance.plot()

explained_variance.head(6).plot()

tripTransform.shape

tripTransform_test.shape

# Wrap the projected arrays in DataFrames.
tripTransformDF = pd.DataFrame(tripTransform)


tripTransformDF_test = pd.DataFrame(tripTransform_test)

# Replace every component value with its absolute value.
# NOTE(review): this discards the sign of the SVD components — confirm it is intended.
tripTransformDF = tripTransformDF.abs()


tripTransformDF_test = tripTransformDF_test.abs()

# `test` is assigned twice below; the later assignment (the projected test set)
# is the one that remains bound. The first is the hold-out alternative taken
# from the trailing [-200000:] slice of the projected training rows.
test = tripTransformDF[-200000:]

train = tripTransformDF

test = tripTransformDF_test

test.shape

import pandas as pd
from sklearn import preprocessing



# Learn the min/max scaling on the training features only.
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(train)
df_normalized = pd.DataFrame(np_scaled)
train = df_normalized

# Apply the scaling learned on train to the test features. Fitting a second
# scaler on the test set gave train and test different scales.
np_scaled = min_max_scaler.transform(test)
df_normalized = pd.DataFrame(np_scaled)
test = df_normalized

test.shape

# Drop the first column from BOTH frames. It used to be dropped from `train`
# only, leaving `test` with one feature more than the model is fitted on.
# NOTE(review): the reason for discarding column 0 is not visible here — confirm.
train.drop(axis=1,labels=0,inplace=True)
test.drop(axis=1,labels=0,inplace=True)

train.shape

train_target.shape

# Machine Learning

In [65]:
from sklearn import linear_model

In [66]:
# NOTE(review): the variable is named `rfr`, but the estimator is scikit-learn's
# LogisticRegression, which is a classifier; fitted on `duration` it presumably
# treats each distinct duration value as a class label. max_iter=1 also caps the
# solver at a single iteration. Confirm this is the intended model.
rfr = linear_model.LogisticRegression(verbose=10,max_iter=1,n_jobs=-1)

In [None]:
# Only the first 5000 training rows are used for fitting.
rfr.fit(train[:5000],train_target[:5000])

In [None]:
test_prediction = rfr.predict(test)

In [None]:
# Mean squared error between the looked-up true durations and the predictions.
mean_squared_error(test_true_duration['duration'],test_prediction)

In [None]:
# Estimator's own score on the test features.
rfr.score(test,test_true_duration['duration'])

### Conversión para entrega

predictionDF = pd.DataFrame(test_prediction,columns={"duration"})

predictionDF.loc[:,'id'] = test_id.values

In [None]:
predictionDF.to_csv(path_or_buf='prediction_nnr_svd_1',sep=',',header=True,columns=['id','duration'],index=False)

### CrossValidation
#### En caso de querer usar parte del set de entrenamiento como set de prueba 

# Score the model against the hold-out slice of the training target.
# NOTE(review): this only makes sense when `test` and `test_prediction` were
# built from the same hold-out rows as `test_target` — confirm the notebook
# state before running these cells.
rfr.score(test,test_target)

from sklearn.metrics import mean_squared_error

mean_squared_error(test_target,test_prediction)

En caso de ser necesario hacer Cross Validation

# Side-by-side frame of the predictions and the observed hold-out durations.
# The column label is passed as a list; a set is unordered and recent pandas
# versions reject a set for `columns`.
predictionDF = pd.DataFrame(test_prediction,columns=["prediction"])

predictionDF.loc[:,'spectate'] = test_target.values

def predictError(predict,spectate):
    """Return the percentage gap between a prediction and the observed value.

    The absolute difference is divided by the larger of the two values and
    multiplied by 100. Both inputs are converted to float first, so integer
    inputs are not truncated by Python 2 integer division. Identical values
    (including 0 and 0, which used to divide by zero) give 0.0.
    """
    predict = float(predict)
    spectate = float(spectate)
    if predict == spectate:
        return 0.0
    if spectate > predict:
        return ((spectate - predict) / spectate) * 100
    return ((predict - spectate) / predict) * 100

# Row-wise percentage error between the first column (prediction) and the
# second column (observed value). `.iloc` makes the positional access explicit;
# integer keys such as x[0] on a label-indexed row are deprecated in recent pandas.
predictionDF['errorValue'] = predictionDF.apply(
    axis=1, func=lambda x : predictError(x.iloc[0], x.iloc[1])
)

predictionDF.sample(5)

# Each message is built as one formatted string and printed through a single
# parenthesised expression, which prints the same text on Python 2 and Python 3
# (the former print statements were a SyntaxError on Python 3).
# NOTE(review): the messages name "Random Forest Regresion" although the model
# fitted above is a LogisticRegression — confirm which wording is wanted.
print('El error promedio que comete Random Forest Regresion al predecir es de %s %%'
      % float(predictionDF['errorValue'].mean()))

# Share of rows whose error is below 15 (percent of all rows).
print("El algoritmo Random Forest Regresion predice un  %s %% de los valores, con un error menor al 15%%"
      % (float(predictionDF[predictionDF['errorValue'] < 15].shape[0]) / float(predictionDF.shape[0]) * 100))


predictionDF.describe()