In [1]:
import pandas as pd
import numpy as np

In [2]:
station = pd.read_csv('data/station.csv', usecols=['id','city'])
station.head(3)

Unnamed: 0,id,city
0,2,San Jose
1,3,San Jose
2,4,San Jose


In [19]:
trip = pd.read_csv('data/trip_train.csv',
                   usecols=['id', 'duration', 'start_date', 'start_station_id', 'subscription_type'])
trip.head(3)

Unnamed: 0,id,duration,start_date,start_station_id,subscription_type
0,907649,396,8/27/2015 8:36,50,Subscriber
1,384043,636,7/28/2014 22:06,67,Subscriber
2,316176,334,6/9/2014 8:42,77,Subscriber


In [20]:
weather = pd.read_csv('data/weather.csv')

In [21]:
def preprocess(station, weather, trip):
    station = preprocessStation(station) #OK
    weather = preprocessWeather(weather) #OK
    trip = preprocessTrip(trip)
    trip = trip.merge(station, on='start_station_id', how='left')
    trip = trip.merge(weather, on=['date', 'zip_code'], how='left')
    #SOLUCION RATA
    trip.fillna(0, inplace=True)
    trip.drop(['start_date', 'zip_code'], axis=1, inplace=True)
    return encode(trip)
    
def preprocessStation(df):
    df = df.copy()
    df["zip_code"] = df.city.apply(zipCode)
    df.drop(labels=['city'], axis=1, inplace=True)
    df.rename(columns={'id': 'start_station_id'}, inplace=True)
    return df

def preprocessTrip(df):
    df = df.copy()
    df['start_date'] = df.start_date.apply(lambda x: pd.to_datetime(x))
    df['time'] = df.start_date.apply(lambda x: 60 * x.hour + x.minute)
    df['year'] = df.start_date.apply(lambda x : x.year)
    df['month'] = df.start_date.apply(lambda x : x.month)
    df['day'] = df.start_date.apply(lambda x : x.day)
    df['dayofweek'] = df.start_date.apply(lambda x : x.isoweekday())
    df['subscription_type'] = df.subscription_type.apply(lambda x: 1 if x == "Subscriber" else 0)
    df['start_date'] = df.start_date.apply(lambda x: x.date().isoformat())
    df = df.merge(holidays, left_on='start_date', right_on='date', how='left')
    df.rename(columns={'start_date':'date'}, inplace=True)
    return df

def preprocessWeather(df):
    df = df.copy()
    cleanPrecipitation(df)
    cleanEvents(df)
    df = df.fillna(0)
    df['date'] = df.date.apply(lambda x: pd.to_datetime(x).date())
    numberEvents(df)
    return df

def numberEvents(df):
    df['events'] = df.events.apply(eventNumber)

def eventNumber(eventString):
    if eventString == 'Normal': return 0
    elif eventString == 'Rain': return 1
    elif eventString == 'Fog': return 2
    elif eventString == 'Fog-Rain': return 3
    else: return 4

def zipCode(city):
    if city == 'San Francisco' : return 94107
    elif city == 'San Jose' : return 95113
    elif city == 'Redwood City' : return 94063
    elif city == 'Palo Alto' : return 94301
    elif city == 'Mountain View' : return 94041

def cleanPrecipitation(df):
    df.dropna(subset=["precipitation_inches"], inplace=True)
    df['precipitation_inches'] = df.precipitation_inches.apply(lambda x:
                                                               0.005 if (x == 'T')
                                                               else float(x))
def cleanEvents(df):
    df['events'] = df.events.apply(lambda x: "Normal" if pd.isnull(x) else x)
    df['events'] = df.events.apply(lambda x: "Rain" if x == "rain" else x)

def encode(df):
    vectorcito = (1, 2, 4, 5, 6, 7, 28)
    print("Encoding:")
    print(df.iloc[:, vectorcito].dtypes)
    encoder = sklearn.preprocessing.OneHotEncoder(categorical_features=vectorcito)
    return pd.DataFrame(encoder.fit_transform(df).toarray())

from pandas.tseries.holiday import USFederalHolidayCalendar as calendarUSFH
cal = calendarUSFH()
holidays = cal.holidays(return_name=True, start=pd.to_datetime('20130101'), end=pd.to_datetime('20151231'))
holidays = holidays.to_frame().reset_index().rename(columns={'index': 'date', 0: 'holiday'})
holidays['date'] = holidays.date.apply(lambda x: x.date().isoformat())

import sklearn.preprocessing

In [22]:
s = preprocessStation(station)
s.to_csv('data/stationP', index=False)
w = preprocessWeather(weather)
w.to_csv('data/weatherP', index=False)
t = preprocessTrip(trip)
t.to_csv('data/tripP', index=False)

In [23]:
t.sample(10)

Unnamed: 0,id,duration,date,start_station_id,subscription_type,time,year,month,day,dayofweek,date.1,holiday
330613,249912,648,2014-04-16,69,1,446,2014,4,16,3,,
308057,16094,337,2013-09-09,61,1,1359,2013,9,9,1,,
300471,749939,591,2015-05-01,48,1,1081,2015,5,1,5,,
315055,58964,278,2013-10-15,28,1,774,2013,10,15,2,,
87929,354290,173,2014-07-08,74,1,521,2014,7,8,2,,
240449,896143,739,2015-08-19,65,1,514,2015,8,19,3,,
220937,732984,378,2015-04-20,57,1,964,2015,4,20,1,,
77702,214475,442,2014-03-14,77,1,1043,2014,3,14,5,,
144545,827148,633,2015-06-29,67,1,1059,2015,6,29,1,,
291533,314206,364,2014-06-06,45,1,1034,2014,6,6,5,,
