In [1]:
import pandas as pd
import numpy as np

In [2]:
station = pd.read_csv('data/station.csv', usecols=['id','city'])
station.head(3)

Unnamed: 0,id,city
0,2,San Jose
1,3,San Jose
2,4,San Jose


In [3]:
trip = pd.read_csv('data/trip.csv',
                   usecols=['id', 'duration', 'start_date', 'start_station_id', 'subscription_type'])
trip.head(3)

Unnamed: 0,id,duration,start_date,start_station_id,subscription_type
0,4576,63,8/29/2013 14:13,66,Subscriber
1,4607,70,8/29/2013 14:42,10,Subscriber
2,4130,71,8/29/2013 10:16,27,Subscriber


In [4]:
weather = pd.read_csv('data/weather.csv')

In [5]:
def preprocess(station, weather, trip):
    station = preprocessStation(station) #OK
    weather = preprocessWeather(weather) #OK
    trip = preprocessTrip(trip)
    trip = trip.merge(station, on='start_station_id', how='left')
    trip = trip.merge(weather, on=['date', 'zip_code'], how='left')
    #SOLUCION RATA
    trip.fillna(0, inplace=True)
    trip.drop(['start_date', 'zip_code'], axis=1, inplace=True)
    return encode(trip)
    
def preprocessStation(df):
    df = df.copy()
    df["zip_code"] = df.city.apply(zipCode)
    df.drop(labels=['city'], axis=1, inplace=True)
    df.rename(columns={'id': 'start_station_id'}, inplace=True)
    return df

def preprocessTrip(df):
    df = df.copy()
    df['start_date'] = df.start_date.apply(lambda x: pd.to_datetime(x))
    df['time'] = df.start_date.apply(lambda x: 60 * x.hour + x.minute)
    df['year'] = df.start_date.apply(lambda x : x.year)
    df['month'] = df.start_date.apply(lambda x : x.month)
    df['day'] = df.start_date.apply(lambda x : x.day)
    df['dayofweek'] = df.start_date.apply(lambda x : x.isoweekday())
    df['subscription_type'] = df.subscription_type.apply(lambda x: 1 if x == "Subscriber" else 0)
    df['start_date'] = df.start_date.apply(lambda x: x.date().isoformat())
    df = df.merge(holidays, left_on='start_date', right_on='date', how='left')
    df.rename(columns={'start_date':'date'}, inplace=True)
    return df

def preprocessWeather(df):
    df = df.copy()
    cleanPrecipitation(df)
    cleanEvents(df)
    df = df.fillna(0)
    df['date'] = df.date.apply(lambda x: pd.to_datetime(x).date())
    numberEvents(df)
    return df

def numberEvents(df):
    df['events'] = df.events.apply(eventNumber)

def eventNumber(eventString):
    if eventString == 'Normal': return 0
    elif eventString == 'Rain': return 1
    elif eventString == 'Fog': return 2
    elif eventString == 'Fog-Rain': return 3
    else: return 4

def zipCode(city):
    if city == 'San Francisco' : return 94107
    elif city == 'San Jose' : return 95113
    elif city == 'Redwood City' : return 94063
    elif city == 'Palo Alto' : return 94301
    elif city == 'Mountain View' : return 94041

def cleanPrecipitation(df):
    df.dropna(subset=["precipitation_inches"], inplace=True)
    df['precipitation_inches'] = df.precipitation_inches.apply(lambda x:
                                                               0.005 if (x == 'T')
                                                               else float(x))
def cleanEvents(df):
    df['events'] = df.events.apply(lambda x: "Normal" if pd.isnull(x) else x)
    df['events'] = df.events.apply(lambda x: "Rain" if x == "rain" else x)

def encode(df):
    vectorcito = (1, 2, 4, 5, 6, 7, 28)
    print("Encoding:")
    print(df.iloc[:, vectorcito].dtypes)
    encoder = sklearn.preprocessing.OneHotEncoder(categorical_features=vectorcito)
    return pd.DataFrame(encoder.fit_transform(df).toarray())

from pandas.tseries.holiday import USFederalHolidayCalendar as calendarUSFH
cal = calendarUSFH()
holidays = cal.holidays(return_name=True, start=pd.to_datetime('20130101'), end=pd.to_datetime('20151231'))
holidays = holidays.to_frame().reset_index().rename(columns={'index': 'date', 0: 'holiday'})
holidays['date'] = holidays.date.apply(lambda x: x.date().isoformat())

import sklearn.preprocessing

In [6]:
s = preprocessStation(station)
s.to_csv('data/stationP', index=False)
w = preprocessWeather(weather)
w.to_csv('data/weatherP', index=False)
t = preprocessTrip(trip)
t.to_csv('data/tripP', index=False)

In [7]:
t.sample(10)

Unnamed: 0,id,duration,date,start_station_id,subscription_type,time,year,month,day,dayofweek,date.1,holiday
501380,667118,511,2015-03-04,51,1,548,2015,3,4,3,,
667939,435838,576,2014-09-03,73,1,529,2014,9,3,3,,
186911,373138,346,2014-07-21,68,1,1086,2014,7,21,1,,
335533,887753,668,2015-08-13,70,1,447,2015,8,13,4,,
510153,654487,358,2015-02-23,63,1,998,2015,2,23,1,,
613776,511917,221,2014-10-23,64,1,668,2014,10,23,4,,
472796,707031,310,2015-04-01,73,1,489,2015,4,1,3,,
202349,351640,678,2014-07-05,50,1,1045,2014,7,5,6,,
472797,707027,523,2015-04-01,67,1,488,2015,4,1,3,,
239431,301751,693,2014-05-28,41,1,1115,2014,5,28,3,,
