<a href="https://colab.research.google.com/github/muoyo/chicago-ridesharing/blob/master/notebooks/rideshare_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sodapy

In [16]:
import time
import numpy as np
import pandas as pd
import seaborn as sns
from sodapy import Socrata
import statsmodels.api as sm
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_squared_log_error
%matplotlib inline

In [17]:
def get_random_samples(client, num_samples=200, sample_size=1000, verbose=False):
    
    start = time.time()

    # Perform a $select=count(*) query to determine how large the set is
    results = client.get("m6dm-c72p", select='count(*)' )
    total_rows = int(results[0].get('count', 0))
    row_indices = np.arange(0, total_rows, sample_size)
    results = []

    # Use rand() locally to come up with some offsets
    sample_offsets = np.random.choice(row_indices, size=num_samples, replace=False)


    # Use $limit and $offset in conjunction with a stable $order to pick out individual records. 
    # Ex: $order=facility_id&$limit=1&$offset=<some rand() number>
    for i, offset in enumerate (sample_offsets):

        if verbose:
            print(f'Sample {i}: offset={offset},sample_size={sample_size}')
            print('Pure Python time:', time.time() - start, 'sec.')
        results.extend(client.get("m6dm-c72p", order='trip_id', limit=sample_size, offset=offset, 
                                                  select='''trip_id, trip_start_timestamp, pickup_community_area, fare, tip, trip_total'''))
        
    if verbose:
        print('Pure Python time:', time.time() - start, 'sec.')

    return results

In [18]:
# %run ../python_files/utils
# %run ../python_files/data_cleaning

# df = get_trip_records(100000)
# df = clean_data(df)
# df

In [19]:
# samples = get_random_samples(client, verbose=True )
# samples[:10]

In [20]:
# samples_df = pd.DataFrame.from_records(samples)
# samples_df

In [21]:
client = Socrata('data.cityofchicago.org',
             'Tk6RhuGAFvF9P4ehsysybj3IW',
             username="mokome@gmail.com",
             password="Ch1cago!!")

client.timeout = 10000

results = client.get("m6dm-c72p", limit=100000, select='trip_id, trip_start_timestamp, trip_end_timestamp, trip_seconds, trip_miles, pickup_community_area, dropoff_community_area, fare, tip, additional_charges, trip_total' )

# Convert to pandas DataFrame
results_df = pd.DataFrame.from_records(results)
results_df

Unnamed: 0,trip_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tip,additional_charges,trip_total
0,31881154328f0e8829fa61215edd2c4f6897bb56,2019-07-19T08:00:00.000,2019-07-19T08:15:00.000,835,2.15573706224,24,8,7.5,0,2.55,10.05
1,3188131b970f8bd6b38f494f9a8513820b784384,2019-09-27T19:30:00.000,2019-09-27T19:30:00.000,536,1.86615116688,28,34,7.5,0,2.55,10.05
2,318813e4563c839011469533c0df7960b2e26ba4,2019-08-22T19:00:00.000,2019-08-22T19:15:00.000,992,3.587869,32,24,10,0,2.55,12.55
3,318814d82a07c53860c4eb08a21ff2403edbe9b5,2019-07-05T13:15:00.000,2019-07-05T13:30:00.000,899,2.863703,32,28,7.5,3,2.55,13.05
4,3188151379849598b97f306682832dd07e425743,2019-07-15T19:30:00.000,2019-07-15T20:00:00.000,1153,10.896428,62,24,15,0,2.55,17.55
...,...,...,...,...,...,...,...,...,...,...,...
99995,3277980a4bf4e6086c2af62f8d56645c811fb0b2,2019-09-22T17:00:00.000,2019-09-22T17:00:00.000,276,1.167083,24,24,5,0,2.55,7.55
99996,3277998c04c3c793114d0d9280d36d29bead28c1,2019-08-10T12:15:00.000,2019-08-10T12:45:00.000,2164,8.445811,28,6,17.5,0,2.55,20.05
99997,327799d0e305c2263129352b9da90c1ae87528bf,2019-08-11T10:15:00.000,2019-08-11T10:45:00.000,1355,8.44646242432,4,32,15,4,2.55,21.55
99998,327799fe68bc9a750e45d5dfddedc834bc37395e,2019-09-11T14:30:00.000,2019-09-11T14:45:00.000,432,2.43254,43,48,5,0,2.55,7.55


In [22]:
columns_to_use = ['trip_id', 'trip_start_timestamp', 'trip_end_timestamp', 'trip_seconds',
       'trip_miles', 'pickup_community_area', 'fare', 'tip',
       'additional_charges', 'trip_total' ]

columns_to_drop = [ col for col in results_df.columns if col not in columns_to_use ]
df = results_df.drop(columns=columns_to_drop)

df['trip_start_timestamp'] = pd.to_datetime(df['trip_start_timestamp'])
df['trip_end_timestamp'] = pd.to_datetime(df['trip_end_timestamp'])
df['trip_seconds'] = df['trip_seconds'].fillna('0')
df['trip_seconds'] = df['trip_seconds'].astype('int64')

for col in ['trip_miles', 'fare', 'tip', 'additional_charges', 'trip_total']:
    df[col] = df[col].astype(float) 

df['start_weekday'] = df['trip_start_timestamp'].apply(lambda d: d.weekday())
df['start_hour'] = df['trip_start_timestamp'].apply(lambda d: d.hour)
df['start_time_block'] = df['start_hour'] // 3

df['start_date_plus_hour'] = df['trip_start_timestamp'].apply(lambda d: datetime(d.year, d.month, d.day, d.hour))

weather_df = pd.read_csv('../data/chicago_weather.csv')
weather_df['hour'] = weather_df['hour'].apply(lambda x: '{:02d}'.format(x))
weather_df['start_date_plus_hour'] = pd.to_datetime(weather_df['date'] + ' ' + weather_df['hour'] + ':00:00')
weather_df = weather_df.rename(columns={'icon': 'precip'})

# def set_precip(precip):
#     if precip not in ['rain', 'snow']: 
#         precip = 'clear'

#     return precip

# weather_df['precip'] = weather_df['precip'].apply(set_precip)
precip_df = weather_df[['start_date_plus_hour', 'precip', 'apparentTemperature']]

df = df.merge(precip_df, how='left', on='start_date_plus_hour')
df.head()

Unnamed: 0,trip_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,fare,tip,additional_charges,trip_total,start_weekday,start_hour,start_time_block,start_date_plus_hour,precip,apparentTemperature
0,31881154328f0e8829fa61215edd2c4f6897bb56,2019-07-19 08:00:00,2019-07-19 08:15:00,835,2.155737,24,7.5,0.0,2.55,10.05,4,8,2,2019-07-19 08:00:00,partly-cloudy-day,87.7
1,3188131b970f8bd6b38f494f9a8513820b784384,2019-09-27 19:30:00,2019-09-27 19:30:00,536,1.866151,28,7.5,0.0,2.55,10.05,4,19,6,2019-09-27 19:00:00,rain,68.38
2,318813e4563c839011469533c0df7960b2e26ba4,2019-08-22 19:00:00,2019-08-22 19:15:00,992,3.587869,32,10.0,0.0,2.55,12.55,3,19,6,2019-08-22 19:00:00,partly-cloudy-day,71.46
3,318814d82a07c53860c4eb08a21ff2403edbe9b5,2019-07-05 13:15:00,2019-07-05 13:30:00,899,2.863703,32,7.5,3.0,2.55,13.05,4,13,4,2019-07-05 13:00:00,clear-day,94.0
4,3188151379849598b97f306682832dd07e425743,2019-07-15 19:30:00,2019-07-15 20:00:00,1153,10.896428,62,15.0,0.0,2.55,17.55,0,19,6,2019-07-15 19:00:00,clear-day,87.16


In [9]:
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 16 columns):
trip_id                  100000 non-null object
trip_start_timestamp     100000 non-null datetime64[ns]
trip_end_timestamp       100000 non-null datetime64[ns]
trip_seconds             100000 non-null int64
trip_miles               99993 non-null float64
pickup_community_area    93718 non-null object
fare                     100000 non-null float64
tip                      100000 non-null float64
additional_charges       100000 non-null float64
trip_total               100000 non-null float64
start_weekday            100000 non-null int64
start_hour               100000 non-null int64
start_time_block         100000 non-null int64
start_date_plus_hour     100000 non-null datetime64[ns]
precip                   100000 non-null object
apparentTemperature      100000 non-null float64
dtypes: datetime64[ns](3), float64(6), int64(4), object(3)
memory usage: 13.0+ MB


In [34]:
# Limit to the columns we are interested in: 
# 'apparentTemperature', 'start_weekday', 'start_hour', (OR 'start_time_block'), 'pickup_community_area' 

columns_to_use = ['apparentTemperature', 'start_weekday', 'start_hour', 'pickup_community_area']
columns_to_drop = [ col for col in df.columns if col not in columns_to_use ]

# Use dependent variables listed above to predict the independent variable: 'trip_total' OR 'fare'
X = df.drop(columns=columns_to_drop)
y = df['trip_total']


# deal with any null values
X['apparentTemperature']=X['apparentTemperature'].fillna(X['apparentTemperature'].median())
X['pickup_community_area']=X['pickup_community_area'].fillna('0')


# Split out continuous & categorical variables
cont_cols = ['apparentTemperature']
cat_cols = [ col for col in columns_to_use if col not in cont_cols ]

enc = OneHotEncoder()

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

X_train_cont = X_train[cont_cols]
X_train_cat = X_train[cat_cols]

# ss = StandardScaler()
# X_train_cont = pd.DataFrame(ss.fit_transform(X_train_cont))
X_train_cont = (X_train_cont - X_train_cont.mean()) / (X_train_cont.max() - X_train_cont.min())

# Transform training set
X_train_enc = enc.fit_transform(X_train_cat, y_train)

# Convert these columns into a DataFrame 
columns = enc.get_feature_names(input_features=X_train_cat.columns)
X_train_cat = pd.DataFrame(X_train_enc.todense(), columns=columns, index=X_train.index)

# Combine categorical and continuous features into the final dataframe
X_train = pd.concat([X_train_cont, X_train_cat], axis=1)
X_train_const = sm.add_constant(X_train)

# Fit model & show summary
model = sm.OLS(y_train,X_train_const).fit()
model.summary()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
  return ptp(axis=axis, out=out, **kwargs)


0,1,2,3
Dep. Variable:,trip_total,R-squared:,0.284
Model:,OLS,Adj. R-squared:,0.283
Method:,Least Squares,F-statistic:,277.4
Date:,"Fri, 24 Jan 2020",Prob (F-statistic):,0.0
Time:,22:24:12,Log-Likelihood:,-279330.0
No. Observations:,75000,AIC:,558900.0
Df Residuals:,74892,BIC:,559900.0
Df Model:,107,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,12.4125,0.076,163.281,0.000,12.263,12.561
apparentTemperature,-0.3713,0.262,-1.419,0.156,-0.884,0.142
start_weekday_0.0,1.7193,0.098,17.456,0.000,1.526,1.912
start_weekday_1.0,1.5456,0.098,15.716,0.000,1.353,1.738
start_weekday_2.0,1.7978,0.095,18.831,0.000,1.611,1.985
start_weekday_3.0,2.1370,0.091,23.430,0.000,1.958,2.316
start_weekday_4.0,1.8886,0.085,22.309,0.000,1.723,2.055
start_weekday_5.0,1.6109,0.083,19.469,0.000,1.449,1.773
start_weekday_6.0,1.7132,0.092,18.544,0.000,1.532,1.894

0,1,2,3
Omnibus:,65549.38,Durbin-Watson:,2.007
Prob(Omnibus):,0.0,Jarque-Bera (JB):,7168847.033
Skew:,3.719,Prob(JB):,0.0
Kurtosis:,50.315,Cond. No.,1.12e+16


In [69]:
# %run ../python_files/regression

# # Split data into training and test sets
# X_train, X_test, y_train, y_test = get_train_test_split(df, test_size=0.25)
# X_train_const = sm.add_constant(X_train)

# # Fit model & show summary
# OLS(y_train,X_train_const)


In [70]:
# LinearRegression(X_train, y_train)

In [35]:
# Fit the model
linreg = LinearRegression()
linreg.fit(X_train, y_train)

# Print R2 and MSE for training set
print('Training r^2:', linreg.score(X_train, y_train))
print('Training MSE:', mean_squared_error(y_train, linreg.predict(X_train)))

linreg.coef_

Training r^2: 0.2837464547770211
Training MSE: 100.57778168573569


array([-3.73834079e-01,  8.15109769e+13,  8.15109769e+13,  8.15109769e+13,
        8.15109769e+13,  8.15109769e+13,  8.15109769e+13,  8.15109769e+13,
       -2.35384989e+13, -2.35384989e+13, -2.35384989e+13, -2.35384989e+13,
       -2.35384989e+13, -2.35384989e+13, -2.35384989e+13, -2.35384989e+13,
       -2.35384989e+13, -2.35384989e+13, -2.35384989e+13, -2.35384989e+13,
       -2.35384989e+13, -2.35384989e+13, -2.35384989e+13, -2.35384989e+13,
       -2.35384989e+13, -2.35384989e+13, -2.35384989e+13, -2.35384989e+13,
       -2.35384989e+13, -2.35384989e+13, -2.35384989e+13, -2.35384989e+13,
        5.56833557e+12,  5.56833557e+12,  5.56833557e+12,  5.56833557e+12,
        5.56833557e+12,  5.56833557e+12,  5.56833557e+12,  5.56833557e+12,
        5.56833557e+12,  5.56833557e+12,  5.56833557e+12,  5.56833557e+12,
        5.56833557e+12,  5.56833557e+12,  5.56833557e+12,  5.56833557e+12,
        5.56833557e+12,  5.56833557e+12,  5.56833557e+12,  5.56833557e+12,
        5.56833557e+12,  

In [40]:
# Lasso(X_train, y_train)

In [38]:
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=1) # Lasso is also known as the L1 norm 
lasso.fit(X_train, y_train)

print('Training r^2:', lasso.score(X_train, y_train))
print('Training MSE:', mean_squared_error(y_train, lasso.predict(X_train)))

lasso.coef_

Training r^2: 0.0
Training MSE: 140.42203680042454


array([-0.,  0.,  0.,  0.,  0., -0., -0., -0., -0., -0., -0., -0.,  0.,
        0.,  0.,  0., -0., -0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
       -0., -0., -0., -0., -0., -0.,  0., -0., -0., -0., -0., -0., -0.,
       -0., -0., -0., -0., -0.,  0., -0., -0., -0., -0., -0., -0., -0.,
       -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0.,
       -0., -0., -0., -0., -0., -0., -0., -0.,  0., -0., -0., -0., -0.,
       -0., -0.,  0., -0., -0.,  0.,  0., -0., -0.,  0.,  0., -0., -0.,
       -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0.,
       -0., -0.,  0., -0.,  0., -0.])

In [41]:
# Ridge(X_train, y_train)

Ridge(alpha=       apparentTemperature  start_weekday_0.0  start_weekday_1.0  \
38256             0.048947                0.0                1.0   
38694            -0.064473                1.0                0.0   
62878             0.128963                0.0                0.0   
3778             -0.198091                0.0                1.0   
106               0.048364                0.0                1.0   
...                    ...                ...                ...   
92980             0.117698                0.0                1.0   
22806             0.189169                0.0                0.0   
44939             0.085071                0.0                0.0   
9977             -0.145460                0.0                0.0   
65664             0.064096                0.0                0.0   

       start_weekday_2.0  start_weekday_3.0  start_weekday_4.0  \
38256                0.0                0.0                0.0   
38694                0.0               

In [42]:
from sklearn.linear_model import Ridge

ridge = Ridge(alpha=10) # Ridge is also known as the L2 norm
ridge.fit(X_train, y_train)

print('Training r^2:', ridge.score(X_train, y_train))
print('Training MSE:', mean_squared_error(y_train, ridge.predict(X_train)))

ridge.coef_

Training r^2: 0.2838234256496749
Training MSE: 100.56697327902333


array([-0.3751483 , -0.0510355 , -0.22654413,  0.0256785 ,  0.36369559,
        0.11378174, -0.16593971, -0.05963648, -1.16656486, -1.776313  ,
       -1.53137592,  0.05272612,  3.68708942,  4.58978814,  3.06227829,
        0.64939948, -0.44482885, -0.5913494 , -0.1211509 ,  0.44949911,
        0.36213381,  0.08167645,  0.31357105,  0.87339306,  0.50166312,
        0.19113567, -1.04428307, -1.2531568 , -1.68906995, -1.86807576,
       -1.60051731, -1.72766791, 13.44746293,  0.30916703, -1.2317082 ,
       -0.42074068, -0.61231915, -0.72702904, -0.5886516 , -1.97841401,
       -0.63382312,  0.18331144,  1.0190036 ,  0.48712961,  1.92151649,
       -1.30196274,  0.07469235, -0.43385258, -0.02361705,  0.19385749,
       -0.8925505 , -0.95959313, -1.03056936, -1.60328934, -1.4598328 ,
       -1.90983419, -1.96354384, -1.20579905, -1.99871043, -1.69403874,
       -1.50448069, -1.81511336, -0.94388321, -0.99900994,  0.30458026,
        1.26074038, -0.40653295, -1.73946402, -1.49716196, -4.03