<a href="https://colab.research.google.com/github/muoyo/chicago-ridesharing/blob/master/notebooks/rideshare_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sodapy

In [1]:
import time
import numpy as np
import pandas as pd
import seaborn as sns
from sodapy import Socrata
import statsmodels.api as sm
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_squared_log_error
%matplotlib inline

In [2]:
def get_random_samples(client, num_samples=200, sample_size=1000, verbose=False):
    
    start = time.time()

    # Perform a $select=count(*) query to determine how large the set is
    results = client.get("m6dm-c72p", select='count(*)' )
    total_rows = int(results[0].get('count', 0))
    row_indices = np.arange(0, total_rows, sample_size)
    results = []

    # Use rand() locally to come up with some offsets
    sample_offsets = np.random.choice(row_indices, size=num_samples, replace=False)


    # Use $limit and $offset in conjunction with a stable $order to pick out individual records. 
    # Ex: $order=facility_id&$limit=1&$offset=<some rand() number>
    for i, offset in enumerate (sample_offsets):

        if verbose:
            print(f'Sample {i}: offset={offset},sample_size={sample_size}')
            print('Pure Python time:', time.time() - start, 'sec.')
        results.extend(client.get("m6dm-c72p", order='trip_id', limit=sample_size, offset=offset, 
                                                  select='''trip_id, trip_start_timestamp, pickup_community_area, fare, tip, trip_total'''))
        
    if verbose:
        print('Pure Python time:', time.time() - start, 'sec.')

    return results

In [3]:
# %run ../python_files/utils
# %run ../python_files/data_cleaning

# df = get_trip_records(100000)
# df = clean_data(df)
# df

In [4]:
# samples = get_random_samples(client, verbose=True )
# samples[:10]

In [5]:
# samples_df = pd.DataFrame.from_records(samples)
# samples_df

In [6]:
client = Socrata('data.cityofchicago.org',
             'Tk6RhuGAFvF9P4ehsysybj3IW',
             username="mokome@gmail.com",
             password="Ch1cago!!")

client.timeout = 10000

results = client.get("m6dm-c72p", limit=100000, select='trip_id, trip_start_timestamp, trip_end_timestamp, trip_seconds, trip_miles, pickup_community_area, dropoff_community_area, fare, tip, additional_charges, trip_total' )

# Convert to pandas DataFrame
results_df = pd.DataFrame.from_records(results)
results_df

Unnamed: 0,trip_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tip,additional_charges,trip_total
0,5148f64fd14b8e142e0c5fb1c2ed58f6aa38743d,2019-02-07T14:30:00.000,2019-02-07T15:15:00.000,3350,20.6402523052006,57,51,17.5,0,2.55,20.05
1,5148f6549b83e7114cd66caa2c5ba764099e89ca,2019-02-14T17:30:00.000,2019-02-14T17:45:00.000,896,3.09446962653971,27,30,7.5,0,2.55,10.05
2,5148f6991e0425097fe26b84496989bcfae469aa,2018-11-26T10:45:00.000,2018-11-26T11:00:00.000,364,1.26010158918999,14,4,5,0,2.5,7.5
3,5148f7249b5360d81894984073f4abfc7846bfd0,2019-03-24T11:00:00.000,2019-03-24T11:00:00.000,457,1.7586439568,2,,5,0,2.55,7.55
4,5148f7bb09610e6c78b4cdbea4d2afb1cbaa8d57,2018-11-26T11:15:00.000,2018-11-26T11:30:00.000,900,7.49194754176,3,8,12.5,0,2.5,15
...,...,...,...,...,...,...,...,...,...,...,...
99995,520da896c737782d88b94928940165d00dced496,2019-03-27T19:30:00.000,2019-03-27T20:15:00.000,2651,32.25402731088,,8,85,0,2.93,87.93
99996,520da8a8822861cfccbfe6c37e1fd15b523f9d54,2019-03-03T01:15:00.000,2019-03-03T01:30:00.000,827,2.93505883774111,22,16,5,0,2.55,7.55
99997,520da915de3c34254ac1039af77786fac6609d48,2019-03-15T23:45:00.000,2019-03-15T23:45:00.000,371,1.15667286437461,8,8,5,0,2.55,7.55
99998,520daa2380c8500de5641f246244352a3c21bbed,2019-01-20T15:30:00.000,2019-01-20T15:30:00.000,724,3.4972145441338,7,8,7.5,0,2.55,10.05


In [7]:
columns_to_use = ['trip_id', 'trip_start_timestamp', 'trip_end_timestamp', 'trip_seconds',
       'trip_miles', 'pickup_community_area', 'fare', 'tip',
       'additional_charges', 'trip_total' ]

columns_to_drop = [ col for col in results_df.columns if col not in columns_to_use ]
df = results_df.drop(columns=columns_to_drop)

df['trip_start_timestamp'] = pd.to_datetime(df['trip_start_timestamp'])
df['trip_end_timestamp'] = pd.to_datetime(df['trip_end_timestamp'])
df['trip_seconds'] = df['trip_seconds'].fillna('0')
df['trip_seconds'] = df['trip_seconds'].astype('int64')
df['pickup_community_area']=df['pickup_community_area'].fillna('0')


for col in ['trip_miles', 'fare', 'tip', 'additional_charges', 'trip_total']:
    df[col] = df[col].astype(float) 

df['start_weekday'] = df['trip_start_timestamp'].apply(lambda d: d.weekday())
df['start_hour'] = df['trip_start_timestamp'].apply(lambda d: d.hour)
df['start_time_block'] = df['start_hour'] // 3

df['start_date_plus_hour'] = df['trip_start_timestamp'].apply(lambda d: datetime(d.year, d.month, d.day, d.hour))

weather_df = pd.read_csv('../data/chicago_weather.csv')
weather_df['hour'] = weather_df['hour'].apply(lambda x: '{:02d}'.format(x))
weather_df['start_date_plus_hour'] = pd.to_datetime(weather_df['date'] + ' ' + weather_df['hour'] + ':00:00')
weather_df = weather_df.rename(columns={'icon': 'precip'})

# def set_precip(precip):
#     if precip not in ['rain', 'snow']: 
#         precip = 'clear'

#     return precip

# weather_df['precip'] = weather_df['precip'].apply(set_precip)
precip_df = weather_df[['start_date_plus_hour', 'precip', 'apparentTemperature']]
precip_df['apparentTemperature']=precip_df['apparentTemperature'].fillna(precip_df['apparentTemperature'].median())


df = df.merge(precip_df, how='left', on='start_date_plus_hour')
df.head()

Unnamed: 0,trip_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,fare,tip,additional_charges,trip_total,start_weekday,start_hour,start_time_block,start_date_plus_hour,precip,apparentTemperature
0,5148f64fd14b8e142e0c5fb1c2ed58f6aa38743d,2019-02-07 14:30:00,2019-02-07 15:15:00,3350,20.640252,57,17.5,0.0,2.55,20.05,3,14,4,2019-02-07 14:00:00,rain,35.14
1,5148f6549b83e7114cd66caa2c5ba764099e89ca,2019-02-14 17:30:00,2019-02-14 17:45:00,896,3.09447,27,7.5,0.0,2.55,10.05,3,17,5,2019-02-14 17:00:00,partly-cloudy-day,36.12
2,5148f6991e0425097fe26b84496989bcfae469aa,2018-11-26 10:45:00,2018-11-26 11:00:00,364,1.260102,14,5.0,0.0,2.5,7.5,0,10,3,2018-11-26 10:00:00,cloudy,16.25
3,5148f7249b5360d81894984073f4abfc7846bfd0,2019-03-24 11:00:00,2019-03-24 11:00:00,457,1.758644,2,5.0,0.0,2.55,7.55,6,11,3,2019-03-24 11:00:00,cloudy,44.16
4,5148f7bb09610e6c78b4cdbea4d2afb1cbaa8d57,2018-11-26 11:15:00,2018-11-26 11:30:00,900,7.491948,3,12.5,0.0,2.5,15.0,0,11,3,2018-11-26 11:00:00,cloudy,16.42


In [8]:
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 16 columns):
trip_id                  100000 non-null object
trip_start_timestamp     100000 non-null datetime64[ns]
trip_end_timestamp       100000 non-null datetime64[ns]
trip_seconds             100000 non-null int64
trip_miles               100000 non-null float64
pickup_community_area    100000 non-null object
fare                     100000 non-null float64
tip                      100000 non-null float64
additional_charges       100000 non-null float64
trip_total               100000 non-null float64
start_weekday            100000 non-null int64
start_hour               100000 non-null int64
start_time_block         100000 non-null int64
start_date_plus_hour     100000 non-null datetime64[ns]
precip                   99975 non-null object
apparentTemperature      99975 non-null float64
dtypes: datetime64[ns](3), float64(6), int64(4), object(3)
memory usage: 13.0+ MB


In [9]:
# Limit to the columns we are interested in: 
# 'apparentTemperature', 'start_weekday', 'start_hour', (OR 'start_time_block'), 'pickup_community_area' 

columns_to_use = ['apparentTemperature', 'start_weekday', 'start_hour', 'pickup_community_area']
columns_to_drop = [ col for col in df.columns if col not in columns_to_use ]

# Use dependent variables listed above to predict the independent variable: 'trip_total' OR 'fare'
X = df.drop(columns=columns_to_drop)
y = df['trip_total']


# deal with any null values


# Split out continuous & categorical variables
cont_cols = ['apparentTemperature']
cat_cols = [ col for col in columns_to_use if col not in cont_cols ]

enc = OneHotEncoder()

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

X_train_cont = X_train[cont_cols]
X_train_cat = X_train[cat_cols]

# ss = StandardScaler()
# X_train_cont = pd.DataFrame(ss.fit_transform(X_train_cont))
X_train_cont = (X_train_cont - X_train_cont.mean()) / (X_train_cont.max() - X_train_cont.min())

# Transform training set
X_train_enc = enc.fit_transform(X_train_cat, y_train)

# Convert these columns into a DataFrame 
columns = enc.get_feature_names(input_features=X_train_cat.columns)
X_train_cat = pd.DataFrame(X_train_enc.todense(), columns=columns, index=X_train.index)

# Combine categorical and continuous features into the final dataframe
X_train = pd.concat([X_train_cont, X_train_cat], axis=1)
X_train_const = sm.add_constant(X_train)

# Fit model & show summary
model = sm.OLS(y_train,X_train_const).fit()
model.summary()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
  return ptp(axis=axis, out=out, **kwargs)


0,1,2,3
Dep. Variable:,trip_total,R-squared:,0.272
Model:,OLS,Adj. R-squared:,0.27
Method:,Least Squares,F-statistic:,260.9
Date:,"Mon, 27 Jan 2020",Prob (F-statistic):,0.0
Time:,21:36:14,Log-Likelihood:,-272010.0
No. Observations:,75000,AIC:,544200.0
Df Residuals:,74892,BIC:,545200.0
Df Model:,107,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,11.5489,0.067,173.246,0.000,11.418,11.680
apparentTemperature,1.1311,0.242,4.676,0.000,0.657,1.605
start_weekday_0.0,1.7796,0.089,19.999,0.000,1.605,1.954
start_weekday_1.0,1.3410,0.086,15.585,0.000,1.172,1.510
start_weekday_2.0,1.4591,0.086,16.938,0.000,1.290,1.628
start_weekday_3.0,1.9700,0.082,23.998,0.000,1.809,2.131
start_weekday_4.0,1.9243,0.076,25.310,0.000,1.775,2.073
start_weekday_5.0,1.7242,0.077,22.324,0.000,1.573,1.876
start_weekday_6.0,1.3508,0.086,15.713,0.000,1.182,1.519

0,1,2,3
Omnibus:,57364.575,Durbin-Watson:,2.009
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2877638.123
Skew:,3.234,Prob(JB):,0.0
Kurtosis:,32.648,Cond. No.,1.12e+16


In [10]:
# %run ../python_files/regression

# # Split data into training and test sets
# X_train, X_test, y_train, y_test = get_train_test_split(df, test_size=0.25)
# X_train_const = sm.add_constant(X_train)

# # Fit model & show summary
# OLS(y_train,X_train_const)


In [11]:
# LinearRegression(X_train, y_train)

In [12]:
# Fit the model
linreg = LinearRegression()
linreg.fit(X_train, y_train)

# Print R2 and MSE for training set
print('Training r^2:', linreg.score(X_train, y_train))
print('Training MSE:', mean_squared_error(y_train, linreg.predict(X_train)))

linreg.coef_

Training r^2: 0.2714466230826168
Training MSE: 82.75207685413021


array([ 1.13221475e+00,  2.09079239e+13,  2.09079239e+13,  2.09079239e+13,
        2.09079239e+13,  2.09079239e+13,  2.09079239e+13,  2.09079239e+13,
       -5.40218390e+13, -5.40218390e+13, -5.40218390e+13, -5.40218390e+13,
       -5.40218390e+13, -5.40218390e+13, -5.40218390e+13, -5.40218390e+13,
       -5.40218390e+13, -5.40218390e+13, -5.40218390e+13, -5.40218390e+13,
       -5.40218390e+13, -5.40218390e+13, -5.40218390e+13, -5.40218390e+13,
       -5.40218390e+13, -5.40218390e+13, -5.40218390e+13, -5.40218390e+13,
       -5.40218390e+13, -5.40218390e+13, -5.40218390e+13, -5.40218390e+13,
        1.76005467e+12,  1.76005467e+12,  1.76005467e+12,  1.76005467e+12,
        1.76005467e+12,  1.76005467e+12,  1.76005467e+12,  1.76005467e+12,
        1.76005467e+12,  1.76005467e+12,  1.76005467e+12,  1.76005467e+12,
        1.76005467e+12,  1.76005467e+12,  1.76005467e+12,  1.76005467e+12,
        1.76005467e+12,  1.76005467e+12,  1.76005467e+12,  1.76005467e+12,
        1.76005467e+12,  

In [13]:
# Lasso(X_train, y_train)

In [14]:
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=1) # Lasso is also known as the L1 norm 
lasso.fit(X_train, y_train)

print('Training r^2:', lasso.score(X_train, y_train))
print('Training MSE:', mean_squared_error(y_train, lasso.predict(X_train)))

lasso.coef_

Training r^2: 0.0
Training MSE: 113.58409620481956


array([ 0.,  0., -0.,  0.,  0.,  0., -0., -0., -0., -0., -0.,  0.,  0.,
        0.,  0.,  0., -0., -0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
       -0., -0., -0., -0., -0., -0.,  0., -0., -0., -0., -0., -0., -0.,
       -0., -0., -0., -0., -0.,  0., -0., -0., -0., -0., -0.,  0., -0.,
       -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0.,
       -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0.,
        0., -0., -0., -0., -0.,  0.,  0.,  0.,  0.,  0.,  0., -0., -0.,
       -0., -0., -0., -0., -0.,  0., -0., -0., -0., -0., -0.,  0., -0.,
        0., -0.,  0., -0.,  0., -0.])

In [15]:
# Ridge(X_train, y_train)

In [16]:
from sklearn.linear_model import Ridge

ridge = Ridge(alpha=10) # Ridge is also known as the L2 norm
ridge.fit(X_train, y_train)

print('Training r^2:', ridge.score(X_train, y_train))
print('Training MSE:', mean_squared_error(y_train, ridge.predict(X_train)))

ridge.coef_

Training r^2: 0.27148993085000983
Training MSE: 82.74715778051224


array([ 1.12343526e+00,  1.31070440e-01, -3.08143366e-01, -1.90299750e-01,
        3.21469927e-01,  2.73788485e-01,  7.09189529e-02, -2.98804690e-01,
       -1.47691059e+00, -9.91740706e-01, -7.74148385e-01,  5.55727106e-01,
        3.33171158e+00,  4.63159293e+00,  3.13257967e+00,  4.80731843e-01,
       -1.64429387e-01, -4.19591856e-01, -1.00882449e-01, -6.02758071e-02,
       -2.63923861e-01, -7.82672381e-02, -4.57055459e-01,  2.46762636e-01,
       -4.57935111e-02, -1.74781824e-01, -8.96063571e-01, -1.36232770e+00,
       -1.40748154e+00, -1.37247224e+00, -1.08317259e+00, -1.24978703e+00,
        1.20470165e+01,  3.65202938e-02, -1.42295802e+00, -3.17664935e-01,
       -4.36674878e-01, -4.58906407e-01, -1.19023043e+00, -1.56386854e+00,
       -1.15744027e+00,  5.48771277e-01,  7.52962709e-03, -7.27935503e-01,
        1.46577780e+00, -9.47634800e-01, -6.27503549e-01, -1.33393135e-01,
       -2.64854380e-01,  1.92867458e-01,  1.10861646e+00, -1.18540384e+00,
       -1.07474898e+00, -

In [None]:
plt.figure(figsize=(15,10))
df = df.dropna(subset=['trip_total', 'trip_miles']).drop()

sns.distplot(df['trip_total']/df['trip_miles'])

In [None]:
df.head()