<a href="https://colab.research.google.com/github/muoyo/chicago-ridesharing/blob/master/notebooks/rideshare_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sodapy

In [16]:
import os
import time
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sodapy import Socrata
%matplotlib inline

In [17]:
def get_random_samples(client, num_samples=200, sample_size=1000, verbose=False,
                       dataset_id="m6dm-c72p"):
    """Download randomly chosen pages of trip records from a Socrata dataset.

    The dataset is viewed as consecutive pages of ``sample_size`` rows under a
    stable ``trip_id`` ordering. Distinct page offsets are drawn at random
    (without replacement) and each page is fetched with ``limit``/``offset``.

    Parameters
    ----------
    client : object
        Anything exposing ``get(dataset_id, **query_params)`` that returns a
        list of dict records (the notebook passes a ``sodapy.Socrata`` client).
    num_samples : int, default 200
        Number of pages to fetch. If the dataset has fewer pages than
        requested, every page is fetched instead of raising.
    sample_size : int, default 1000
        Number of rows per page (the final page of the dataset may be shorter).
    verbose : bool, default False
        When true, print the offset of each request and the elapsed time.
    dataset_id : str, default "m6dm-c72p"
        Identifier of the dataset to query.

    Returns
    -------
    list
        The records of all fetched pages, concatenated in fetch order.
        Empty when the dataset reports no rows.

    Raises
    ------
    ValueError
        If ``sample_size`` is not positive or ``num_samples`` is negative.
    """
    if sample_size < 1:
        raise ValueError('sample_size must be a positive integer')
    if num_samples < 0:
        raise ValueError('num_samples must not be negative')

    start = time.time()

    # Perform a $select=count(*) query to determine how large the set is
    count_rows = client.get(dataset_id, select='count(*)')
    total_rows = int(count_rows[0].get('count', 0)) if count_rows else 0

    # First-row offset of every page in the dataset
    row_indices = np.arange(0, total_rows, sample_size)
    if row_indices.size == 0:
        return []

    # Sampling without replacement cannot draw more pages than exist, so clamp
    # the request instead of letting np.random.choice raise.
    pages_to_draw = min(num_samples, row_indices.size)
    sample_offsets = np.random.choice(row_indices, size=pages_to_draw, replace=False)

    results = []

    # Use $limit and $offset in conjunction with a stable $order to pick out
    # one page of records per drawn offset.
    for i, offset in enumerate(sample_offsets.tolist()):

        if verbose:
            print(f'Sample {i}: offset={offset},sample_size={sample_size}')
            print('Pure Python time:', time.time() - start, 'sec.')
        results.extend(client.get(dataset_id, order='trip_id', limit=sample_size, offset=offset,
                                  select='''trip_id, trip_start_timestamp, pickup_community_area, fare, tip, trip_total'''))

    if verbose:
        print('Pure Python time:', time.time() - start, 'sec.')

    return results

In [18]:
# %run ../python_files/utils
# %run ../python_files/data_cleaning

# df = get_trip_records(100000)
# df = clean_data(df)
# df

In [19]:
# samples = get_random_samples(client, verbose=True )
# samples[:10]

In [20]:
# samples_df = pd.DataFrame.from_records(samples)
# samples_df

In [None]:
# Credentials are read from environment variables so that no secret is stored
# in the notebook. The app token, username and password that used to be
# hard-coded in this cell have been published with the notebook and should be
# rotated.
#   SOCRATA_APP_TOKEN                    - optional; without it requests are throttled
#   SOCRATA_USERNAME / SOCRATA_PASSWORD  - optional; set both or neither
client = Socrata('data.cityofchicago.org',
                 os.environ.get('SOCRATA_APP_TOKEN'),
                 username=os.environ.get('SOCRATA_USERNAME'),
                 password=os.environ.get('SOCRATA_PASSWORD'))

# Very generous timeout because the bulk request below is large
# (presumably seconds -- confirm against the sodapy documentation)
client.timeout = 10000

# Download up to one million trips, restricted to the fields used in this analysis
results = client.get("m6dm-c72p", limit=1000000, select='trip_id, trip_start_timestamp, trip_end_timestamp, trip_seconds, trip_miles, pickup_community_area, dropoff_community_area, fare, tip, additional_charges, trip_total' )

# Convert to pandas DataFrame
results_df = pd.DataFrame.from_records(results)
results_df

In [None]:
# Columns kept from the raw API response
columns_to_use = ['trip_id', 'trip_start_timestamp', 'trip_end_timestamp', 'trip_seconds',
       'trip_miles', 'pickup_community_area', 'fare', 'tip',
       'additional_charges', 'trip_total' ]

# drop() returns a new frame, so results_df (the raw download) stays untouched
# and this cell can be re-run safely.
columns_to_drop = [ col for col in results_df.columns if col not in columns_to_use ]
df = results_df.drop(columns=columns_to_drop)

# Parse the timestamp columns
for col in ['trip_start_timestamp', 'trip_end_timestamp']:
    df[col] = pd.to_datetime(df[col])

# Missing durations are treated as 0 seconds. to_numeric also accepts values
# such as "360.0", which a direct string -> int64 cast would reject.
df['trip_seconds'] = pd.to_numeric(df['trip_seconds']).fillna(0).astype('int64')

for col in ['trip_miles', 'fare', 'tip', 'additional_charges', 'trip_total']:
    df[col] = df[col].astype(float)

# Calendar features derived from the trip start. The vectorized .dt accessors
# avoid a Python-level call per row and tolerate missing timestamps (NaT).
start_ts = df['trip_start_timestamp']
df['start_weekday'] = start_ts.dt.weekday        # Monday=0 ... Sunday=6
df['start_hour'] = start_ts.dt.hour
df['start_time_block'] = df['start_hour'] // 3   # 3-hour blocks: 0..7

# Start time truncated to the hour; this is the join key for the hourly weather data
df['start_date_plus_hour'] = start_ts.dt.floor('h')

# Weather observations: build the same hourly key from the 'date' and 'hour' columns
weather_df = pd.read_csv('../data/chicago_weather.csv')
weather_df['hour'] = weather_df['hour'].map('{:02d}'.format)
weather_df['start_date_plus_hour'] = pd.to_datetime(weather_df['date'] + ' ' + weather_df['hour'] + ':00:00')
weather_df = weather_df.rename(columns={'icon': 'precip'})

# Optional bucketing of 'precip' into rain / snow / clear (currently disabled):
# def set_precip(precip):
#     if precip not in ['rain', 'snow']: 
#         precip = 'clear'

#     return precip

# weather_df['precip'] = weather_df['precip'].apply(set_precip)
precip_df = weather_df[['start_date_plus_hour', 'precip', 'apparentTemperature']]

# Left join keeps every trip; trips without a matching weather hour get NaN.
# NOTE(review): assumes at most one weather row per hour -- duplicate hours in
# the CSV would duplicate trips here. Confirm against the data file.
df = df.merge(precip_df, how='left', on='start_date_plus_hour')
df.head()

In [None]:
# info() prints its summary to stdout; head() has to be the final expression
# of the cell, otherwise its result is discarded and never rendered.
df.info()
df.head()

In [None]:
# Limit to the columns we are interested in: 
# 'apparentTemperature', 'start_weekday', 'start_hour', (OR 'start_time_block'), 'pickup_community_area' 

columns_to_use = ['apparentTemperature', 'start_weekday', 'start_hour', 'pickup_community_area']
columns_to_drop = [ col for col in df.columns if col not in columns_to_use ]

# Use the independent variables (features) listed above to predict the
# dependent variable (target): 'trip_total' OR 'fare'
X = df.drop(columns=columns_to_drop)
y = df['trip_total']


# deal with any null values: median temperature, and '0' as the
# "unknown pickup area" category
X['apparentTemperature'] = X['apparentTemperature'].fillna(X['apparentTemperature'].median())
X['pickup_community_area'] = X['pickup_community_area'].fillna('0')


# Split out continuous & categorical variables
cont_cols = ['apparentTemperature']
cat_cols = [ col for col in columns_to_use if col not in cont_cols ]

enc = OneHotEncoder()

# Split data into training and test sets. The fixed random_state makes the
# split (and every number reported below) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

X_train_cont = X_train[cont_cols]
X_train_cat = X_train[cat_cols]

# Mean-centre and scale by the range, using training-set statistics only
# ss = StandardScaler()
# X_train_cont = pd.DataFrame(ss.fit_transform(X_train_cont))
X_train_cont = (X_train_cont - X_train_cont.mean()) / (X_train_cont.max() - X_train_cont.min())

# One-hot encode the categorical training features
X_train_enc = enc.fit_transform(X_train_cat)

# Convert these columns into a DataFrame.
# get_feature_names was removed in scikit-learn 1.2 in favour of
# get_feature_names_out; use whichever the installed version provides.
feature_namer = getattr(enc, 'get_feature_names_out', None) or enc.get_feature_names
columns = feature_namer(X_train_cat.columns)
X_train_cat = pd.DataFrame(X_train_enc.toarray(), columns=columns, index=X_train.index)

# Combine categorical and continuous features into the final dataframe
# NOTE(review): X_test is not scaled/encoded in this cell, so it cannot be
# passed to the fitted models below without applying the same transformations.
X_train = pd.concat([X_train_cont, X_train_cat], axis=1)
X_train_const = sm.add_constant(X_train)

# Fit model & show summary
# NOTE(review): each one-hot group sums to 1, so together with the constant the
# design matrix is rank-deficient and individual coefficients are not uniquely
# determined; consider OneHotEncoder(drop='first') if they are to be interpreted.
model = sm.OLS(y_train,X_train_const).fit()
model.summary()

In [None]:
# %run ../python_files/regression

# # Split data into training and test sets
# X_train, X_test, y_train, y_test = get_train_test_split(df, test_size=0.25)
# X_train_const = sm.add_constant(X_train)

# # Fit model & show summary
# OLS(y_train,X_train_const)


In [None]:
# LinearRegression(X_train, y_train)

In [None]:
# Ordinary least-squares baseline on the prepared training features
linreg = LinearRegression()
linreg.fit(X_train, y_train)

# In-sample fit quality: coefficient of determination and mean squared error
linreg_train_r2 = linreg.score(X_train, y_train)
linreg_train_mse = mean_squared_error(y_train, linreg.predict(X_train))
print('Training r^2:', linreg_train_r2)
print('Training MSE:', linreg_train_mse)

# Show the fitted coefficients as the cell output
linreg.coef_

In [None]:
# Lasso(X_train, y_train)

In [None]:
# Lasso regression: least squares with an L1 penalty on the coefficients,
# with the penalty strength set by alpha.
# (Lasso is already imported in the notebook's import cell.)
lasso = Lasso(alpha=1)
lasso.fit(X_train, y_train)

# In-sample fit quality
print('Training r^2:', lasso.score(X_train, y_train))
print('Training MSE:', mean_squared_error(y_train, lasso.predict(X_train)))

# Show the fitted coefficients as the cell output
lasso.coef_

In [None]:
# Ridge(X_train, y_train)

In [None]:
# Ridge regression: least squares with an L2 penalty on the coefficients,
# with the penalty strength set by alpha.
# (Ridge is already imported in the notebook's import cell.)
ridge = Ridge(alpha=10)
ridge.fit(X_train, y_train)

# In-sample fit quality
print('Training r^2:', ridge.score(X_train, y_train))
print('Training MSE:', mean_squared_error(y_train, ridge.predict(X_train)))

# Show the fitted coefficients as the cell output
ridge.coef_