# Lasso Regression

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
import numpy as np

### Import Datasets and Merge

In [2]:
# Random 0.003% sample of entire 2019 dataset
df_003 = pd.read_csv("data/rideshare_003_clean.csv", compression="gzip", index_col=0)

In [None]:
# something happens to pickup_census_tract and dropoff_census_tract when read in...

In [3]:
df_003.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 287763 entries, 11025760 to 85528272
Data columns (total 28 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   trip_id                     287763 non-null  object 
 1   trip_start_timestamp        287763 non-null  object 
 2   trip_end_timestamp          287763 non-null  object 
 3   trip_seconds                287763 non-null  float64
 4   trip_miles                  287763 non-null  float64
 5   pickup_census_tract         223044 non-null  float64
 6   dropoff_census_tract        223044 non-null  float64
 7   pickup_community_area       287532 non-null  float64
 8   dropoff_community_area      287515 non-null  float64
 9   fare                        287763 non-null  float64
 10  tip                         287763 non-null  float64
 11  additional_charges          287763 non-null  float64
 12  trip_total                  287763 non-null  float64
 13  share

In [3]:
# join on pickup lat/long
cmap = gpd.read_file("data/community_area/com_cmap.shp")
gdf_pickup = gpd.GeoDataFrame(df_003, geometry=gpd.points_from_xy(df_003["pickup_centroid_longitude"], 
                                                       df_003["pickup_centroid_latitude"]), crs="EPSG:4326")
gdf_cmap = gpd.sjoin(gdf_pickup, cmap, op='within')

  if (await self.run_code(code, result,  async_=asy)):


In [4]:
# rename columns 28-67 to specify _pickup
gdf_cmap = gdf_cmap.rename(mapper={col:col+ "_pickup" for col in list(gdf_cmap.columns[28:])}, axis=1)

In [5]:
# join on dropoff lat/long
gdf_dropoff = gpd.GeoDataFrame(gdf_cmap, geometry=gpd.points_from_xy(gdf_cmap["dropoff_centroid_longitude"], 
                                                       gdf_cmap["dropoff_centroid_latitude"]), crs="EPSG:4326")
gdf_cmap = gpd.sjoin(gdf_dropoff, cmap, op='within')

  if (await self.run_code(code, result,  async_=asy)):


In [6]:
# rename columns 68-108 to specify _pickup
gdf_cmap = gdf_cmap.rename(mapper={col:col+ "_dropoff" for col in list(gdf_cmap.columns[68:])}, axis=1)

In [9]:
gdf_cmap.shape

(287727, 108)

### Drop non-numeric columns

In [7]:
# drop columns
gdf_cmap_model = gdf_cmap.drop(["trip_id", "index_right_pickup", "index_right_dropoff"], axis=1)

In [8]:
# ,"Pickup Census Tract", "Dropoff Census Tract",
non_num_cols = ["trip_start_timestamp",
                "trip_end_timestamp",
                "geometry_pickup",
                'geometry_dropoff',
                'pickup_centroid_location',
                'dropoff_centroid_location',
                'date',
                'geog_pickup',
                'geog_dropoff',
                'pickup_census_tract', # these two have nas for some reason
                'dropoff_census_tract',
                "pickup_community_area",
                "dropoff_community_area"]

In [9]:
gdf_cmap_model = gdf_cmap_model.astype({'area_num_pickup' : int, 'area_num_dropoff' : int})                          

In [10]:
# drop non-numeric cols
gdf_cmap_model = gdf_cmap_model.drop(columns=non_num_cols)

## Split Train Test

After merging our 2 datasets, the first thing we will do is split into training and test sets. We will not touch the test set until we make final predictions.

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# split df into features and labels
X = gdf_cmap_model.drop(columns=["shared_trip_auth_num", "trips_pooled", 'shared_trip_authorized'])
y = pd.DataFrame(gdf_cmap_model["shared_trip_auth_num"])

# split df into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.3)

# df that's just training
df_train = pd.concat([X_train, y_train], axis=1)

In [90]:
X_train.shape

(201408, 89)

Our training dataset has 89 numeric columns. We will use these to predict the column shared_trip_auth_num.

## Lasso Regression  - All Features

Make a simple Pipeline

In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

In [14]:
pipeline = Pipeline([('scaler', StandardScaler()),
                    ('model', Lasso())])

Cross Validation

In [16]:
param_grid = {'model__alpha': np.arange(0.1, 10.1, 0.2)}

In [17]:
# play with different ways of measuring error
# explain what happens when we change alpha

In [18]:
scoring_metrics = ["neg_mean_squared_error", "r2", "neg_mean_absolute_error"]

In [19]:
grid_search = GridSearchCV(pipeline, param_grid, scoring=scoring_metrics, refit = "r2")
grid_search.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('model', Lasso())]),
             param_grid={'model__alpha': array([0.1, 0.3, 0.5, 0.7, 0.9, 1.1, 1.3, 1.5, 1.7, 1.9, 2.1, 2.3, 2.5,
       2.7, 2.9, 3.1, 3.3, 3.5, 3.7, 3.9, 4.1, 4.3, 4.5, 4.7, 4.9, 5.1,
       5.3, 5.5, 5.7, 5.9, 6.1, 6.3, 6.5, 6.7, 6.9, 7.1, 7.3, 7.5, 7.7,
       7.9, 8.1, 8.3, 8.5, 8.7, 8.9, 9.1, 9.3, 9.5, 9.7, 9.9])},
             refit='r2',
             scoring=['neg_mean_squared_error', 'r2',
                      'neg_mean_absolute_error'])

In [20]:
grid_search.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()), ('model', Lasso(alpha=0.1))])

Using the best model

In [21]:
lasso_reg = grid_search.best_estimator_

In [22]:
lasso_reg.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()), ('model', Lasso(alpha=0.1))])

In [23]:
y_pred = lasso_reg.predict(X_test)

In [24]:
# need to get categorical label from numeric
y_pred = np.sign(y_pred)

In [25]:
err = mean_squared_error(np.array(y_test), y_pred)

In [26]:
err

0.7467649069150477

In [27]:
lasso_reg.n_features_in_

89

In [28]:
features = np.array(X_train.columns)

In [29]:
coefficients = lasso_reg[1].coef_

In [49]:
feat_coeffs = pd.DataFrame(data=features[(coefficients > 0.001) | (coefficients < -0.001)], columns=["feature"])

In [47]:
coefficients[(coefficients<-0.001)|(coefficients>0.001)]

array([ 0.05066144, -0.21261021, -0.04964191, -0.00818692, -0.02866947,
       -0.00348549, -0.04047664])

In [50]:
feat_coeffs['coeff'] = coefficients[(coefficients<-0.001)|(coefficients>0.001)]

In [51]:
feat_coeffs.sort_values(by='coeff', ascending=False)

Unnamed: 0,feature,coeff
0,trip_seconds,0.050661
5,perc_emp_dropoff,-0.003485
3,perc_emp_pickup,-0.008187
4,perc_bach_pickup,-0.028669
6,perc_bach_dropoff,-0.040477
2,trip_total_per_mile,-0.049642
1,additional_charges,-0.21261


In [52]:
importance = np.abs(coefficients)

In [53]:
feat_imp = pd.DataFrame(data=features[importance > 0.001], columns=["feature"])

In [54]:
feat_imp['importance'] = importance[importance > 0.001]

In [55]:
feat_imp.sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
1,additional_charges,0.21261
0,trip_seconds,0.050661
2,trip_total_per_mile,0.049642
6,perc_bach_dropoff,0.040477
4,perc_bach_pickup,0.028669
3,perc_emp_pickup,0.008187
5,perc_emp_dropoff,0.003485


In [158]:
lasso_reg[1].intercept_

array([-0.61874404])

## Lasso Regression  - Hand Picked Features

Out of the 89 features, many are related to one another (some are simple duplicates with a different scale, for example hours vs minutes). I will now run that same Lasso Regression from above, but with a hand-picked selection of unique features.

In [56]:
X_new = gdf_cmap_model.drop(columns=['trip_seconds',
                                 'trip_miles',
                                 'fare',
                                 'tip',
                                 'additional_charges',
                                 'trip_total',
                                 'shared_trip_authorized', 'trips_pooled',
                                 'pickup_centroid_latitude',
                                 'pickup_centroid_longitude',
                                 'dropoff_centroid_latitude',
                                 'dropoff_centroid_longitude', 'trip_start_month',
                                 "shared_trip_auth_num"])

In [57]:
# split df into features and labels
y_new = pd.DataFrame(gdf_cmap_model["shared_trip_auth_num"])

# split df into train and test
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X_new, y_new, random_state=0, test_size=0.3)

In [161]:
X_train_new.shape

(201408, 78)

Cross Validation

In [58]:
grid_search.fit(X_train_new, y_train_new)

GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('model', Lasso())]),
             param_grid={'model__alpha': array([0.1, 0.3, 0.5, 0.7, 0.9, 1.1, 1.3, 1.5, 1.7, 1.9, 2.1, 2.3, 2.5,
       2.7, 2.9, 3.1, 3.3, 3.5, 3.7, 3.9, 4.1, 4.3, 4.5, 4.7, 4.9, 5.1,
       5.3, 5.5, 5.7, 5.9, 6.1, 6.3, 6.5, 6.7, 6.9, 7.1, 7.3, 7.5, 7.7,
       7.9, 8.1, 8.3, 8.5, 8.7, 8.9, 9.1, 9.3, 9.5, 9.7, 9.9])},
             refit='r2',
             scoring=['neg_mean_squared_error', 'r2',
                      'neg_mean_absolute_error'])

In [59]:
grid_search.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()), ('model', Lasso(alpha=0.1))])

In [60]:
lasso_reg_new = grid_search.best_estimator_

Using the best model

In [61]:
lasso_reg_new.fit(X_train_new, y_train_new)

Pipeline(steps=[('scaler', StandardScaler()), ('model', Lasso(alpha=0.1))])

In [62]:
y_pred_new = lasso_reg_new.predict(X_test_new)

In [63]:
err_new = mean_squared_error(np.array(y_test_new), y_pred_new)

In [64]:
err_new

0.5680535611925737

In [65]:
features_new = np.array(X_train_new.columns)

In [66]:
coefficients_new = lasso_reg_new[1].coef_

In [67]:
feat_coeff_new = pd.DataFrame(data=features_new[(coefficients_new > 0.001)|(coefficients_new < -0.001)], columns=["feature"])

In [70]:
feat_coeff_new['coeff'] = coefficients_new[(coefficients_new > 0.001) | (coefficients_new < -0.001)]

In [71]:
feat_coeff_new.sort_values(by='coeff', ascending=False)

Unnamed: 0,feature,coeff
2,perc_tot_c_pickup,-0.00449
1,perc_white_pickup,-0.040187
3,perc_white_dropoff,-0.054321
0,trip_total_per_mile,-0.064994


In [72]:
importance_new = np.abs(coefficients_new)

In [73]:
feat_imp_new = pd.DataFrame(data=features_new[importance_new > 0.001], columns=["feature"])

In [74]:
feat_imp_new['importance'] = importance_new[importance_new > 0.001]

In [75]:
feat_imp_new.sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
0,trip_total_per_mile,0.064994
3,perc_white_dropoff,0.054321
1,perc_white_pickup,0.040187
2,perc_tot_c_pickup,0.00449


In [176]:
lasso_reg_new[1].intercept_

array([-0.61874404])