## 3.0 Cluster Analysis

In [13]:
# Standard libraries - run pip install if necessary
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime

# Geospatial libraries
from h3 import h3 
import geopandas as gp
import folium
from shapely.ops import unary_union
from shapely.geometry.polygon import Polygon
## Color for map 
import branca
import branca.colormap as cm

In [73]:
df = pd.read_csv("data/prepped/prep_taxidata.csv")

In [74]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_census_tract,dropoff_census_tract,fare,...,weekday,month,pickup_community,pickup_area_number,dropoff_community,dropoff_area_number,h3_res7_pickup,h3_res7_dropoff,h3_res8_pickup,h3_res8_dropoff
0,0,4404c6835b9e74e9f74d70f235200a8ce09db14a,7e179f8ef66ae99ec2d1ec89224e0b7ee5469fe5627f6d...,2022-12-31 23:45:00,2023-01-01 00:15:00,2081.0,4.42,,,20.5,...,Saturday,December,WEST RIDGE,2,UPTOWN,3,872664d8effffff,872664d89ffffff,882664d8e1fffff,882664d897fffff
1,1,466473fd2a196ebe92fb2983cb7e8af32e39aa1f,d1d88b89ceb6d753007b6e795e3c24f4bea905a51e9d47...,2022-12-31 23:45:00,2023-01-01 00:00:00,812.0,0.0,,,13.84,...,Saturday,December,NEAR NORTH SIDE,8,WEST TOWN,24,872664c1effffff,872664cacffffff,882664c1edfffff,882664cac3fffff
2,2,3f5cd3f78e5cab455606a31372a95d3204b2fb3f,847cf962bd6f62040673e6c24c24940aeb2d7fdaa54677...,2022-12-31 23:45:00,2023-01-01 00:00:00,600.0,0.9,,,7.0,...,Saturday,December,NEAR NORTH SIDE,8,NEAR NORTH SIDE,8,872664c1effffff,872664c1effffff,882664c1edfffff,882664c1edfffff


**Tasks**: Based on the taxi trip patterns, can you identify clusters of trip types and/or customer types? How would you label these clusters? 

**Methods**: Identify clusters with soft-clustering and visualize your results. Compare your results to a hard-clustering method of your choice. You can use additional features like “distance to city center”, expressive hourly resolutions (e.g., “bar hours”, “morning commuting”), or even land-use/POI data.
Furthermore, can you identify spatial hot spots for trip demand using Gaussian Mixture Models (i.e., using Spatial Kernel Density Estimation)?

Following steps are taken for clustering task:
1. Feature Engineering
2. Data Preparation (Scaling)

In [59]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

## Helper functions
# Scaling
def scale_df(X):
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)
    return pd.DataFrame(df_scaled, columns = df_scaled.columns, index = df_scaled.index)

# Feature Selection
def feature_selection_kmeans(df, maxvars=3, kmin=2, kmax=8, cut_off=0.5, random_state=1984):
    """
    Perform feature selection using K-means clustering and silhouette score.
    Returns a tuple of the list of selected feature names, the optimal number of clusters and the cluster assignment itself
    """
    kmeans_kwargs = {
        "init": "random",
        "n_init": 20,
        "max_iter": 1000,
        "random_state": random_state
    }

    cols = list(df.columns)
    results_for_each_k = []
    vars_for_each_k = {}

    for k in range(kmin, kmax + 1):
        selected_variables = []
        while len(selected_variables) < maxvars:
            results = []
            for col in cols:
                scols = selected_variables + [col]
                kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
                kmeans.fit(df[scols])
                results.append(silhouette_score(df[scols], kmeans.predict(df[scols])))
            
            selected_var = cols[np.argmax(results)]
            selected_variables.append(selected_var)
            cols.remove(selected_var)
        
        results_for_each_k.append(max(results))
        vars_for_each_k[k] = selected_variables

    best_k = np.argmax(results_for_each_k) + kmin
    selected_variables = vars_for_each_k[best_k]

    kmeans = KMeans(n_clusters=best_k, **kmeans_kwargs)
    kmeans.fit(df[selected_variables])
    clusters = kmeans.predict(df[selected_variables])

    return selected_variables, best_k, clusters


In [75]:
# Should be included in data prep:
# Encode columns
def encode_and_save_mappings(df, columns, output_prefix='data/trips_mapping_'):
    """
    Encode specified columns and save the mappings to CSV files.
    Returns the df with new encoded columns
    """
    for column in columns:
        # Create a new column with the encoded values
        df[f'{column}_encoded'] = pd.factorize(df[column])[0]
        
        # Create and save the mapping
        mapping = df[[column, f'{column}_encoded']].drop_duplicates().sort_values(f'{column}_encoded')
        mapping.to_csv(f'{output_prefix}{column.lower().replace(" ", "_")}.csv', index=False)
        
        print(f"Mapping for {column} saved to {output_prefix}{column.lower().replace(' ', '_')}.csv")
        print(f"Check for correct mapping: \nNumber of unique entries: {len(df[column].unique())} \nNumber of mappings: {len(df[f'{column}_encoded'].unique())}")
    
    return df

columns_to_encode = ['payment_type', 'company', 'taxi_id']
df_trips = encode_and_save_mappings(df, columns_to_encode)

Mapping for payment_type saved to data/trips_mapping_payment_type.csv
Check for correct mapping: 
Number of unique entries: 8 
Number of mappings: 8
Mapping for company saved to data/trips_mapping_company.csv
Check for correct mapping: 
Number of unique entries: 35 
Number of mappings: 35
Mapping for taxi_id saved to data/trips_mapping_taxi_id.csv
Check for correct mapping: 
Number of unique entries: 2883 
Number of mappings: 2883


In [None]:
# Set new index 
df_trips = df_trips.set_index('taxi_id_encoded', drop=False)

df_trips.head()

Unnamed: 0_level_0,level_0,index,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_census_tract,dropoff_census_tract,fare,tips,...,pickup_area_number,dropoff_community,dropoff_area_number,h3_res7_pickup,h3_res7_dropoff,h3_res8_pickup,h3_res8_dropoff,payment_type_encoded,company_encoded,taxi_id_encoded
taxi_id_encoded,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,2022-12-31 23:45:00,2023-01-01 00:15:00,2081.0,4.42,,,20.5,0.0,...,2,UPTOWN,3,872664d8effffff,872664d89ffffff,882664d8e1fffff,882664d897fffff,0,0,0
1,1,1,2022-12-31 23:45:00,2023-01-01 00:00:00,812.0,0.0,,,13.84,2.73,...,8,WEST TOWN,24,872664c1effffff,872664cacffffff,882664c1edfffff,882664cac3fffff,1,0,1
2,2,2,2022-12-31 23:45:00,2023-01-01 00:00:00,600.0,0.9,,,7.0,2.0,...,8,NEAR NORTH SIDE,8,872664c1effffff,872664c1effffff,882664c1edfffff,882664c1edfffff,2,1,2
3,3,3,2022-12-31 23:45:00,2023-01-01 00:00:00,546.0,0.85,,,6.5,0.0,...,8,NEAR NORTH SIDE,8,872664c1effffff,872664c1effffff,882664c1edfffff,882664c1edfffff,3,2,3
4,4,4,2022-12-31 23:45:00,2023-01-01 00:00:00,574.0,0.33,,,6.25,0.0,...,8,NEAR NORTH SIDE,8,872664c1effffff,872664c1effffff,882664c1edfffff,882664c1edfffff,3,3,4


In [63]:
df_trips.dtypes

Unnamed: 0                      int64
trip_id                        object
taxi_id                        object
trip_start_timestamp           object
trip_end_timestamp             object
trip_seconds                  float64
trip_miles                    float64
pickup_census_tract           float64
dropoff_census_tract          float64
fare                          float64
tips                          float64
tolls                         float64
extras                        float64
trip_total                    float64
payment_type                   object
company                        object
pickup_centroid_latitude      float64
pickup_centroid_longitude     float64
pickup_centroid_location       object
dropoff_centroid_latitude     float64
dropoff_centroid_longitude    float64
dropoff_centroid_location      object
trip_hours                    float64
hour                            int64
4_hour_window                   int64
6_hour_window                   int64
weekday     

In [64]:
df_trips.columns

Index(['Unnamed: 0', 'trip_id', 'taxi_id', 'trip_start_timestamp',
       'trip_end_timestamp', 'trip_seconds', 'trip_miles',
       'pickup_census_tract', 'dropoff_census_tract', 'fare', 'tips', 'tolls',
       'extras', 'trip_total', 'payment_type', 'company',
       'pickup_centroid_latitude', 'pickup_centroid_longitude',
       'pickup_centroid_location', 'dropoff_centroid_latitude',
       'dropoff_centroid_longitude', 'dropoff_centroid_location', 'trip_hours',
       'hour', '4_hour_window', '6_hour_window', 'weekday', 'month',
       'pickup_community', 'pickup_area_number', 'dropoff_community',
       'dropoff_area_number', 'h3_res7_pickup', 'h3_res7_dropoff',
       'h3_res8_pickup', 'h3_res8_dropoff', 'payment_type_encoded',
       'company_encoded', 'taxi_id_encoded'],
      dtype='object')

In [66]:
# Drop columns that are not numerical for further steps
df_trips = df_trips.drop(["Unnamed: 0", "trip_id", "taxi_id"], axis=1)

In [67]:
df_trips.head(3)


Unnamed: 0,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_census_tract,dropoff_census_tract,fare,tips,tolls,extras,...,pickup_area_number,dropoff_community,dropoff_area_number,h3_res7_pickup,h3_res7_dropoff,h3_res8_pickup,h3_res8_dropoff,payment_type_encoded,company_encoded,taxi_id_encoded
0,2022-12-31 23:45:00,2023-01-01 00:15:00,2081.0,4.42,,,20.5,0.0,0.0,0.0,...,2,UPTOWN,3,872664d8effffff,872664d89ffffff,882664d8e1fffff,882664d897fffff,0,0,0
1,2022-12-31 23:45:00,2023-01-01 00:00:00,812.0,0.0,,,13.84,2.73,0.0,0.0,...,8,WEST TOWN,24,872664c1effffff,872664cacffffff,882664c1edfffff,882664cac3fffff,1,0,1
2,2022-12-31 23:45:00,2023-01-01 00:00:00,600.0,0.9,,,7.0,2.0,0.0,3.0,...,8,NEAR NORTH SIDE,8,872664c1effffff,872664c1effffff,882664c1edfffff,882664c1edfffff,2,1,2


In [68]:
df_scaled = scale_df(df_trips)

ValueError: could not convert string to float: '2022-12-31 23:45:00'