In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
import warnings
import ast

from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.model_selection import (
    train_test_split, KFold, RandomizedSearchCV, cross_val_score)
from sklearn.metrics import (r2_score, mean_squared_error, make_scorer)
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    StandardScaler, OneHotEncoder, LabelEncoder
)
from sklearn.model_selection import ParameterSampler

import itertools
import random

# Configure Notebook
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Apply the 'fivethirtyeight' matplotlib style sheet to all figures.
plt.style.use('fivethirtyeight')
# Use seaborn's "notebook" plotting context.
sns.set_context("notebook")
# NOTE(review): this suppresses every warning category for the whole session,
# including deprecation and convergence warnings - consider narrowing the filter.
warnings.filterwarnings('ignore')

# Feature Engineering

Import the worker dataset

In [2]:
# Import the dataset
# The relative path means the CSV is resolved against the kernel's current working directory.
df_workers = pd.read_csv("worker_data.csv")


The function `categorize_mode()` bins the numeric transportation mode codes into broader categories, based on their definitions in the data dictionary.

In [3]:
def categorize_mode(mode):
    """Map a numeric transportation mode code to a broad mode category.

    Parameters
    ----------
    mode : number
        Mode code as listed in the data dictionary.

    Returns
    -------
    str
        'Active Transportation', 'Auto Driver', 'Auto Passenger' or
        'Transit' when the code belongs to one of those groups,
        otherwise 'Other'.
    """
    # Ordered (codes, label) pairs; membership is tested with ``in`` so a
    # float code such as 5.0 matches the integer 5.
    mode_groups = (
        ((1, 2, 3, 4), 'Active Transportation'),  # Walk, bike, wheelchair/mobility scooter, other non-motorized
        ((5, 8, 10), 'Auto Driver'),              # Auto driver, motorcycle, rental car
        ((6, 7, 9), 'Auto Passenger'),            # Auto passenger, carpool, taxi
        (tuple(range(15, 30)), 'Transit'),        # Transit modes (codes 15 through 29)
    )

    for codes, label in mode_groups:
        if mode in codes:
            return label

    # Everything else
    return 'Other'

The function `temporal_conversion()` combines the travel date and departure time into a single datetime column and derives the numeric day of the week from it.

In [4]:
def temporal_conversion(data):
    """Combine the travel date and departure time into one datetime column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with string columns 'travel_date' and 'dep_time'.

    Returns
    -------
    pandas.DataFrame
        A new frame without 'travel_date' and 'dep_time', with two columns
        appended: 'departure_datetime' (the parsed timestamp) and
        'day_of_week_num' (``Series.dt.dayofweek``, Monday=0 ... Sunday=6).

    Notes
    -----
    The input frame is left untouched. The previous implementation wrote
    'departure_datetime' into the caller's frame before dropping columns.
    """
    # Parse once; the concatenated "date time" string is handed to pandas as-is.
    departure = pd.to_datetime(data['travel_date'] + ' ' + data['dep_time'])

    # drop() and assign() both return new frames, so `data` is never mutated.
    return (
        data
        .drop(columns=['travel_date', 'dep_time'])
        .assign(
            departure_datetime=departure,
            day_of_week_num=departure.dt.dayofweek,
        )
    )

The `workday_range` column in the dataset is a list stored in string form. Hence, `convert_list_to_int_or_nan()` checks the contents of the converted list and returns the values as integers (or NaN for missing entries) for use in the workday analysis.

In [5]:
# Function to handle 'nan' and numbers in a string list
def convert_list_to_int_or_nan(x):
    """Convert a list (or its string form) of day codes to ints, with NaN for gaps.

    Parameters
    ----------
    x : str, iterable, or scalar
        Either an iterable of values, a string such as "['1.0', '8.0']" that
        encodes one, or a single scalar (for example a missing cell).

    Returns
    -------
    list
        One entry per input value: an ``int`` for finite numbers and numeric
        strings, ``np.nan`` for 'nan', NaN/inf, unparseable strings and
        non-numeric values.
    """
    if isinstance(x, str):
        try:
            # Convert string to list using ast.literal_eval
            x = ast.literal_eval(x)
        except (ValueError, SyntaxError):
            # A bare ``nan`` (e.g. "[1.0, nan]") is not a Python literal, so
            # fall back to splitting the bracketed text on commas.
            tokens = (tok.strip().strip("'\"") for tok in x.strip().strip("[]").split(","))
            x = [tok for tok in tokens if tok]

    # A scalar (e.g. a missing cell holding float NaN or None) is treated as
    # a one-element list instead of failing in the loop below.
    if not hasattr(x, '__iter__'):
        x = [x]

    result = []
    for val in x:
        if isinstance(val, str):
            try:
                # float() accepts '5.0', ' 6 ' and any spelling of 'nan'
                val = float(val)
            except ValueError:
                result.append(np.nan)  # Empty or non-numeric string
                continue

        if isinstance(val, (int, np.integer)):
            result.append(int(val))
        elif isinstance(val, (float, np.floating)) and np.isfinite(val):
            result.append(int(val))
        else:
            # NaN, +/-inf and non-numeric values: int() would raise on these
            result.append(np.nan)

    return result

The function `check_workday()` compares the day of the week of the trip with the days the individual works, to determine whether the trip falls on a workday (1) or not (2).

In [6]:
def check_workday(travel_date, workday_range):
    """Flag whether a trip's weekday is one of the person's workdays.

    Parameters
    ----------
    travel_date : int
        Day of week of the trip, 0-6 with Monday=0 and Sunday=6.
    workday_range : container of int
        Workday codes: 1-7 for Monday-Sunday, 8 for "Monday to Friday".

    Returns
    -------
    int
        1 if the trip falls on a workday, 2 otherwise.
    """
    # NOTE(review): when code 8 is present it takes precedence, so a weekend
    # day listed alongside 8 is still reported as a non-workday - confirm
    # against the data dictionary that this is intended.
    if 8 in workday_range:
        # Monday-to-Friday schedule: only Saturday (5) and Sunday (6) are off
        on_workday = travel_date not in (5, 6)
    else:
        # Shift 0-6 to the 1-7 coding used by workday_range
        on_workday = (travel_date + 1) in workday_range

    return 1 if on_workday else 2

`feature_engineering()` bundles all the feature engineering performed to transform the given features into ones that help develop the model and improve its accuracy. Not all of these variables will be used in the final model; that depends on how well they perform in the model at predicting the transportation mode category.

In [7]:
def feature_engineering(data):
    """Derive the modelling features from the raw worker trip data.

    Works on a copy of ``data`` and performs, in order:

    1. replaces the numeric 'mode_category' codes with the labels returned
       by ``categorize_mode``;
    2. runs ``temporal_conversion`` to obtain 'departure_datetime' and
       'day_of_week_num';
    3. parses the string-encoded 'workday_range' column into lists of ints
       (NaN for missing entries);
    4. adds 'is_workday' (1 = workday, 2 = not) via ``check_workday``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame containing at least 'mode_category', 'travel_date',
        'dep_time' and 'workday_range'.

    Returns
    -------
    pandas.DataFrame
        The transformed copy.
    """
    data_copy = data.copy()

    # Replace the numeric mode codes with their category labels
    data_copy['mode_category'] = data_copy['mode_category'].astype(int).apply(categorize_mode)

    data_copy = temporal_conversion(data_copy)

    # Turn the bracketed string form of workday_range into a list of string
    # tokens, then convert each token to an int (or NaN).
    workday_tokens = (
        data_copy['workday_range']
        .str.strip("[]")
        .str.replace("'", "")
        .str.split(", ")
    )
    data_copy['workday_range'] = workday_tokens.apply(convert_list_to_int_or_nan)

    # Pair the two columns directly instead of DataFrame.apply(axis=1), which
    # builds a Series for every row and is much slower on large frames.
    data_copy['is_workday'] = [
        check_workday(day_num, workdays)
        for day_num, workdays in zip(data_copy['day_of_week_num'], data_copy['workday_range'])
    ]

    return data_copy

Apply the features to the dataset.

In [8]:
# Build the engineered feature table; feature_engineering() works on a copy, so df_workers is left as loaded.
workers_features = feature_engineering(df_workers)

In [9]:
# Display the engineered frame to check the new columns (departure_datetime, day_of_week_num, is_workday).
workers_features

Unnamed: 0,tottr,hhmem,mode_category,duration_min,trip_distance_miles,arr_time,city_from_zip,gender,age,citizen,...,body_type,fuel_type1,purchase_type,ownership,transmission,cylinders,veh_type,departure_datetime,day_of_week_num,is_workday
0,3.0,2.0,Auto Driver,20.0,3.619057,08:35:00,Greater Los Angeles Area,1.0,46.0,1.0,...,8.0,1.0,1.0,1.0,1.0,4.0,2.0,2012-05-12 08:15:00,5,2
1,5.0,4.0,Auto Driver,20.0,3.587950,15:50:00,Greater Los Angeles Area,1.0,46.0,1.0,...,8.0,1.0,1.0,1.0,1.0,4.0,2.0,2012-05-12 15:30:00,5,2
2,2.0,1.0,Auto Driver,25.0,19.351621,16:55:00,Greater Los Angeles Area,1.0,46.0,1.0,...,8.0,1.0,1.0,1.0,1.0,4.0,2.0,2012-05-12 16:30:00,5,2
3,2.0,1.0,Auto Driver,15.0,6.451126,17:55:00,Greater Los Angeles Area,1.0,46.0,1.0,...,8.0,1.0,1.0,1.0,1.0,4.0,2.0,2012-05-12 17:40:00,5,2
4,2.0,1.0,Auto Driver,20.0,17.657172,19:00:00,Greater Los Angeles Area,1.0,46.0,1.0,...,8.0,1.0,1.0,1.0,1.0,4.0,2.0,2012-05-12 18:40:00,5,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95112,1.0,0.0,Auto Driver,4.0,0.181507,16:02:00,San Francisco Bay Area,1.0,45.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2013-01-30 15:58:00,2,1
95113,1.0,0.0,Auto Driver,14.0,2.674467,16:47:00,San Francisco Bay Area,1.0,45.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2013-01-30 16:33:00,2,1
95114,1.0,0.0,Active Transportation,17.0,2.621383,17:25:00,San Francisco Bay Area,1.0,45.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2013-01-30 17:08:00,2,1
95115,1.0,0.0,Active Transportation,94.0,10.356760,19:17:00,San Francisco Bay Area,1.0,45.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2013-01-30 17:43:00,2,1


# Feature Selection

In [None]:
def selected_feature(data):
    """Return only the columns used as model inputs.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column listed below.

    Returns
    -------
    pandas.DataFrame
        ``data`` restricted to the selected feature columns, in the order
        listed. Raises ``KeyError`` if any of them is missing.
    """
    model_columns = [
        'tottr', 'gender', 'race', 'age', 'citizen',
        'incentive', 'vehicle_count', 'income', 'worker_count', 'is_workday',
        'workday_count', 'driver_license', 'job_count',
        'apparent_temperature_mean (°C)', 'WMO_code',
    ]
    return data[model_columns]

# Model Training

In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from collections import Counter

# NOTE(review): `df_filtered` was used here without being defined in any
# earlier cell, so the notebook failed with a NameError on a fresh kernel.
# It is rebuilt from `workers_features` by dropping the 'Other' mode, which is
# presumably what the original filter did: the classification report further
# down lists only the four remaining classes. Confirm that no additional
# filtering (e.g. dropping rows with missing features) was applied.
df_filtered = workers_features[workers_features['mode_category'] != 'Other']

# The target is 'mode_category'; both splits are stratified on it so every
# subset keeps the same class proportions.

# Step 1: Split the data into train (70%) and temp (30%)
train_df, temp_df = train_test_split(df_filtered, test_size=0.3, stratify=df_filtered['mode_category'], random_state=42)

# Step 2: Split the temp set into validation (15%) and test (15%)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['mode_category'], random_state=42)

# Step 3: Separate features (X) and target (y) for the training set
X_train = selected_feature(train_df)
y_train = train_df['mode_category']

# Step 4: Apply SMOTE to the training set only, so the validation and test
# sets keep the original class distribution
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Step 5: Print class distribution before and after SMOTE
print("Class distribution before SMOTE:", Counter(y_train))
print("Class distribution after SMOTE:", Counter(y_train_resampled))

# Step 6: Separate features and target for validation and test sets
X_val = selected_feature(val_df)
y_val = val_df['mode_category']

X_test = selected_feature(test_df)
y_test = test_df['mode_category']

# Outputs
print(f"Training set size: {X_train_resampled.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")


In [None]:
# RandomForestClassifier is not among the notebook's top-level imports (only
# RandomForestRegressor is), so import it here to avoid a NameError.
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)

# Fit on the SMOTE-resampled training data
model.fit(X_train_resampled, y_train_resampled)

In [None]:
# Predict the mode category for the validation features (never resampled by SMOTE).
y_pred = model.predict(X_val)

# Evaluate Performance of the Model

In [None]:
# Disabled: overall accuracy on the validation set.
# NOTE(review): re-enabling this requires importing `accuracy_score` from
# sklearn.metrics; the notebook's import cell does not bring it in.
# accuracy = accuracy_score(y_val, y_pred)

# print(f"Accuracy of the Random Forest Classifier: {accuracy * 100:.2f}%")

In [None]:
# Disabled: 5-fold cross-validated accuracy on the training data.
# NOTE(review): the inputs are the SMOTE-resampled arrays, so synthetic samples
# would end up in the held-out folds and the score would likely be optimistic.
# If this is re-enabled, consider resampling inside each fold instead.
# from sklearn.model_selection import cross_val_score

# # Perform cross-validation
# cv_scores = cross_val_score(model, X_train_resampled, y_train_resampled, cv=5)

# print(f"Cross-Validated Accuracy: {cv_scores.mean() * 100:.2f}%")

In [None]:
from sklearn.metrics import classification_report, f1_score

# Step 1: Check class distribution of the validation labels
unique, counts = np.unique(y_val, return_counts=True)
# Plain str/int keys and values keep the printed dict readable
class_distribution = {str(label): int(count) for label, count in zip(unique, counts)}
print("Class Distribution:", class_distribution)

# Step 2: Generate classification report
# The labels are already descriptive strings, so the report rows are named
# from the data itself. A hard-coded `target_names` list raises when the
# number of classes present differs from the list length, and would silently
# mislabel rows if the sorted label order ever changed.
report = classification_report(y_val, y_pred, digits=3)
print("\nClassification Report:\n", report)

# Step 3: Calculate weighted F1-score (per-class F1 weighted by class support)
weighted_f1 = f1_score(y_val, y_pred, average='weighted')
print("\nWeighted F1-Score:", weighted_f1)

In [None]:
# Feature importances
importances = model.feature_importances_
feature_names = X_train.columns
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
})

# Sort features by importance
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display the top 10 features
print("\nTop 10 Important Features:")
print(feature_importance_df.head(10))

# Plot the top 10 feature importances
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(10))
plt.title('Top 10 Feature Importances')
plt.tight_layout()
plt.show()

In [None]:
# Show the hyperparameters of the fitted model (library defaults apart from random_state=42).
model.get_params()