In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import ParameterGrid, train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Load the raw trip data from disk.
df = pd.read_parquet("data/raw_data/yellow.parquet")

# Take a small sample of the data. The seed pins the sample so that every
# downstream score is reproducible under Restart Kernel -> Run All.
df = df.sample(frac=0.0004, random_state=42)

print('num data points:', len(df))

# Get the features and target
y = df['DOLocationID']
X = df.drop(columns=['DOLocationID'])

# Drop some columns from X
X = X.drop(columns=['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'store_and_fwd_flag', 'payment_type'])

# Split the data into training and testing sets BEFORE imputing, so the
# imputation statistics never see the held-out rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# handle missing values: medians are learned on the training partition only
# and then applied to both partitions (avoids train/test leakage).
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# The full feature matrix is fitted on directly by the random-forest cell
# (out-of-bag evaluation, no held-out split), so it is imputed over all rows.
X = X.fillna(X.median())

print('num data points in X: ', len(X))
print('num data points in y: ', len(y))

num data points: 24688
num data points in X:  24688
num data points in y:  24688


In [14]:
# Manual grid search over random-forest hyper-parameters, scored by the
# out-of-bag (OOB) accuracy of a forest fitted on the full feature matrix.

# Create a parameter grid
param_grid = {"max_depth": [30,45], 
              "max_features": ['sqrt', 'log2'],
              "n_estimators": [100,200,300]}

bestOOB = 0
best_rf = None
best_params = None

# Fit one forest per parameter combination and keep the best one.
for params in ParameterGrid(param_grid):
    # A NEW estimator is built on every iteration. Re-using a single object
    # (set_params + fit) and storing `best_rf = rf` would only store an alias:
    # the next fit would overwrite it, so `best_rf` would always end up being
    # the LAST model fitted rather than the best-scoring one.
    rf = RandomForestClassifier(oob_score=True, warm_start=False,
                                random_state=42, **params)
    rf.fit(X,y)

    curOOB = rf.oob_score_

    # `best_rf is None` guarantees a model is selected even if every score is 0.
    if best_rf is None or curOOB > bestOOB:
        bestOOB = curOOB
        best_rf = rf
        best_params = params

print("Best OOB Score:",bestOOB)

Best OOB Score: 0.11831659105638367


In [3]:
# Baseline 1: k-nearest-neighbours classifier, evaluated on the held-out split.
knn = KNeighborsClassifier(n_neighbors=5)

# Train the model
knn.fit(X_train, y_train)

# Make predictions on the test set
y_pred = knn.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of KNN: {accuracy:.2f}")

# Baseline 2: scikit-learn gradient boosting (this is NOT an SVM, and not the
# xgboost library either; the `xgb` name is kept so other cells keep working).
# The seed makes the fitted model reproducible between runs.
xgb  = GradientBoostingClassifier(random_state=42)

# Train the model
xgb.fit(X_train, y_train)

# Make predictions (note: this overwrites the KNN predictions in `y_pred`)
y_pred = xgb.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"XGB Accuracy: {accuracy:.2f}")

Accuracy of KNN: 0.10
SVM Accuracy: 0.01


In [15]:
# Rank the feature columns of X by the importances reported by `best_rf`.
# argsort is ascending, so position 0 is the least important feature and
# the last position is the most important one.
indicies = np.argsort(best_rf.feature_importances_)
ranked_columns = X.columns[indicies]

# Three highest-ranked and three lowest-ranked columns (both kept in
# ascending-importance order).
largest = ranked_columns[-3:]
smallest = ranked_columns[:3]

print("(ii): Most important: ", list(largest))
print("      Least important: ", list(smallest))

(ii): Most important:  ['total_amount', 'PULocationID', 'trip_distance']
      Least important:  ['improvement_surcharge', 'airport_fee', 'mta_tax']
