In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

data = pd.read_csv("C:/Users/marya/OneDrive/Desktop/Classification_project1/preprocessed_green_tripdata_2015-07.csv")

# Display the first few rows to understand the structure
print(data.head())

# Identify and drop non-numeric columns, or convert them if necessary
# For example, drop datetime columns or encode categorical columns
# Assuming the last column is the target and all others are features
X = data.iloc[:, :-1].select_dtypes(include=[np.number])  # Select only numeric columns
y = data.iloc[:, -1]

# Handle missing values by imputing with the mean of the column
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Encode the target variable if it is categorical
if y.dtype == 'O':
    le = LabelEncoder()
    y = le.fit_transform(y)

# Scale features and apply PCA for dimensionality reduction
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

pca = PCA(n_components=15)
X_pca = pca.fit_transform(X_scaled)

# If imbalanced-learn is not available, proceed without resampling
X = pd.DataFrame(X_pca)
y = pd.Series(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=10)


  lpep_pickup_datetime lpep_dropoff_datetime  store_and_fwd_flag  \
0  2015-07-01 00:12:55   2015-07-01 00:16:53                   0   
1  2015-07-01 00:38:35   2015-07-01 00:49:22                   0   
2  2015-07-01 00:59:01   2015-07-01 01:04:10                   0   
3  2015-07-01 00:48:41   2015-07-01 00:54:24                   0   
4  2015-07-01 00:56:28   2015-07-01 01:07:45                   0   

                             pu_location  \
0     Brooklyn,Williamsburg (South Side)   
1     Brooklyn,Williamsburg (South Side)   
2                Brooklyn,Bushwick North   
3                   Brooklyn,Brownsville   
4  Brooklyn,East Flatbush/Remsen Village   

                             do_location  passenger_count  trip_distance  \
0                Brooklyn,Bushwick South              1.0           0.88   
1                       Queens,Ridgewood              1.0           3.00   
2             Brooklyn,East Williamsburg              1.0           1.61   
3  Brooklyn,East Flatb

In [2]:
# Define parameter grid for KNN
params = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

# Cross-validation setup
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=10)
best_accuracy = 0
best_model = None

# Perform grid search with cross-validation
for k in params['n_neighbors']:
    for weight in params['weights']:
        for metric in params['metric']:
            cv_accuracies = []
            for train_idx, val_idx in kfold.split(X_train, y_train):
                model = KNeighborsClassifier(n_neighbors=k, weights=weight, metric=metric)
                model.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
                y_val_pred = model.predict(X_train.iloc[val_idx])
                accuracy = accuracy_score(y_train.iloc[val_idx], y_val_pred)
                cv_accuracies.append(accuracy)
            avg_accuracy = np.mean(cv_accuracies)
            if avg_accuracy > best_accuracy:
                best_accuracy = avg_accuracy
                best_model = {'k': k, 'weight': weight, 'metric': metric}


In [3]:
# Train the best model on the entire training set
final_model = KNeighborsClassifier(n_neighbors=best_model['k'], weights=best_model['weight'], metric=best_model['metric'])
final_model.fit(X_train, y_train)

In [4]:
# Evaluate the model on the test set
y_pred = final_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
recall = recall_score(y_test, y_pred, average='macro')
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1, 2])

# Print the evaluation metrics
print("Testing Accuracy:", accuracy)
print("Testing Precision:", precision)
print("Testing Recall:", recall)
print("Confusion Matrix:")
print(conf_matrix)
print("True Low: ", conf_matrix[0, 0], "False Medium: ", conf_matrix[0, 1], "False High: ", conf_matrix[0, 2])
print("False Low: ", conf_matrix[1, 0], "True Medium: ", conf_matrix[1, 1], "False High: ", conf_matrix[1, 2])
print("False Low: ", conf_matrix[2, 0], "False Medium: ", conf_matrix[2, 1], "True High: ", conf_matrix[2, 2])

Testing Accuracy: 0.7477343023381567
Testing Precision: 0.7409897832207132
Testing Recall: 0.7398702788026204
Confusion Matrix:
[[21172  5676     0]
 [ 5987 13398     0]
 [    0     0     0]]
True Low:  21172 False Medium:  5676 False High:  0
False Low:  5987 True Medium:  13398 False High:  0
False Low:  0 False Medium:  0 True High:  0


In [7]:
class_labels = {0: 'Low', 1: 'Medium', 2: 'High'}
results = []

for class_label in class_labels.keys():
    y_test_binary = (y_test == class_label).astype(int)
    y_pred_binary = (y_pred == class_label).astype(int)
    
    accuracy = accuracy_score(y_test_binary, y_pred_binary)
    precision = precision_score(y_test_binary, y_pred_binary, zero_division=0)
    recall = recall_score(y_test_binary, y_pred_binary, zero_division=0)
   # f1 = f1_score(y_test_binary, y_pred_binary, zero_division=0)
    
    results.append((class_labels[class_label], accuracy, precision, recall))

results_df = pd.DataFrame(results, columns=['Class', 'Accuracy', 'Precision', 'Recall'])
print(results_df)

    Class  Accuracy  Precision    Recall
0     Low  0.747734   0.779557  0.788588
1  Medium  0.747734   0.702422  0.691153
2    High  1.000000   0.000000  0.000000
