In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

In [2]:
# Function to calculate entropy
def entropy(y):
    counter = Counter(y)
    total = len(y)
    return -sum((count / total) * np.log2(count / total) for count in counter.values())

# Function to calculate information gain
def information_gain(X, y, feature):
    total_entropy = entropy(y)
    values, counts = np.unique(X[:, feature], return_counts=True)
    weighted_entropy = sum((counts[i] / np.sum(counts)) * entropy(y[X[:, feature] == values[i]]) for i in range(len(values)))
    return total_entropy - weighted_entropy

# Function to find the best feature to split on
def best_feature_to_split(X, y):
    gains = [information_gain(X, y, feature) for feature in range(X.shape[1])]
    return np.argmax(gains)

# ID3 Algorithm to build the decision tree
def id3(X, y, features):
    if len(np.unique(y)) == 1:
        return np.unique(y)[0]
    if len(features) == 0:
        return Counter(y).most_common(1)[0][0]
    
    best_feature = best_feature_to_split(X, y)
    tree = {best_feature: {}}
    
    unique_values = np.unique(X[:, best_feature])
    for value in unique_values:
        sub_X = X[X[:, best_feature] == value]
        sub_y = y[X[:, best_feature] == value]
        sub_features = [i for i in features if i != best_feature]
        subtree = id3(sub_X, sub_y, sub_features)
        tree[best_feature][value] = subtree
        
    return tree

# Function to predict using the decision tree
def predict(tree, sample):
    if not isinstance(tree, dict):
        return tree
    feature = list(tree.keys())[0]
    value = sample[feature]
    if value in tree[feature]:
        return predict(tree[feature][value], sample)
    else:
        return Counter(tree[feature].values()).most_common(1)[0][0]

# Function to predict for the entire dataset
def predict_all(tree, X):
    return [predict(tree, sample) for sample in X]


In [3]:
data = pd.read_csv('C:/Users/marya/OneDrive/Desktop/Classification_project1/preprocessed_green_tripdata_2015-07.csv')

# Discretize the trip price into three categories: low, medium, high
price_quantiles = data['total_amount'].quantile([0.33, 0.67]).values
data['price_category'] = pd.cut(data['total_amount'], bins=[-np.inf, price_quantiles[0], price_quantiles[1], np.inf], labels=[0, 1, 2])

# Select only numeric features
numeric_features = data.select_dtypes(include=[np.number]).drop(['total_amount'], axis=1)

# Handle missing values by filling them with the mean of the column
numeric_features = numeric_features.fillna(numeric_features.mean())

# Scale features and apply PCA for dimensionality reduction
scaler = StandardScaler()
X_scaled = scaler.fit_transform(numeric_features)

pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_scaled)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, data['price_category'], test_size=0.2, stratify=data['price_category'], random_state=42)


In [4]:
# Train the decision tree
features = list(range(X_train.shape[1]))
tree = id3(X_train, y_train, features)

In [5]:
# Predict on the test data
y_pred = predict_all(tree, X_test)

In [8]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
recall = recall_score(y_test, y_pred, average='macro')
#f1 = f1_score(y_test, y_pred, average='macro')
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the evaluation metrics
print("Decision Tree Testing Accuracy:", accuracy)
print("Decision Tree Testing Precision:", precision)
print("Decision Tree Testing Recall:", recall)
#print("Decision Tree Testing F1 Score:", f1)

print("Confusion Matrix:")
print(conf_matrix)
print("True Low: ", conf_matrix[0, 0], "False Medium: ", conf_matrix[0, 1], "False High: ", conf_matrix[0, 2])
print("False Low: ", conf_matrix[1, 0], "True Medium: ", conf_matrix[1, 1], "False High: ", conf_matrix[1, 2])
print("False Low: ", conf_matrix[2, 0], "False Medium: ", conf_matrix[2, 1], "True High: ", conf_matrix[2, 2])

Decision Tree Testing Accuracy: 0.3535299461423658
Decision Tree Testing Precision: 0.7844470407511981
Decision Tree Testing Recall: 0.3336291659679325
Confusion Matrix:
[[21775     0     0]
 [19541     1     0]
 [20310     0    17]]
True Low:  21775 False Medium:  0 False High:  0
False Low:  19541 True Medium:  1 False High:  0
False Low:  20310 False Medium:  0 True High:  17
