In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the data
data = pd.read_csv('C:/Users/marya/OneDrive/Desktop/Classification_project1/preprocessed_green_tripdata_2015-07.csv')

# Discretize the trip price into three categories: low, medium, high
price_quantiles = data['total_amount'].quantile([0.33, 0.67]).values
data['price_category'] = pd.cut(data['total_amount'], bins=[-np.inf, price_quantiles[0], price_quantiles[1], np.inf], labels=[0, 1, 2])

# Select only numeric features
numeric_features = data.select_dtypes(include=[np.number]).drop(['total_amount'], axis=1)

# Handle missing values by filling them with the mean of the column
numeric_features = numeric_features.fillna(numeric_features.mean())

# Standardize the features
scaler = StandardScaler()
features = scaler.fit_transform(numeric_features)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, data['price_category'], test_size=0.2, random_state=42)


In [10]:
class LogisticRegression:
    def __init__(self, learning_rate=0.01, iterations=1000):
        self.learning_rate = learning_rate
        self.iterations = iterations
    
    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    
    def fit(self, X, y):
        self.m, self.n = X.shape
        self.weights = np.zeros(self.n)
        self.bias = 0
        self.y = y
        
        # Gradient descent
        for _ in range(self.iterations):
            model = np.dot(X, self.weights) + self.bias
            predictions = self.sigmoid(model)
            
            dw = (1 / self.m) * np.dot(X.T, (predictions - y))
            db = (1 / self.m) * np.sum(predictions - y)
            
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db
    
    def predict(self, X):
        model = np.dot(X, self.weights) + self.bias
        predictions = self.sigmoid(model)
        return [1 if i > 0.5 else 0 for i in predictions]
    
    def predict_proba(self, X):
        model = np.dot(X, self.weights) + self.bias
        return self.sigmoid(model)


In [11]:
# Convert target variable to binary for logistic regression
# We will do one-vs-rest classification for the multi-class problem

def one_vs_rest_labels(y, target_class):
    return np.where(y == target_class, 1, 0)

# Initialize the logistic regression model
lr_model = LogisticRegression(learning_rate=0.01, iterations=1000)

# Train and evaluate the model for each class
results = []
class_labels = {0: 'Low', 1: 'Medium', 2: 'High'}

for class_label in [0, 1, 2]:
    y_train_binary = one_vs_rest_labels(y_train, class_label)
    y_test_binary = one_vs_rest_labels(y_test, class_label)
    
    lr_model.fit(X_train, y_train_binary)
    y_pred = lr_model.predict(X_test)
    
    accuracy = accuracy_score(y_test_binary, y_pred)
    precision = precision_score(y_test_binary, y_pred)
    recall = recall_score(y_test_binary, y_pred)
    f1 = f1_score(y_test_binary, y_pred)
    
    results.append((class_labels[class_label], accuracy, precision, recall, f1))

# Display the results
results_df = pd.DataFrame(results, columns=['Class', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
print(results_df)


    Class  Accuracy  Precision    Recall  F1 Score
0     Low  0.890533   0.765195  0.993821  0.864650
1  Medium  0.682094   0.071429  0.000051  0.000102
2    High  0.974872   0.972203  0.951156  0.961564
