# Machine Learning Assignment 2
## Murtaza Hussain (29449) and Muhammad Asad ur Rehman (29456)

In [77]:
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, classification_report, r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif, RFE, chi2, SequentialFeatureSelector
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from ucimlrepo import fetch_ucirepo
from lazypredict.Supervised import LazyClassifier
from IPython.display import display
import statsmodels.api as sm

pd.options.display.float_format = '{:,.4f}'.format

In [78]:
# Data Loader loads data from UCI-ML Repo
def load_datasets():
    # Classification Datasets
    c_cancer = pd.read_csv("./Classification.CancerMB.csv").iloc[:, :32]
    # Regression Datasets
    r_life_expectancy = pd.read_csv("./Regression.Life.Expectancy.csv")
    
    return c_cancer, r_life_expectancy

In [79]:
# This function performs a missing value analysis on each column of the dataset, helps you decide on what to do in cleaning process
def null_check(df):
    null_columns = []
    for column in df.columns:
        print("Column Name:", column)
        print("Column DataType:", df[column].dtype)
        if df[column].dtype != 'float64' and df[column].dtype != 'int64':
            print("Column unique values:", df[column].unique())
        print("Column has null:", df[column].isnull().any())

        
        if df[column].isnull().any() == True:
            print("Column Null Count:", df[column].isnull().sum())
            null_columns.append(column)
        print("\n")
    return null_columns

In [80]:
# This function drops any null columns and missing values
# This is where you decide whether to remove NULL rows (which will reduce the size of Dataset) or remove NULL columns entirely. You can also choose a combination of both.
def clean_data(df, drop_columns, missing_value = False):
    # Remove unnecessary columns
    df.drop(drop_columns, axis=1, inplace=True)
    # Drop rows with any missing values
    if missing_value == False:
        df.dropna(inplace=True)
    else:
        df.fillna(missing_value, inplace=True)
    return df

In [81]:
# Transforms categorical and numberical data into numerical data
def transform_data(df):
    # Encode categorical variables
    label_encoder = LabelEncoder()
    # print("Categorical columns:", df.select_dtypes(include=['object']).columns)
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = label_encoder.fit_transform(df[col])
    
    # Standardize numerical features
    scaler = StandardScaler()
    # print("Numerical columns:", df.select_dtypes(include=['float64', 'int64']).columns)
    numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
    if len(numerical_cols) > 0:
        df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
    return df

In [82]:
def evaluate_classifiers(dataset, target_name, classifiers, test_size=0.2, random_state=None):
    
    # Extract features and target variable
    X = dataset.drop(columns=[target_name])
    y = dataset[target_name]
    X = X.values  # Convert DataFrame to numpy array
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Initialize results dictionary
    results = {}

    # Train and evaluate each classifier
    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred_prob = clf.predict_proba(X_test)[:, 1]  # Probabilities of positive class
        y_pred = clf.predict(X_test)
        
        f1_positive = f1_score(y_test, y_pred, pos_label=1)
        f1_negative = f1_score(y_test, y_pred, pos_label=0)
        auc = roc_auc_score(y_test, y_pred_prob)
        accuracy = accuracy_score(y_test, y_pred)
        
        results[name] = {"F1 Score (Positive)": f1_positive,
                         "F1 Score (Negative)": f1_negative,
                         "AUC": auc,
                         "Accuracy": accuracy}

    return results

In [83]:
# Loading all Datasets into the required variables
c_cancer, r_life_expectancy = load_datasets()

In [61]:
# Initialize classifiers
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Support Vector Machine": SVC(probability=True),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Gradient Boosting": GradientBoostingClassifier()
}

In [62]:
c_cancer
null_check(c_cancer)
print(c_cancer.info())

Column Name: id
Column DataType: int64
Column has null: False


Column Name: diagnosis
Column DataType: object
Column unique values: ['M' 'B']
Column has null: False


Column Name: radius_mean
Column DataType: float64
Column has null: False


Column Name: texture_mean
Column DataType: float64
Column has null: False


Column Name: perimeter_mean
Column DataType: float64
Column has null: False


Column Name: area_mean
Column DataType: float64
Column has null: False


Column Name: smoothness_mean
Column DataType: float64
Column has null: False


Column Name: compactness_mean
Column DataType: float64
Column has null: False


Column Name: concavity_mean
Column DataType: float64
Column has null: False


Column Name: concave points_mean
Column DataType: float64
Column has null: False


Column Name: symmetry_mean
Column DataType: float64
Column has null: False


Column Name: fractal_dimension_mean
Column DataType: float64
Column has null: False


Column Name: radius_se
Column DataType: float64

In [63]:
c_cancer = transform_data(c_cancer)
results = evaluate_classifiers(c_cancer, "diagnosis", classifiers, test_size=0.2, random_state=42)
for name, metrics in results.items():
    print(f"Results for {name}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value}")
    print("\n")

Results for Logistic Regression:
F1 Score (Positive): 0.9647058823529412
F1 Score (Negative): 0.979020979020979
AUC: 0.99737962659679
Accuracy: 0.9736842105263158


Results for Decision Tree:
F1 Score (Positive): 0.9176470588235294
F1 Score (Negative): 0.951048951048951
AUC: 0.9323616115296429
Accuracy: 0.9385964912280702


Results for Random Forest:
F1 Score (Positive): 0.9523809523809524
F1 Score (Negative): 0.9722222222222222
AUC: 0.9962332132328857
Accuracy: 0.9649122807017544


Results for Support Vector Machine:
F1 Score (Positive): 0.9761904761904763
F1 Score (Negative): 0.9861111111111112
AUC: 0.9970520799213888
Accuracy: 0.9824561403508771


Results for K-Nearest Neighbors:
F1 Score (Positive): 0.9302325581395349
F1 Score (Negative): 0.9577464788732394
AUC: 0.9811660661644284
Accuracy: 0.9473684210526315


Results for Gradient Boosting:
F1 Score (Positive): 0.9302325581395349
F1 Score (Negative): 0.9577464788732394
AUC: 0.9957418932197838
Accuracy: 0.9473684210526315




In [91]:
def evaluate_regressors(dataset, target_name, regressors, test_size=0.2, random_state=None):
    
    # Extract features and target variable
    X = dataset.drop(columns=[target_name])
    y = dataset[target_name]
    X = X.values  # Convert DataFrame to numpy array
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    # Initialize results dictionary
    results = {}

    # Train and evaluate each regressor
    for name, reg in regressors.items():
        y_pred = reg.predict(X_test)

        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        r_squared = r2_score(y_test, y_pred)
        
        # OLS Regression specific metrics
        if name == 'OLS Regression':
            X_train = sm.add_constant(X_train)
            ols_model = sm.OLS(y_train, X_train).fit()
            p_value = ols_model.pvalues[-1]
            aic = ols_model.aic
            bic = ols_model.bic
        else:
            p_value = None
            aic = None
            bic = None

        results[name] = {"MAE": mae,
                         "MSE": mse,
                         "RMSE": rmse,
                         "R-Squared": r_squared,
                         "AIC": aic,
                         "BIC": bic,
                         "p-value": p_value}

    return results

In [98]:
r_life_expectancy
# Checking for Null Values
null_check(r_life_expectancy)
# Null Values present hence Removing the data
clean_data(r_life_expectancy,[])
print(c_cancer.info())
# Encoding Categorical Data to Numerical
r_life_expectancy = transform_data(r_life_expectancy)

Column Name: Country
Column DataType: int32
Column unique values: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132]
Column has null: False


Column Name: Year
Column DataType: float64
Column has null: False


Column Name: Status
Column DataType: int32
Column unique values: [1 0]
Column has null: False


Column Name: Life expectancy 
Column DataType: float64
Column has null: False


Column Name: Adult Mortality
Column DataType: float64
Column has null: False


Column Name: infant deaths
Column DataType: f

In [97]:
# Define the target variable and features
target_variable = 'Life expectancy '
X = r_life_expectancy.drop(columns=[target_variable])
y = r_life_expectancy[target_variable]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Add constant term to the training and test set
X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)

# Initialize regressors
regressors = {
    "Linear Regression": LinearRegression(),
    "KNN Regressor": KNeighborsRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "Gradient Boosting Regressor": GradientBoostingRegressor(),
}

# Train and evaluate each regressor
results = {}
for name, reg in regressors.items():
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)

    # Calculate evaluation metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r_squared = r2_score(y_test, y_pred)
    
    # For OLS Regression, use statsmodels to get additional metrics
    if name == 'Linear Regression':
        ols_model = sm.OLS(y_train, X_train).fit()
        p_value = ols_model.pvalues[-1]
        aic = ols_model.aic
        bic = ols_model.bic
    else:
        p_value = None
        aic = None
        bic = None

    # Store results
    results[name] = {
        "MAE": mae,
        "MSE": mse,
        "RMSE": rmse,
        "R-Squared": r_squared,
        "AIC": aic,
        "BIC": bic,
        "p-value": p_value
    }

# Print results
for name, metrics in results.items():
    print(f"Results for {name}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value}")
    print()


Results for Linear Regression:
MAE: 0.3128057590533043
MSE: 0.16841453029770723
RMSE: 0.4103833942762636
R-Squared: 0.8166110954278785
AIC: 1374.3614610179447
BIC: 1488.4233023777256
p-value: 9.344671485640795e-46

Results for KNN Regressor:
MAE: 0.19690739105029
MSE: 0.0912881274389576
RMSE: 0.3021392517349535
R-Squared: 0.9005950991171776
AIC: None
BIC: None
p-value: None

Results for Random Forest Regressor:
MAE: 0.1254709167604261
MSE: 0.046390407824902995
RMSE: 0.2153843258570665
R-Squared: 0.9494848451696881
AIC: None
BIC: None
p-value: None

Results for Gradient Boosting Regressor:
MAE: 0.16730344501262098
MSE: 0.06009430880290684
RMSE: 0.2451414057292379
R-Squared: 0.9345624784102496
AIC: None
BIC: None
p-value: None

