# Machine Learning Excel Assignment 
## Murtaza Hussain (29449) and Muhammad Asad ur Rehman (29456)

In [1]:
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, classification_report, r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif, RFE, chi2, SequentialFeatureSelector
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from ucimlrepo import fetch_ucirepo
from lazypredict.Supervised import LazyClassifier
from IPython.display import display
import statsmodels.api as sm

pd.options.display.float_format = '{:,.4f}'.format

In [2]:
# Data Loader loads data from UCI-ML Repo
def load_datasets():
    # Classification Datasets
    c_cancer = pd.read_csv("./Classification.CancerMB.csv").iloc[:, :32]
    # Regression Datasets
    r_life_expectancy = pd.read_csv("./Regression.Life.Expectancy.csv")
    
    return c_cancer, r_life_expectancy

In [3]:
# This function performs a missing value analysis on each column of the dataset, helps you decide on what to do in cleaning process
def null_check(df):
    null_columns = []
    for column in df.columns:
        print("Column Name:", column)
        print("Column DataType:", df[column].dtype)
        if df[column].dtype != 'float64' and df[column].dtype != 'int64':
            print("Column unique values:", df[column].unique())
        print("Column has null:", df[column].isnull().any())

        
        if df[column].isnull().any() == True:
            print("Column Null Count:", df[column].isnull().sum())
            null_columns.append(column)
        print("\n")
    return null_columns

In [4]:
# This function drops any null columns and missing values
# This is where you decide whether to remove NULL rows (which will reduce the size of Dataset) or remove NULL columns entirely. You can also choose a combination of both.
def clean_data(df, drop_columns, missing_value = False):
    # Remove unnecessary columns
    df.drop(drop_columns, axis=1, inplace=True)
    # Drop rows with any missing values
    if missing_value == False:
        df.dropna(inplace=True)
    else:
        df.fillna(missing_value, inplace=True)
    return df

In [5]:
# Transforms categorical and numberical data into numerical data
def transform_data(df):
    # Encode categorical variables
    label_encoder = LabelEncoder()
    # print("Categorical columns:", df.select_dtypes(include=['object']).columns)
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = label_encoder.fit_transform(df[col])
    
    # Standardize numerical features
    scaler = StandardScaler()
    # print("Numerical columns:", df.select_dtypes(include=['float64', 'int64']).columns)
    numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
    if len(numerical_cols) > 0:
        df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
    return df

# Classificiation Algorithms

In [6]:
def KNNScore(data, target_col, cv_splits):
    # Split dataset into features and target variable
    X = data.drop(target_col, axis=1)
    y = data[target_col]

    # Initialize KFold with different values
    results = {}

    for cv in cv_splits:
        k_fold = KFold(n_splits=cv, shuffle=True, random_state=42)

        # Lists to store scores
        knn_f1_positive_scores = []
        knn_f1_negative_scores = []
        knn_auc_scores = []
        knn_accuracy_scores = []

        for train_index, test_index in k_fold.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Standardize features by removing the mean and scaling to unit variance
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            # KNN Classifier
            knn_classifier = KNeighborsClassifier(n_neighbors=5)
            knn_classifier.fit(X_train_scaled, y_train)
            knn_y_pred = knn_classifier.predict(X_test_scaled)

            # Calculate evaluation metrics for KNN
            knn_f1_positive_scores.append(f1_score(y_test, knn_y_pred, pos_label=1))
            knn_f1_negative_scores.append(f1_score(y_test, knn_y_pred, pos_label=0))
            knn_auc_scores.append(roc_auc_score(y_test, knn_y_pred))
            knn_accuracy_scores.append(accuracy_score(y_test, knn_y_pred))

        # Calculate average scores
        avg_f1_positive_score = sum(knn_f1_positive_scores) / len(knn_f1_positive_scores)
        avg_f1_negative_score = sum(knn_f1_negative_scores) / len(knn_f1_negative_scores)
        avg_auc_score = sum(knn_auc_scores) / len(knn_auc_scores)
        avg_accuracy = sum(knn_accuracy_scores) / len(knn_accuracy_scores)

        # Store results
        results[f'CV_{cv}'] = {
            'F1 Score (Positive)': avg_f1_positive_score,
            'F1 Score (Negative)': avg_f1_negative_score,
            'AUC Score': avg_auc_score,
            'Accuracy': avg_accuracy
        }

    return pd.DataFrame.from_dict(results, orient='index', columns=['F1 Score (Positive)', 'F1 Score (Negative)', 'AUC Score', 'Accuracy'])

In [7]:
def LogisticRegressionScore(data, target_col, cv_splits):
    # Split dataset into features and target variable
    X = data.drop(target_col, axis=1)
    y = data[target_col]

    # Initialize dictionary to store results
    results = {}

    for cv in cv_splits:
        # Initialize KFold
        k_fold = KFold(n_splits=cv, shuffle=True, random_state=42)

        # Lists to store scores
        log_reg_f1_positive_scores = []
        log_reg_f1_negative_scores = []
        log_reg_auc_scores = []
        log_reg_accuracy_scores = []

        for train_index, test_index in k_fold.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Standardize features by removing the mean and scaling to unit variance
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            # Logistic Regression
            log_reg_classifier = LogisticRegression()
            log_reg_classifier.fit(X_train_scaled, y_train)
            log_reg_y_pred = log_reg_classifier.predict(X_test_scaled)

            # Calculate evaluation metrics for Logistic Regression
            log_reg_f1_positive_scores.append(f1_score(y_test, log_reg_y_pred, pos_label=1))
            log_reg_f1_negative_scores.append(f1_score(y_test, log_reg_y_pred, pos_label=0))
            log_reg_auc_scores.append(roc_auc_score(y_test, log_reg_y_pred))
            log_reg_accuracy_scores.append(accuracy_score(y_test, log_reg_y_pred))

        # Calculate average scores
        avg_f1_positive_score = sum(log_reg_f1_positive_scores) / len(log_reg_f1_positive_scores)
        avg_f1_negative_score = sum(log_reg_f1_negative_scores) / len(log_reg_f1_negative_scores)
        avg_auc_score = sum(log_reg_auc_scores) / len(log_reg_auc_scores)
        avg_accuracy = sum(log_reg_accuracy_scores) / len(log_reg_accuracy_scores)

        # Store results
        results[f'CV_{cv}'] = {
            'F1 Score (Positive)': avg_f1_positive_score,
            'F1 Score (Negative)': avg_f1_negative_score,
            'AUC Score': avg_auc_score,
            'Accuracy': avg_accuracy
        }

    return pd.DataFrame.from_dict(results, orient='index', columns=['F1 Score (Positive)', 'F1 Score (Negative)', 'AUC Score', 'Accuracy'])


# Regression Algorithms

In [8]:
def KNNRegressionMetrics(data, target_col, cv_splits):
    # Split dataset into features and target variable
    X = data.drop(target_col, axis=1)
    y = data[target_col]
    
    # Define the number of neighbors for KNN
    n_neighbors = 5

    results = {}

    for cv in cv_splits:
        # Initialize lists to store scores for each CV split
        mae_scores = []
        mse_scores = []
        rmse_scores = []
        r_squared_scores = []
        aic_scores = []
        bic_scores = []
        p_values_list = []

        # Initialize KFold
        k_fold = KFold(n_splits=cv, shuffle=True, random_state=42)

        for train_index, test_index in k_fold.split(X):
            # Reset scores for each fold within a CV split
            fold_mae = []
            fold_mse = []
            fold_rmse = []
            fold_r_squared = []
            fold_aic = []
            fold_bic = []
            fold_p_values = []

            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Standardize features by removing the mean and scaling to unit variance
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            # KNN Regressor
            knn_regressor = KNeighborsRegressor(n_neighbors=n_neighbors)
            knn_regressor.fit(X_train_scaled, y_train)
            y_pred = knn_regressor.predict(X_test_scaled)

            # Calculate evaluation metrics for this fold
            fold_mae.append(mean_absolute_error(y_test, y_pred))
            fold_mse.append(mean_squared_error(y_test, y_pred))
            fold_rmse.append(mean_squared_error(y_test, y_pred, squared=False))
            fold_r_squared.append(r2_score(y_test, y_pred))

            # AIC, BIC, and p-values using OLS regression
            X_train_sm = sm.add_constant(X_train)
            ols_model = sm.OLS(y_train, X_train_sm)
            ols_results = ols_model.fit()
            fold_aic.append(ols_results.aic)
            fold_bic.append(ols_results.bic)
            fold_p_values.append(ols_results.pvalues.drop('const'))  # Exclude p-value of intercept

            # Append scores for this fold
            mae_scores.append(fold_mae)
            mse_scores.append(fold_mse)
            rmse_scores.append(fold_rmse)
            r_squared_scores.append(fold_r_squared)
            aic_scores.append(fold_aic)
            bic_scores.append(fold_bic)
            p_values_list.append(fold_p_values)

        # Calculate average scores across all folds for this CV split
        avg_mae = np.mean(mae_scores)
        avg_mse = np.mean(mse_scores)
        avg_rmse = np.mean(rmse_scores)
        avg_r_squared = np.mean(r_squared_scores)
        avg_aic = np.mean(aic_scores)
        avg_bic = np.mean(bic_scores)
        avg_p_values = np.mean(p_values_list, axis=0)  # Take the mean along the columns

        # Store results
        results[f'CV_{cv}'] = {
            'MAE': avg_mae,
            'MSE': avg_mse,
            'RMSE': avg_rmse,
            'R-Squared': avg_r_squared,
            'AIC': avg_aic,
            'BIC': avg_bic,
            'P-Values': avg_p_values
        }

    return pd.DataFrame.from_dict(results, orient='index')

In [9]:
def OLSRegressionMetrics(data, target_col, cv_splits):
    # Split dataset into features and target variable
    X = data.drop(target_col, axis=1)
    y = data[target_col]

    results = {}

    for cv in cv_splits:
        # Initialize KFold
        k_fold = KFold(n_splits=cv, shuffle=True, random_state=42)

        # Lists to store scores
        mae_scores = []
        mse_scores = []
        rmse_scores = []
        r_squared_scores = []
        aic_scores = []
        bic_scores = []
        p_values_list = []

        for train_index, test_index in k_fold.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Standardize features by removing the mean and scaling to unit variance
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            # Add constant to features for OLS
            X_train_sm = sm.add_constant(X_train_scaled)

            # OLS Regression
            ols_model = sm.OLS(y_train, X_train_sm)
            ols_results = ols_model.fit()

            # Predictions
            X_test_sm = sm.add_constant(X_test_scaled)
            y_pred = ols_results.predict(X_test_sm)

            # Calculate evaluation metrics
            mae = mean_absolute_error(y_test, y_pred)
            mse = mean_squared_error(y_test, y_pred)
            rmse = mean_squared_error(y_test, y_pred, squared=False)
            r_squared = r2_score(y_test, y_pred)
            aic = ols_results.aic
            bic = ols_results.bic
            p_values = ols_results.pvalues.drop('const')  # Exclude p-value of intercept

            # Append scores
            mae_scores.append(mae)
            mse_scores.append(mse)
            rmse_scores.append(rmse)
            r_squared_scores.append(r_squared)
            aic_scores.append(aic)
            bic_scores.append(bic)
            p_values_list.append(p_values)

        # Calculate the average scores for each metric
        avg_mae = np.mean(mae_scores)
        avg_mse = np.mean(mse_scores)
        avg_rmse = np.mean(rmse_scores)
        avg_r_squared = np.mean(r_squared_scores)
        avg_aic = np.mean(aic_scores)
        avg_bic = np.mean(bic_scores)

        # Flatten the nested lists of p-values
        flattened_p_values = [item for sublist in p_values_list for item in sublist]

        # Calculate the average p-value for each feature
        avg_p_values = np.mean(flattened_p_values, axis=0)

        # Store results
        results[f'CV_{cv}'] = {
            'MAE': avg_mae,
            'MSE': avg_mse,
            'RMSE': avg_rmse,
            'R-Squared': avg_r_squared,
            'AIC': avg_aic,
            'BIC': avg_bic,
            'P-Values': avg_p_values.tolist()  # Convert numpy array to list
        }

    return pd.DataFrame.from_dict(results, orient='index')

In [10]:
# Loading all Datasets into the required variables
c_cancer, r_life_expectancy = load_datasets()

# Scores

## Dataset 1: Cancer Detection Dataset (Classification)

In [11]:
c_cancer
null_check(c_cancer)
print(c_cancer.info())
c_cancer = transform_data(c_cancer)

# Define the number of splits for cross-validation
cv_splits = [5, 10, 15]

print("\nClassification:")
knn_results = KNNScore(c_cancer, 'diagnosis', cv_splits)
print("\nKNN Classifier:")
display(knn_results)

log_reg_results = LogisticRegressionScore(c_cancer, 'diagnosis', cv_splits)
print("\nLogistic Regression:")
display(log_reg_results)

Column Name: id
Column DataType: int64
Column has null: False


Column Name: diagnosis
Column DataType: object
Column unique values: ['M' 'B']
Column has null: False


Column Name: radius_mean
Column DataType: float64
Column has null: False


Column Name: texture_mean
Column DataType: float64
Column has null: False


Column Name: perimeter_mean
Column DataType: float64
Column has null: False


Column Name: area_mean
Column DataType: float64
Column has null: False


Column Name: smoothness_mean
Column DataType: float64
Column has null: False


Column Name: compactness_mean
Column DataType: float64
Column has null: False


Column Name: concavity_mean
Column DataType: float64
Column has null: False


Column Name: concave points_mean
Column DataType: float64
Column has null: False


Column Name: symmetry_mean
Column DataType: float64
Column has null: False


Column Name: fractal_dimension_mean
Column DataType: float64
Column has null: False


Column Name: radius_se
Column DataType: float64

Unnamed: 0,F1 Score (Positive),F1 Score (Negative),AUC Score,Accuracy
CV_5,0.9426,0.9665,0.949,0.9578
CV_10,0.9468,0.9729,0.9545,0.9648
CV_15,0.9551,0.9744,0.9612,0.9683



Logistic Regression:


Unnamed: 0,F1 Score (Positive),F1 Score (Negative),AUC Score,Accuracy
CV_5,0.9646,0.9789,0.9704,0.9736
CV_10,0.9626,0.9768,0.9684,0.9718
CV_15,0.9706,0.9806,0.9763,0.9771


## Dataset 2: Life Expectancy Dataset (Regression)

In [12]:
r_life_expectancy
null_check(r_life_expectancy)
print(c_cancer.info())
clean_data(r_life_expectancy,[])
r_life_expectancy = transform_data(r_life_expectancy)

# Define the number of splits for cross-validation
cv_splits = [5, 10, 15]

print("\nRegression:")
knn_metrics = KNNRegressionMetrics(r_life_expectancy, 'Life expectancy ', cv_splits)
print("\nKNN Regression Metrics:")
display(knn_metrics)
# For KNN P-value Avg
flattened_p_values = knn_metrics['P-Values'].apply(lambda x: [item for sublist in x for item in sublist])
avg_p_values_by_cv = {}
for cv_split, p_values in flattened_p_values.items():
    avg_p_values_by_cv[cv_split] = np.mean(p_values)
for cv_split, avg_p_value in avg_p_values_by_cv.items():
    print(f"Average p-value for CV_{cv_split}: {avg_p_value}")

ols_metrics = OLSRegressionMetrics(r_life_expectancy, 'Life expectancy ', cv_splits)
print("\nOLS Regression Metrics:")
display(ols_metrics)

Column Name: Country
Column DataType: object
Column unique values: ['Afghanistan' 'Albania' 'Algeria' 'Angola' 'Antigua and Barbuda'
 'Argentina' 'Armenia' 'Australia' 'Austria' 'Azerbaijan' 'Bahamas'
 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus' 'Belgium' 'Belize' 'Benin'
 'Bhutan' 'Bolivia (Plurinational State of)' 'Bosnia and Herzegovina'
 'Botswana' 'Brazil' 'Brunei Darussalam' 'Bulgaria' 'Burkina Faso'
 'Burundi' "Côte d'Ivoire" 'Cabo Verde' 'Cambodia' 'Cameroon' 'Canada'
 'Central African Republic' 'Chad' 'Chile' 'China' 'Colombia' 'Comoros'
 'Congo' 'Cook Islands' 'Costa Rica' 'Croatia' 'Cuba' 'Cyprus' 'Czechia'
 "Democratic People's Republic of Korea"
 'Democratic Republic of the Congo' 'Denmark' 'Djibouti' 'Dominica'
 'Dominican Republic' 'Ecuador' 'Egypt' 'El Salvador' 'Equatorial Guinea'
 'Eritrea' 'Estonia' 'Ethiopia' 'Fiji' 'Finland' 'France' 'Gabon' 'Gambia'
 'Georgia' 'Germany' 'Ghana' 'Greece' 'Grenada' 'Guatemala' 'Guinea'
 'Guinea-Bissau' 'Guyana' 'Haiti' 'Honduras' 'H

Unnamed: 0,MAE,MSE,RMSE,R-Squared,AIC,BIC,P-Values
CV_5,0.2295,0.1129,0.3341,0.8873,1375.1319,1489.1971,"[[0.6279697821386685, 5.5949994153848646e-06, ..."
CV_10,0.2169,0.1034,0.3179,0.8968,1546.2709,1662.9273,"[[0.6414373800908385, 4.346370641099101e-07, 0..."
CV_15,0.2165,0.1031,0.3161,0.8971,1602.8807,1720.3372,"[[0.6337422971958243, 1.949069897124804e-07, 0..."


Average p-value for CV_CV_5: 0.21454547449546657
Average p-value for CV_CV_10: 0.2083206156785833
Average p-value for CV_CV_15: 0.2058086237877868

OLS Regression Metrics:


Unnamed: 0,MAE,MSE,RMSE,R-Squared,AIC,BIC,P-Values
CV_5,0.3157,0.1689,0.4109,0.8289,1375.1319,1489.1971,0.2145
CV_10,0.314,0.1668,0.4083,0.83,1546.2709,1662.9273,0.2083
CV_15,0.3135,0.1666,0.4074,0.8299,1602.8807,1720.3372,0.2058
