In [17]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from functools import reduce

# ML Model Creation
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics

import statsmodels.api as sm
import mlflow

# Seed intended for the random_state of stochastic estimators.
RANDOM_SEED = 42

# Folder layout, resolved against the kernel's working directory:
#   <cwd>/data          -> input workbooks
#   <cwd>/data/output   -> presumably generated outputs (not used in the visible cells)
current_dir = os.getcwd()
data_fldr = os.path.join(current_dir, "data")
out_dir = os.path.join(data_fldr, "output")

In [18]:
# Show the resolved working directory; data_fldr and out_dir are derived from it.
# NOTE(review): the saved output exposes an absolute local path (including a user name) —
# consider clearing this output before sharing the notebook.
print(current_dir)

c:\Users\Max Lee\Documents\Repos\internal-mlops\coe_model


# Datasets

In [19]:
# Load the two Excel workbooks from the data folder:
#   coe_df <- sheet "Yearly" of COE_Export.xlsx
#   pp_df  <- sheet "Consolidate" of Population.xlsx
coe_xlsx_path = os.path.join(data_fldr, "COE_Export.xlsx")
pp_xlsx_path = os.path.join(data_fldr, "Population.xlsx")

coe_df = pd.read_excel(coe_xlsx_path, sheet_name="Yearly")
pp_df = pd.read_excel(pp_xlsx_path, sheet_name="Consolidate")


In [20]:
# coe_df
# cpi_df

## Choosing the COE Category

In [21]:
# Keep only the rows whose Category is "A"; the cells that follow work on this subset.
is_category_a = coe_df['Category'] == "A"
coe_cat_df = coe_df.loc[is_category_a, :]
coe_cat_df

Unnamed: 0,Year,Category,Value
0,2000,A,38981.083333
5,2001,A,27031.791667
10,2002,A,30831.916667
15,2003,A,28754.875
20,2004,A,25180.916667
25,2005,A,16550.791667
30,2006,A,11187.166667
35,2007,A,14101.125
40,2008,A,12330.291667
45,2009,A,11600.041667


## Joining of Datasets

### Only race

In [22]:
# Left-join the population figures onto the category-A COE rows by year, so every
# COE row is kept even when no population record matches.
coe_pp = coe_cat_df.merge(pp_df, how="left", left_on="Year", right_on="Year")

# Remove the join key and the category label, then drop the row labelled 24.
# NOTE(review): the row label 24 is hard-coded — presumably a year without usable
# population data; confirm against the merged frame.
coe_pp_drop = coe_pp.drop(columns=['Year', 'Category']).drop(index=[24])
# coe_pp_drop = coe_pp_drop.iloc[:, [0, 4,7,10,13]]

# coe_pp_drop.corr()

# Machine Learning Portion

In [23]:
def log_scale(X):
    """Return ``log(1 + X)`` computed element-wise via ``np.log1p``.

    Parameters
    ----------
    X : array-like
        Values to transform; handed to ``np.log1p`` unchanged.

    Returns
    -------
    Whatever ``np.log1p`` produces for ``X`` (for example an ndarray for an
    ndarray input, a DataFrame for a DataFrame input).
    """
    log_scaled = np.log1p(X)
    return log_scaled

In [24]:
# Wrap log_scale in a scikit-learn FunctionTransformer so it can be applied through transform().
transformer = FunctionTransformer(func=log_scale)

In [25]:
# Step 4: Split the data into training and testing sets
X = coe_pp_drop.drop('Value', axis=1)  # Features: every column except the target
y = coe_pp_drop['Value']  # Target variable

# Use the notebook-wide RANDOM_SEED rather than a second literal 42, so changing
# the seed in the configuration cell really changes (and keeps consistent) every
# seeded step.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED
)

# Log-scale the training features for the statsmodels OLS fit.
X_train_log_scaled = transformer.transform(X_train)
# Add a constant column so the OLS model estimates an intercept term.
X_train_log_scaled_with_const = sm.add_constant(X_train_log_scaled)

# Preview only the first rows instead of dumping the whole design matrix.
X_train_log_scaled_with_const.head()

  x = pd.concat(x[::order], 1)


Unnamed: 0,const,Total Residents,Total Male Residents,Total Female Residents,Total Malays,Total Male Malays,Total Female Malays,Total Chinese,Total Male Chinese,Total Female Chinese,Total Indians,Total Male Indians,Total Female Indians,Total Other Ethnic Groups,Males Other Ethnic Groups,Females Other Ethnic Groups
9,1.0,15.132958,14.427845,14.451635,13.122467,12.425674,12.432957,14.834468,14.123161,14.159157,12.746972,12.090269,12.016007,11.695364,10.955445,11.046914
13,1.0,15.16222,14.452883,14.485004,13.147715,12.449713,12.459404,14.864145,14.148537,14.192966,12.770443,12.105644,12.048126,11.747974,11.006374,11.101055
1,1.0,15.017252,14.32146,14.326743,13.042863,12.354368,12.345047,14.752418,14.052178,14.066315,12.479791,11.823977,11.747871,10.801003,10.046938,10.165313
21,1.0,15.19851,14.484936,14.525382,13.207537,12.512616,12.516164,14.900746,14.180747,14.233748,12.779552,12.115026,12.056946,11.754859,10.966887,11.148334
5,1.0,15.059035,14.358497,14.373225,13.083046,12.390089,12.389714,14.781248,14.07622,14.099843,12.581532,11.915867,11.860133,11.14532,10.412231,10.490607
2,1.0,15.034257,14.336858,14.345344,13.056995,12.367668,12.360016,14.766985,14.064995,14.082605,12.513278,11.854876,11.784143,10.880629,10.130225,10.241673
12,1.0,15.155291,14.446807,14.47725,13.141224,12.443431,12.452706,14.856488,14.141649,14.184574,12.768616,12.105971,12.044012,11.741478,11.000682,11.093828
15,1.0,15.177177,14.466078,14.501665,13.163359,12.465004,12.475397,14.880224,14.162855,14.210727,12.779741,12.113436,12.059016,11.750437,11.000665,11.110894
3,1.0,15.029501,14.330369,14.342303,13.060045,12.369882,12.363909,14.760431,14.057089,14.077376,12.505807,11.839658,11.78492,10.907698,10.162268,10.264269
4,1.0,15.04318,14.343212,14.356809,13.072522,12.381021,12.377729,14.77095,14.066703,14.088782,12.535761,11.868885,11.815641,10.9964,10.257133,10.347372


## Linear Regression

In [26]:
# Step 5: fit two linear models on the training split.
#   * scikit-learn LinearRegression on the raw features (used for test-set prediction)
#   * statsmodels OLS on the log-scaled features plus constant (used for the summary table)
model = LinearRegression()
model.fit(X_train, y_train)
model_linear = model  # fit() returns the estimator itself, so both names are the same fitted object
model_stats = sm.OLS(endog=y_train, exog=X_train_log_scaled_with_const).fit()

# Step 6: score the scikit-learn model on the held-out rows.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 275999705.4379314


In [27]:
# Print the statsmodels OLS summary table (fit statistics, coefficients, p-values)
# for the model fitted on the log-scaled training features.
print(model_stats.summary())

                            OLS Regression Results                            
Dep. Variable:                  Value   R-squared:                       0.991
Model:                            OLS   Adj. R-squared:                  0.948
Method:                 Least Squares   F-statistic:                     22.89
Date:                Wed, 12 Jun 2024   Prob (F-statistic):             0.0126
Time:                        23:24:52   Log-Likelihood:                -173.46
No. Observations:                  19   AIC:                             378.9
Df Residuals:                       3   BIC:                             394.0
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                                   coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------
const           



In [28]:
# Step 7: tabulate the scikit-learn coefficients, one row per feature, most positive first.
# NOTE(review): the ordering uses the signed value, so large negative coefficients land
# at the bottom; rank by absolute value if magnitude is what should indicate importance.
coef_by_feature = {'feature': X.columns, 'coefficient': model.coef_}
coefficients = pd.DataFrame(coef_by_feature).sort_values(by='coefficient', ascending=False)
print("Coefficients:")
print(coefficients)

Coefficients:
                         feature   coefficient
0                Total Residents  2.136949e+12
1           Total Male Residents  1.493564e+12
2         Total Female Residents  3.059000e+11
11          Total Female Indians -4.983248e+10
8           Total Female Chinese -4.139264e+11
14  Females Other Ethnic Groups  -4.187772e+11
5            Total Female Malays -4.338086e+11
10            Total Male Indians -1.237497e+12
7             Total Male Chinese -1.601590e+12
13    Males Other Ethnic Groups  -1.606441e+12
4              Total Male Malays -1.621473e+12
3                   Total Malays -2.009040e+12
12    Total Other Ethnic Groups  -2.024072e+12
6                  Total Chinese -2.028922e+12
9                  Total Indians -2.393016e+12


## Decision Trees

### Random Forest Regression

In [29]:
# Base estimator; the seed makes the forest's randomness reproducible.
rf = RandomForestRegressor(random_state=RANDOM_SEED)

# Search space sized for the data. The training split holds only about 19 rows
# (the OLS summary reports 19 observations), so a single tree can never grow
# beyond 19 leaves. The earlier grid (max_depth 10/20/30, max_leaf_nodes 50/100)
# therefore never constrained any tree: all 18 combinations differed only in
# n_estimators while still paying for 5 folds of up to 700 trees each.
# The values below can actually bind on a dataset of this size.
param_grid_forest = {
    'n_estimators': [100, 300],
    'max_depth': [2, 3, None],       # None = grow until leaves are pure / min_samples_leaf is hit
    'min_samples_leaf': [1, 2, 4],   # larger leaves smooth predictions on tiny samples
    'criterion': ["squared_error"],
}

# 5-fold cross-validated search scored by negative MSE (higher is better),
# evaluated in parallel on all available cores.
grid_forest = GridSearchCV(
    estimator=rf,
    param_grid=param_grid_forest,
    cv=5,
    n_jobs=-1,
    scoring='neg_mean_squared_error',
    verbose=0,
)

# fit() returns the fitted GridSearchCV object itself.
model_forest = grid_forest.fit(X_train, y_train)

KeyboardInterrupt: 

### Gradient Boosting

# MLflow Experiment

In [None]:
# Point MLflow at the tracking server *before* selecting the experiment:
# set_experiment() registers/looks up the experiment in whichever tracking store
# is active at call time, so the URI has to be configured first for the
# experiment to exist on the server the runs are logged to.
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
mlflow.set_experiment("COE_Prediction")

# Regression evaluation metrics
def eval_metrics(actual, pred, plot_path="plots/pred_vs_actual.png"):
    """Score regression predictions and optionally save a diagnostic plot.

    The models in this notebook predict a continuous value, so classification
    scores (accuracy, F1, ROC/AUC) do not apply and raise on continuous
    targets. This function reports regression metrics instead.

    Parameters
    ----------
    actual : array-like of numbers
        Observed target values.
    pred : array-like of numbers
        Predicted values, same length as ``actual``.
    plot_path : str or None, default "plots/pred_vs_actual.png"
        Where to save a predicted-vs-actual scatter plot. Pass ``None`` to
        skip plotting entirely.

    Returns
    -------
    tuple of float
        ``(rmse, mae, r2)``. ``r2`` is ``nan`` when ``actual`` has zero
        variance, because the score is undefined in that case.

    Raises
    ------
    ValueError
        If the inputs are empty or their lengths differ.
    """
    actual_arr = np.asarray(actual, dtype=float).ravel()
    pred_arr = np.asarray(pred, dtype=float).ravel()
    if actual_arr.shape != pred_arr.shape:
        raise ValueError(
            "actual and pred must have the same length, got %d and %d"
            % (actual_arr.size, pred_arr.size)
        )
    if actual_arr.size == 0:
        raise ValueError("cannot compute metrics on empty input")

    residuals = actual_arr - pred_arr
    ss_res = float(np.sum(residuals ** 2))
    ss_tot = float(np.sum((actual_arr - actual_arr.mean()) ** 2))

    rmse = float(np.sqrt(ss_res / actual_arr.size))
    mae = float(np.mean(np.abs(residuals)))
    # R^2 is undefined for a constant target; report nan rather than dividing by zero.
    r2 = 1.0 - ss_res / ss_tot if ss_tot > 0 else float("nan")

    if plot_path is not None:
        # Shared axis limits so the dashed diagonal marks perfect predictions.
        lo = float(min(actual_arr.min(), pred_arr.min()))
        hi = float(max(actual_arr.max(), pred_arr.max()))
        plt.figure(figsize=(8, 8))
        plt.scatter(actual_arr, pred_arr, color='blue', label='R2 = %0.2f' % r2)
        plt.plot([lo, hi], [lo, hi], 'r--', label='Perfect prediction')
        plt.xlabel('Actual', size=14)
        plt.ylabel('Predicted', size=14)
        plt.legend(loc='lower right')
        # Save plot (create the target folder only when the path has one)
        plot_dir = os.path.dirname(plot_path)
        if plot_dir:
            os.makedirs(plot_dir, exist_ok=True)
        plt.savefig(plot_path)
        # Close plot so figures do not accumulate in the kernel
        plt.close()

    return (rmse, mae, r2)


def mlflow_logging(model, X, y, name,
                   tracking_uri="http://127.0.0.1:5000/",
                   experiment_name="COE_Prediction"):
    """Evaluate a fitted model on ``(X, y)`` and record the result as one MLflow run.

    Parameters
    ----------
    model : fitted estimator
        Must provide ``predict``. A fitted search object exposing
        ``best_params_`` / ``best_score_`` (e.g. GridSearchCV) has those
        logged; any other estimator has ``get_params()`` logged instead.
    X : array-like
        Features to predict on.
    y : array-like
        True target values for ``X``.
    name : str
        Artifact path for the logged model; also used in the plot file name so
        runs for different models do not overwrite each other's plot.
    tracking_uri : str, default "http://127.0.0.1:5000/"
        MLflow tracking server to log to.
    experiment_name : str, default "COE_Prediction"
        Experiment the run is recorded under.
    """
    # The tracking URI and experiment must be configured BEFORE the run starts;
    # changing the URI inside an active run leaves that run in the old store.
    mlflow.set_tracking_uri(tracking_uri)
    mlflow.set_experiment(experiment_name)

    plot_path = os.path.join("plots", f"{name}_pred_vs_actual.png")

    # The context manager ends the run on exit, so no explicit end_run() is needed.
    with mlflow.start_run() as run:
        mlflow.set_tag("run_id", run.info.run_id)

        pred = model.predict(X)
        (rmse, mae, r2) = eval_metrics(y, pred, plot_path=plot_path)

        if hasattr(model, "best_params_"):
            # Hyper-parameter search: log the winning parameters and their CV score.
            mlflow.log_params(model.best_params_)
            mlflow.log_metric("Mean CV score", model.best_score_)
        else:
            # Plain estimator (e.g. LinearRegression) has no search results.
            mlflow.log_params(model.get_params())

        mlflow.log_metric("RMSE", rmse)
        mlflow.log_metric("MAE", mae)
        if not np.isnan(r2):
            mlflow.log_metric("R2", r2)

        # Logging artifacts and model
        mlflow.log_artifact(plot_path)
        mlflow.sklearn.log_model(model, name)

In [None]:
# Record one MLflow run per fitted model, each scored on the held-out test split.
mlflow_logging(model=model_forest, X=X_test, y=y_test, name="RandomForestRegressor")
mlflow_logging(model=model_linear, X=X_test, y=y_test, name="LinearRegression")