In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
import joblib

# Load the dataset
df = pd.read_csv("general-information-of-schools-usage.csv")

print(df.head(10))

   bus_connections zone_code             type_code   nature_code  \
0                1     NORTH     GOVERNMENT SCHOOL  CO-ED SCHOOL   
1               13     NORTH     GOVERNMENT SCHOOL  CO-ED SCHOOL   
2                5     NORTH     GOVERNMENT SCHOOL  CO-ED SCHOOL   
3               11      EAST  GOVERNMENT-AIDED SCH  CO-ED SCHOOL   
4               15     SOUTH     GOVERNMENT SCHOOL  CO-ED SCHOOL   
5                4     SOUTH    INDEPENDENT SCHOOL  CO-ED SCHOOL   
6                6     SOUTH  GOVERNMENT-AIDED SCH  BOYS' SCHOOL   
7                2      WEST     GOVERNMENT SCHOOL  CO-ED SCHOOL   
8                8     NORTH  GOVERNMENT-AIDED SCH  BOYS' SCHOOL   
9                9      WEST  GOVERNMENT-AIDED SCH  CO-ED SCHOOL   

     session_code  secondary_and_jc  sap_ind  autonomous_ind  gifted_ind  \
0  SINGLE SESSION                 0        0               0           0   
1  SINGLE SESSION                 0        0               0           0   
2  SINGLE SESSION      

In [3]:
# Apply categorical data with one-hot encoding
features_df = pd.get_dummies(df, columns=['zone_code', 'type_code', 'nature_code','session_code'])

# Remove the PSLE score from the feature data
del features_df["PSLE_score"]

print(features_df.head(10))

# Create the X and y arrays
X = features_df.to_numpy()
y = df["PSLE_score"].to_numpy()

# Split the dataset into training set and test set (70%/30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8)

   bus_connections  secondary_and_jc  sap_ind  autonomous_ind  gifted_ind  \
0                1                 0        0               0           0   
1               13                 0        0               0           0   
2                5                 0        0               1           0   
3               11                 0        1               1           0   
4               15                 0        0               0           0   
5                4                 1        0               0           1   
6                6                 0        0               0           0   
7                2                 0        0               1           0   
8                8                 1        1               1           1   
9                9                 0        0               0           0   

   ip_ind  clubs_cca  others_cca  sports_cca  uniform_cca  ...  \
0       0          1           0           5            3  ...   
1       0          1

In [7]:
# Create and fit the model (Grid Search)
model = ensemble.GradientBoostingRegressor(
        n_estimators = 50,
        learning_rate = 0.2,
        max_depth = 5,
        min_samples_leaf = 3,
        max_features = 4,
        loss = "ls",
        subsample = 0.8,
        random_state = 8
)

# Parameters we want to try
param_grid = {
    'n_estimators': [30, 40, 50],
    'max_depth': [4, 5, 6],
    'min_samples_leaf': [1, 2, 3],
    'learning_rate': [0.2, 0.3],
    'loss': ['ls', 'lad', 'huber']
}

gs_cv = GridSearchCV(model, param_grid, n_jobs=4, verbose=100)
gs_cv.fit(X_train, y_train)
print(gs_cv.best_params_)

# While Grid Search comes close to giving us the best combination of hyper-parameters for the training dataset
# This does not corespond to the bext combination for the test dataset
# I still did some individual tuning to obtain optimal hyper-parameters (shown below) for both the training and test dataset 
# {'learning_rate': 0.2, 'loss': 'ls', 'max_depth': 5, 'min_samples_leaf': 3, 'n_estimators': 50}

model.fit(X_train, y_train)

# Find the error rate on the training dataset
mae = mean_absolute_error(y_train, model.predict(X_train))
print("Training Set Mean Absolute Error : ", mae)

# Find the error rate on the test dataset
mae = mean_absolute_error(y_test, model.predict(X_test))
print("Testing Set Mean Absolute Error : ", mae)

# Save the train model
joblib.dump(model, 'trained_PSLE_regressor_model.pkl')

Training Set Mean Absolute Error :  0.7273535541582427
Testing Set Mean Absolute Error :  1.832602066177619


['trained_PSLE_regressor_model.pkl']

In [6]:
# Testing alternative models (Linear Regression)
from sklearn.linear_model import LinearRegression
LR = LinearRegression()
LR.fit(X_train, y_train)

# Find the error rate on the training dataset
mae = mean_absolute_error(y_train, LR.predict(X_train))
print("Training Set Mean Absolute Error : ", mae)

# Find the error rate on the test dataset
mae = mean_absolute_error(y_test, LR.predict(X_test))
print("Testing Set Mean Absolute Error : ", mae)

Training Set Mean Absolute Error :  1.789797026363039
Testing Set Mean Absolute Error :  2.478084325283088


In [16]:
# Testing alternative models (Logistic Regression)
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression(solver='newton-cg', multi_class='auto')
LR.fit(X_train, y_train)

# Find the error rate on the training dataset
mae = mean_absolute_error(y_train, LR.predict(X_train))
print("Training Set Mean Absolute Error : ", mae)

# Find the error rate on the test dataset
mae = mean_absolute_error(y_test, LR.predict(X_test))
print("Testing Set Mean Absolute Error : ", mae)

Training Set Mean Absolute Error :  1.3113207547169812
Testing Set Mean Absolute Error :  2.760869565217391


In [20]:
# Testing alternative models (Decision Tree Regressor)
from sklearn.tree import DecisionTreeRegressor

DTR = DecisionTreeRegressor()
DTR.fit(X_train, y_train)

# Find the error rate on the training dataset
mae = mean_absolute_error(y_train, DTR.predict(X_train))
print("Training Set Mean Absolute Error : ", mae)

# Find the error rate on the test dataset
mae = mean_absolute_error(y_test, DTR.predict(X_test))
print("Testing Set Mean Absolute Error : ", mae)

Training Set Mean Absolute Error :  0.0
Testing Set Mean Absolute Error :  2.152173913043478


In [33]:
# Testing alternative models (K-Nearest Neighbours)
from sklearn.neighbors import KNeighborsRegressor

KNN = KNeighborsRegressor(n_neighbors=5)
KNN.fit(X_train, y_train)

# Find the error rate on the training dataset
mae = mean_absolute_error(y_train, KNN.predict(X_train))
print("Training Set Mean Absolute Error : ", mae)

# Find the error rate on the test dataset
mae = mean_absolute_error(y_test, KNN.predict(X_test))
print("Testing Set Mean Absolute Error : ", mae)

Training Set Mean Absolute Error :  2.450943396226415
Testing Set Mean Absolute Error :  2.7913043478260864


In [14]:
#Testing alternative models (Default Ensemble Learning Without Hyperparameters)
from sklearn import ensemble

model = ensemble.GradientBoostingRegressor()
model.fit(X_train, y_train)

# Find the error rate on the training dataset
mae = mean_absolute_error(y_train, model.predict(X_train))
print("Training Set Mean Absolute Error : ", mae)

# Find the error rate on the test dataset
mae = mean_absolute_error(y_test, model.predict(X_test))
print("Testing Set Mean Absolute Error : ", mae)

Training Set Mean Absolute Error :  0.7990988425328764
Testing Set Mean Absolute Error :  2.1467309746116476


In [37]:
import numpy as np
features = np.array([[3, # bus connections
                      0,  # secondary_and_jc
                      0,  # sap_ind
                      0,  # autonomous_ind
                      1,  # gifted_ind
                      1,  # ip_ind
                      10,  # clubs_cca
                      4,  # others_cca
                      5,  # sports_cca
                      6,  # uniform_cca
                      9,  # arts_cca
                      3,  # moe_programmes_number
                      0,  # zone_code_EAST
                      0,  # zone_code_NORTH
                      0,  # zone_code_SOUTH
                      1,  # zone_code_WEST
                      0,  # type_code_GOVERNMENT SCHOOL
                      1,  # type_code_GOVERNMENT-AIDED SCH
                      0,  # type_code_INDEPENDENT SCHOOL
                      0,  # type_code_SPECIALISED INDEPENDENT SCHOOL
                      0,  # type_code_SPECIALISED SCHOOL
                      0,  # nature_code_BOYS' SCHOOL
                      1,  # nature_code_CO-ED SCHOOL
                      0,  # nature_code_GIRLS' SCHOOL
                      0,  # session_code_FULL DAY
                      1   # session_code_SINGLE SESSION
                     ]])

print(model.predict(features))

[8.00669505]


In [9]:
feature_labels = features_df.columns.values

# Create a numpy array based on the model's feature importances
importance = model.feature_importances_

# Sort the feature labels based on the feature importance rankings from the model
feature_indexes_by_importance = importance.argsort()

# Print each feature label, from most important to least important (reverse order)
for index in feature_indexes_by_importance:
    print("{} - {:.2f}%".format(feature_labels[index], (importance[index] * 100.0)))

session_code_SINGLE SESSION - 0.02%
session_code_FULL DAY - 0.04%
nature_code_GIRLS' SCHOOL - 0.07%
type_code_INDEPENDENT SCHOOL - 0.08%
nature_code_BOYS' SCHOOL - 0.21%
secondary_and_jc - 0.26%
others_cca - 0.44%
nature_code_CO-ED SCHOOL - 0.48%
zone_code_NORTH - 0.54%
type_code_SPECIALISED INDEPENDENT SCHOOL - 0.54%
type_code_GOVERNMENT-AIDED SCH - 0.57%
gifted_ind - 0.62%
zone_code_SOUTH - 0.75%
zone_code_WEST - 0.81%
zone_code_EAST - 2.18%
sap_ind - 2.84%
clubs_cca - 4.38%
bus_connections - 4.56%
type_code_SPECIALISED SCHOOL - 4.90%
uniform_cca - 5.29%
ip_ind - 6.29%
type_code_GOVERNMENT SCHOOL - 6.97%
autonomous_ind - 8.27%
arts_cca - 9.55%
sports_cca - 12.58%
moe_programmes_number - 26.76%
