In [4]:
import warnings
warnings.filterwarnings('ignore')

In [46]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [25]:
# Location of the dataset, relative to this notebook.
file_path = Path('../data_files/machine_learning_dataset.csv')

# Read the CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=file_path)

In [43]:
# Remove the 'Unnamed: 0' column from the loaded frame.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone is a no-op instead of a KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [44]:
# Feature matrix: every column of df except the 'income_group' target.
# DataFrame.drop returns a new frame, so df itself is left untouched.
X = df.drop(columns='income_group')

# Preview the first rows of the features.
X.head()

Unnamed: 0,Total_Population,CO2_Emissions_by_Sector_Building,CO2_Emissions_by_Sector_Bunker_Fuels,CO2_Emissions_by_Sector_Electricity_Heat,CO2_Emissions_by_Sector_Energy,CO2_Emissions_by_Sector_Fugitive_Emissions,CO2_Emissions_by_Sector_Industrial_Processes,CO2_Emissions_by_Sector_Land_Use_Change_and_Forestry,CO2_Emissions_by_Sector_Manufacturing_Construction,CO2_Emissions_by_Sector_Other_Fuel_Combustion,CO2_Emissions_by_Sector_Transportation,Net_Electricity_Consumption
0,35977451.0,13.24,1.6,36.44,104.89,9.25,8.25,-2.41,10.69,3.57,31.71,33.84
1,36661438.0,14.61,1.62,39.3,109.95,7.06,8.19,-0.63,10.8,3.96,34.22,38.13
2,37383899.0,16.8,1.56,43.46,118.94,6.46,7.7,-0.63,10.05,3.53,38.64,42.87
3,38140135.0,18.48,1.73,42.96,122.62,6.78,7.3,-0.63,10.01,4.12,40.26,45.2
4,38923688.0,19.21,1.67,47.86,130.45,7.28,8.36,-0.63,10.23,2.37,43.51,49.21


In [45]:
# Target vector: the raw values of the 'income_group' column.
y = df.loc[:, 'income_group'].values

In [47]:
# Partition features and target into training and testing subsets.
# The fixed random_state makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=86,
)


In [48]:
# Report the shape of each split: train/test features, then train/test targets.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(258, 12)
(86, 12)
(258,)
(86,)


In [49]:
# Standardize the features.

# Build the scaler and fit it on the training split only.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both splits.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [52]:
# Confirm the data has scaled correctly - The mean should be 0 and the std dev should be 1.

# Take the number of feature columns from the scaled array itself rather than
# hard-coding it, so the check still covers every column if the feature set changes.
A = range(X_train_scaled.shape[1])

# For each column print its mean, then its standard deviation.
for a in A:
    print(np.mean(X_train_scaled[:, a]))
    print(np.std(X_train_scaled[:, a]))

3.4425520143415706e-18
1.0
2.0655312086049424e-17
1.0
-1.3770208057366282e-16
0.9999999999999999
-2.0655312086049424e-17
1.0
4.131062417209885e-17
1.0
6.885104028683141e-18
1.0
-1.0327656043024712e-17
1.0
6.885104028683141e-18
1.0
3.4425520143415706e-18
1.0
-1.8934036078878638e-17
1.0
-4.131062417209885e-17
0.9999999999999999
0.0
1.0


In [53]:
# Creating the decision tree classifier instance.
# A fixed random_state makes the fitted tree, and therefore the predictions
# and metrics computed from it, reproducible across kernel restarts.
# The seed matches the one used for the train/test split.
model = tree.DecisionTreeClassifier(random_state=86)
# Fitting the model on the scaled training features.
model = model.fit(X_train_scaled, y_train)

In [54]:
# Predict a class for every row of the scaled testing features.
predictions = model.predict(X=X_test_scaled)

In [55]:
# Display the array of predicted values.
predictions

array([1, 0, 1, 2, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 1, 0, 1, 1,
       0, 1, 1, 0, 0, 2, 1, 2, 0, 1, 1, 2, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 2, 0, 2, 1, 2, 2, 2, 1, 0, 1, 0, 2, 1, 2, 2, 1, 1, 1, 0, 1, 1,
       2, 2, 2, 1, 1, 0, 2, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 2],
      dtype=int64)

In [57]:
# Calculating the confusion matrix.
# The class labels are derived from the values that actually occur in the true
# and predicted targets, and passed to confusion_matrix explicitly, so the
# row/column captions always match the matrix shape. Hard-coding four labels
# would raise a shape error whenever the observed class set differs.
class_labels = np.unique(np.concatenate([y_test, predictions]))
cm = confusion_matrix(y_test, predictions, labels=class_labels)

# Create a DataFrame from the confusion matrix: rows are the actual classes,
# columns are the predicted classes.
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)

cm_df

Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3
Actual 0,17,4,0,0
Actual 1,0,41,0,0
Actual 2,0,1,21,0
Actual 3,0,0,2,0


In [59]:
# Compute the accuracy of the predictions against the true test targets.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

# Show the score as the cell output.
acc_score

0.9186046511627907

In [60]:
# Summarize the evaluation: confusion matrix, accuracy, then the per-class report.

# Confusion matrix, rendered as a rich DataFrame.
print("Confusion Matrix")
display(cm_df)

# Accuracy score.
print(f"Accuracy Score : {acc_score}")

# Text classification report for the test split.
print("Classification Report")
report = classification_report(y_test, predictions)
print(report)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3
Actual 0,17,4,0,0
Actual 1,0,41,0,0
Actual 2,0,1,21,0
Actual 3,0,0,2,0


Accuracy Score : 0.9186046511627907
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.81      0.89        21
           1       0.89      1.00      0.94        41
           2       0.91      0.95      0.93        22
           3       0.00      0.00      0.00         2

    accuracy                           0.92        86
   macro avg       0.70      0.69      0.69        86
weighted avg       0.90      0.92      0.91        86

