# Classification ML

In [1]:
# import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.simplefilter('ignore')

In [2]:
# preliminary assessment function

def assess_data(file, encoding='utf-8'):
    
    # read file
    df = pd.read_csv(file, encoding=encoding)
    
    # check header
    print("The First Five Observations in DataFrame")
    display(df.head())
    print('\n')
    
    # check tail
    print("The Last Five Observations in DataFrame")
    display(df.tail())
    print('\n')
    
    # check shape of df
    print("The Shape of DataFrame")
    print(df.shape)
    print('\n')
    
    # check info of df
    print("Basic Information of DataFrame")
    print('\n')
    display(df.info())
    print('\n')
    
    # check number of unique values in df
    print("Number of Unique Values in DataFrame")
    print('\n')
    print(df.nunique())
    print('\n')
    
    # check number of missing values in df
    print("Number of Missing Values in DataFrame")
    print('\n')
    print(df.isnull().sum())
    print('\n')
    
    # check number duplicates in df
    print("Number of Duplicates in DataFrame")
    print("Number of duplicates: ", df.duplicated().sum())
    
    return df

In [3]:
# load data and assess data
df = assess_data('https://query.data.world/s/wh6j7rxy2hvrn4ml75ci62apk5hgae')

The First Five Observations in DataFrame


Unnamed: 0,country,year,country_code,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
0,Armenia,1992,1,AreaPerCap,0.140292,0.199546,0.097188051,0.036888,0.02932,0.0,0.5032351,3A
1,Armenia,1992,1,AreaTotHA,483000.0,687000.0,334600.0,127000.0,100943.0008,0.0,1732543.0,3A
2,Armenia,1992,1,BiocapPerCap,0.159804,0.135261,0.084003213,0.013742,0.033398,0.0,0.4262086,3A
3,Armenia,1992,1,BiocapTotGHA,550176.2427,465677.9722,289207.1078,47311.55172,114982.2793,0.0,1467355.0,3A
4,Armenia,1992,1,EFConsPerCap,0.38751,0.189462,1.26e-06,0.004165,0.033398,1.114093,1.728629,3A




The Last Five Observations in DataFrame


Unnamed: 0,country,year,country_code,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
72181,World,2016,5001,BiocapTotGHA,3984702000.0,1504757000.0,5111762779.0,1095445000.0,472616300.0,0.0,12169280000.0,3A
72182,World,2016,5001,EFConsPerCap,0.5336445,0.1402092,0.273495,0.08974253,0.06329435,1.646235,2.746619,3A
72183,World,2016,5001,EFConsTotGHA,3984702000.0,1046937000.0,2042179333.0,670103900.0,472616300.0,12292370000.0,20508910000.0,3A
72184,World,2016,5001,EFProdPerCap,0.5336445,0.1402092,0.273495,0.08974253,0.06329435,1.646235,2.746619,3A
72185,World,2016,5001,EFProdTotGHA,3984702000.0,1046937000.0,2042179333.0,670103900.0,472616300.0,12292370000.0,20508910000.0,3A




The Shape of DataFrame
(72186, 12)


Basic Information of DataFrame


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72186 entries, 0 to 72185
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   country         72186 non-null  object 
 1   year            72186 non-null  int64  
 2   country_code    72186 non-null  int64  
 3   record          72186 non-null  object 
 4   crop_land       51714 non-null  float64
 5   grazing_land    51714 non-null  float64
 6   forest_land     51714 non-null  object 
 7   fishing_ground  51713 non-null  float64
 8   built_up_land   51713 non-null  float64
 9   carbon          51713 non-null  float64
 10  total           72177 non-null  float64
 11  QScore          72185 non-null  object 
dtypes: float64(6), int64(2), object(4)
memory usage: 6.6+ MB


None



Number of Unique Values in DataFrame


country             193
year                 56
country_code        193
record                8
crop_land         36869
grazing_land      40629
forest_land       49103
fishing_ground    45244
built_up_land     24774
carbon            25493
total             71690
QScore                5
dtype: int64


Number of Missing Values in DataFrame


country               0
year                  0
country_code          0
record                0
crop_land         20472
grazing_land      20472
forest_land       20472
fishing_ground    20473
built_up_land     20473
carbon            20473
total                 9
QScore                1
dtype: int64


Number of Duplicates in DataFrame
Number of duplicates:  0


In [4]:
# check distribution of target variable
# prints
df['QScore'].value_counts()

3A    51481
2A    10576
2B    10096
1A       16
1B       16
Name: QScore, dtype: int64

In [5]:
#for simplicity, we will drop the rows with missing values.
df = df.dropna()
df.isna().sum()

country           0
year              0
country_code      0
record            0
crop_land         0
grazing_land      0
forest_land       0
fishing_ground    0
built_up_land     0
carbon            0
total             0
QScore            0
dtype: int64

> An obvious change in our target variable after removing the missing values is that there
are only three classes left and from the distribution of the 3 classes, we can see that
there is an obvious imbalance between the classes. There are methods that can be applied to
handle this imbalance such as oversampling and undersampling.

> Oversampling involves increasing the number of instances in the class with fewer instances
while undersampling involves reducing the data points in the class with more instances.

> For now, we will convert this to a binary classification problem by combining class '2A'
and '1A'.

In [44]:
df['QScore'] = df['QScore'].replace(['1A'], '2A')
df.QScore.value_counts()

3A    51473
2A      240
Name: QScore, dtype: int64

In [45]:
# set sample seed for reproduction
np.random.seed(40)

# create clasification sample
df_2A = df[df.QScore=='2A']
df_3A = df[df.QScore=='3A'].sample(350)
data_df = df_2A.append(df_3A)

In [46]:
import sklearn.utils
data_df = sklearn.utils.shuffle(data_df)
data_df = data_df.reset_index(drop=True)

# print shape
print(data_df.shape)

# check haeder
data_df.head()

(590, 12)


Unnamed: 0,country,year,country_code,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
0,Liberia,2016,123,AreaPerCap,0.1517181,0.4334803,0.899254882,0.7155026,0.035708,0.0,2.235664,2A
1,Canada,1972,33,EFProdTotGHA,51100470.0,10896340.0,88482811.3,13170320.0,697964.7767,135924000.0,300271900.0,3A
2,Costa Rica,1973,48,EFConsPerCap,0.30986,0.4554701,1.227853421,0.03620725,0.061185,0.6592807,2.749857,3A
3,Guyana,1975,91,EFProdTotGHA,383007.5,297518.5,460876.2592,26162.73,39022.0964,644842.2,1851429.0,3A
4,Timor-Leste,2016,176,BiocapPerCap,0.1693559,0.05429385,0.424703833,0.8405822,0.038584,0.0,1.527519,2A


In [47]:
data_df.QScore.value_counts()

3A    350
2A    240
Name: QScore, dtype: int64

In [48]:
#more preprocessing
data_df = data_df.drop(columns=['country_code', 'country', 'year'])
data_df.head(1)

Unnamed: 0,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
0,AreaPerCap,0.151718,0.43348,0.899254882,0.715503,0.035708,0.0,2.235664,2A


In [49]:
# divide data to feature and target
X = data_df.drop(columns='QScore')
y = data_df['QScore']

In [50]:
# split the data into training and testing sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
y_train.value_counts()

3A    246
2A    167
Name: QScore, dtype: int64

> There is still an imbalance in the class distribution. For this, we use SMOTE only on the training data to handle this.

In [51]:
#encode categorical variable
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
x_train.record = encoder.fit_transform(x_train.record)
x_test.record = encoder.transform(x_test.record)

In [52]:
import imblearn
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=1)
x_train_balanced, y_balanced = smote.fit_resample(x_train, y_train)

In [53]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
normalised_train_df = scaler.fit_transform(x_train_balanced.drop(columns=['record']))
normalised_train_df = pd.DataFrame(normalised_train_df, columns=x_train_balanced.drop(columns=['record']).columns)
normalised_train_df['record'] = x_train_balanced['record']

In [54]:
x_test = x_test.reset_index(drop=True)
normalised_test_df = scaler.transform(x_test.drop(columns=['record']))
normalised_test_df = pd.DataFrame(normalised_test_df, columns=x_test.drop(columns=['record']).columns)
normalised_test_df['record'] = x_test['record']

In [55]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit(normalised_train_df, y_balanced)

### return
> LogisticRegression(C=1.0, class_weight=None, dual=False, 
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=100, 
                   multi_class='auto', n_jobs=None, penalty='l2', 
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0, 
                   warm_start=False)

### Cross Validation and Accuracy

#### Performance Metrics

In [56]:
# Confusion Matrix
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, confusion_matrix

new_predictions = log_reg.predict(normalised_test_df)
cnf_mat = confusion_matrix(y_true=y_test, y_pred=new_predictions, labels=['2A', '3A'])
cnf_mat

array([[46, 27],
       [64, 40]], dtype=int64)

In [57]:
# Accuracy
accuracy = accuracy_score(y_true=y_test, y_pred=new_predictions)
print('Accuracy: {}'.format(round(accuracy*100), 2))

Accuracy: 49


In [58]:
# Precision
precision = precision_score(y_true=y_test, y_pred=new_predictions, pos_label='2A')
print('Precision: {}'.format(round(precision*100), 2))

Precision: 42


In [59]:
# Recall
recall = recall_score(y_true=y_test, y_pred=new_predictions, pos_label='2A')
print('Recall: {}'.format(round(recall*100), 2))

Recall: 63


In [60]:
# F1-Score
f1 = f1_score(y_true=y_test, y_pred=new_predictions, pos_label='2A')
print('F1: {}'.format(round(f1*100), 2)) #prints 45.0

F1: 50


In [61]:
# Cross validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(log_reg, normalised_train_df, y_balanced, cv=5, scoring='f1_macro')
scores

array([0.50179727, 0.50505051, 0.46710737, 0.54038562, 0.45777221])

### K-fold Cross Validation

In [62]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=5)
kf.split(normalised_train_df)
f1_scores = []

#run for every split
for train_index, test_index in kf.split(normalised_train_df):
    x_train, x_test = normalised_train_df.iloc[train_index], normalised_train_df.iloc[test_index]
    y_train, y_test = y_balanced[train_index], y_balanced[test_index]
    model = LogisticRegression().fit(x_train, y_train)
    #save result to list
    f1_scores.append(f1_score(y_true=y_test, y_pred=model.predict(x_test), pos_label='2A')*100)

### Stratified K-Fold Cross Validation

In [63]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
f1_scores = []

#run for every split
for train_index, test_index in skf.split(normalised_train_df, y_balanced):
    x_train, x_test = np.array(normalised_train_df)[train_index], np.array(normalised_train_df)[test_index]
    y_train, y_test = y_balanced[train_index], y_balanced[test_index]
    model = LogisticRegression().fit(x_train, y_train)
    #save result to list
    f1_scores.append(f1_score(y_true=y_test, y_pred=model.predict(x_test), pos_label='2A'))

### Leave One Out Cross Validation (LOOCV)

In [64]:
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
scores = cross_val_score(LogisticRegression(), normalised_train_df, y_balanced, cv=loo, scoring='f1_macro')
average_score = scores.mean() * 100

### Tree-Based Methods

In [65]:
from sklearn.tree import DecisionTreeClassifier

dec_tree = DecisionTreeClassifier()
dec_tree.fit(normalised_train_df, y_balanced)

In [67]:
dec_pred = dec_tree.predict(normalised_test_df)
dec_pred[:10]

array(['2A', '3A', '3A', '2A', '2A', '3A', '2A', '3A', '2A', '2A', '3A',
       '3A', '2A', '3A', '3A', '2A', '2A', '3A', '3A', '3A', '3A', '3A',
       '2A', '3A', '3A', '3A', '3A', '3A', '2A', '2A', '2A', '3A', '3A',
       '2A', '2A', '3A', '2A', '2A', '3A', '2A', '2A', '3A', '3A', '2A',
       '3A', '3A', '2A', '3A', '2A', '3A', '3A', '3A', '2A', '3A', '3A',
       '3A', '3A', '2A', '2A', '2A', '3A', '3A', '2A', '2A', '2A', '3A',
       '3A', '2A', '2A', '2A', '2A', '3A', '2A', '2A', '3A', '3A', '3A',
       '3A', '2A', '3A', '2A', '2A', '3A', '2A', '2A', '3A', '3A', '2A',
       '2A', '3A', '2A', '2A', '3A', '2A', '3A', '3A', '2A', '3A', '2A',
       '2A', '3A', '2A', '2A', '3A', '3A', '2A', '3A', '2A', '3A', '3A',
       '2A', '2A', '3A', '3A', '3A', '3A', '3A', '2A', '3A', '2A', '3A',
       '2A', '3A', '3A', '3A', '2A', '3A', '3A', '3A', '3A', '3A', '3A',
       '3A', '3A', '2A', '2A', '3A', '2A', '2A', '3A', '3A', '3A', '2A',
       '3A', '2A', '2A', '2A', '2A', '2A', '3A', '3

In [68]:
y_test

1      3A
2      3A
3      3A
6      3A
7      2A
       ..
472    2A
478    2A
479    2A
481    2A
489    2A
Name: QScore, Length: 98, dtype: object