In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_digits, fetch_covtype
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from imblearn.under_sampling import RandomUnderSampler

#### MNIST digits

In [2]:
# Download and load MNIST Digits dataset
digits = load_digits()
digits

{'data': array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ..., 10.,  0.,  0.],
        [ 0.,  0.,  0., ..., 16.,  9.,  0.],
        ...,
        [ 0.,  0.,  1., ...,  6.,  0.,  0.],
        [ 0.,  0.,  2., ..., 12.,  0.,  0.],
        [ 0.,  0., 10., ..., 12.,  1.,  0.]]),
 'target': array([0, 1, 2, ..., 8, 9, 8]),
 'frame': None,
 'feature_names': ['pixel_0_0',
  'pixel_0_1',
  'pixel_0_2',
  'pixel_0_3',
  'pixel_0_4',
  'pixel_0_5',
  'pixel_0_6',
  'pixel_0_7',
  'pixel_1_0',
  'pixel_1_1',
  'pixel_1_2',
  'pixel_1_3',
  'pixel_1_4',
  'pixel_1_5',
  'pixel_1_6',
  'pixel_1_7',
  'pixel_2_0',
  'pixel_2_1',
  'pixel_2_2',
  'pixel_2_3',
  'pixel_2_4',
  'pixel_2_5',
  'pixel_2_6',
  'pixel_2_7',
  'pixel_3_0',
  'pixel_3_1',
  'pixel_3_2',
  'pixel_3_3',
  'pixel_3_4',
  'pixel_3_5',
  'pixel_3_6',
  'pixel_3_7',
  'pixel_4_0',
  'pixel_4_1',
  'pixel_4_2',
  'pixel_4_3',
  'pixel_4_4',
  'pixel_4_5',
  'pixel_4_6',
  'pixel_4_7',
  'pixel_5_0',
  'pixel_5_1',
 

In [3]:
digit_df = pd.DataFrame(digits['data'], columns=digits['feature_names'])
digit_df

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_6,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,9.0,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1792,0.0,0.0,4.0,10.0,13.0,6.0,0.0,0.0,0.0,1.0,...,4.0,0.0,0.0,0.0,2.0,14.0,15.0,9.0,0.0,0.0
1793,0.0,0.0,6.0,16.0,13.0,11.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,6.0,16.0,14.0,6.0,0.0,0.0
1794,0.0,0.0,1.0,11.0,15.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,9.0,13.0,6.0,0.0,0.0
1795,0.0,0.0,2.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,5.0,12.0,16.0,12.0,0.0,0.0


In [4]:
digit_target_df = pd.DataFrame(digits['target'], columns=['digit'])

In [5]:
digit_df = pd.concat([digit_df, digit_target_df], axis=1)

In [6]:
# Use 8 and 9 as target class they are difficult to seperate in 2 dimensions unlike 0 and 1
# Remove the other classes

digit_df = digit_df[digit_df['digit'] != 0]
digit_df = digit_df[digit_df['digit'] != 1]
digit_df = digit_df[digit_df['digit'] != 2]
digit_df = digit_df[digit_df['digit'] != 3]
digit_df = digit_df[digit_df['digit'] != 4]
digit_df = digit_df[digit_df['digit'] != 5]
digit_df = digit_df[digit_df['digit'] != 6]
digit_df = digit_df[digit_df['digit'] != 7]

# Retrieve a sample of the dataset for classification
digit_df = digit_df.sample(n=200, random_state=27).reset_index(drop=True)
digit_data_df = digit_df.drop(columns='digit')
digit_target_df['digit'] = digit_df['digit']
digit_target_df.dropna(inplace=True)

In [7]:
# Check if target class distribution is balanced
digit_target_df['digit'].value_counts()

digit
9.0    104
8.0     96
Name: count, dtype: int64

In [8]:
# Balance using seeded under-sampling
undersampler = RandomUnderSampler(sampling_strategy='majority', random_state=27)

digit_data_df, digit_target_df = undersampler.fit_resample(digit_data_df,
                                                         digit_target_df)

In [9]:
digit_target_df.value_counts()

digit
8.0      96
9.0      96
Name: count, dtype: int64

In [10]:
# Bring together the target data and features
digit_df = pd.concat([digit_data_df, digit_target_df], axis=1)

In [11]:
digit_df.tail()

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7,digit
187,0.0,0.0,3.0,12.0,5.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,2.0,13.0,14.0,7.0,0.0,0.0,9.0
188,0.0,0.0,5.0,11.0,13.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,13.0,16.0,16.0,6.0,0.0,9.0
189,0.0,0.0,0.0,4.0,11.0,15.0,15.0,2.0,0.0,1.0,...,0.0,0.0,0.0,0.0,3.0,16.0,2.0,0.0,0.0,9.0
190,0.0,0.0,0.0,0.0,8.0,14.0,10.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,10.0,0.0,0.0,9.0
191,0.0,0.0,5.0,14.0,11.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,16.0,14.0,12.0,3.0,0.0,9.0


In [12]:
digit_df['digit'].value_counts()

digit
8.0    96
9.0    96
Name: count, dtype: int64

In [13]:
# Replace the digit class values with binary -1 and 1 for classification
digit_df['digit'] = digit_df['digit'].replace([8.0, 9.0], [-1, 1])

In [14]:
digit_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192 entries, 0 to 191
Data columns (total 65 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pixel_0_0  192 non-null    float64
 1   pixel_0_1  192 non-null    float64
 2   pixel_0_2  192 non-null    float64
 3   pixel_0_3  192 non-null    float64
 4   pixel_0_4  192 non-null    float64
 5   pixel_0_5  192 non-null    float64
 6   pixel_0_6  192 non-null    float64
 7   pixel_0_7  192 non-null    float64
 8   pixel_1_0  192 non-null    float64
 9   pixel_1_1  192 non-null    float64
 10  pixel_1_2  192 non-null    float64
 11  pixel_1_3  192 non-null    float64
 12  pixel_1_4  192 non-null    float64
 13  pixel_1_5  192 non-null    float64
 14  pixel_1_6  192 non-null    float64
 15  pixel_1_7  192 non-null    float64
 16  pixel_2_0  192 non-null    float64
 17  pixel_2_1  192 non-null    float64
 18  pixel_2_2  192 non-null    float64
 19  pixel_2_3  192 non-null    float64
 20  pixel_2_4 

In [15]:
# Output to data folder
digit_df.to_csv('data/digits_2.csv', index=False)

#### Mushroom dataset

In [16]:
# Load  data from CSV in assets folder
mushroom_df = pd.read_csv('assets/agaricus-lepiota.csv', header=None)

In [17]:
mushroom_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [18]:
# Check for missing values reprersented as '?' described on UCI ML repository dataset page
mushroom_df = mushroom_df.replace({'?':np.NaN})

In [19]:
mushroom_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       8124 non-null   object
 1   1       8124 non-null   object
 2   2       8124 non-null   object
 3   3       8124 non-null   object
 4   4       8124 non-null   object
 5   5       8124 non-null   object
 6   6       8124 non-null   object
 7   7       8124 non-null   object
 8   8       8124 non-null   object
 9   9       8124 non-null   object
 10  10      8124 non-null   object
 11  11      5644 non-null   object
 12  12      8124 non-null   object
 13  13      8124 non-null   object
 14  14      8124 non-null   object
 15  15      8124 non-null   object
 16  16      8124 non-null   object
 17  17      8124 non-null   object
 18  18      8124 non-null   object
 19  19      8124 non-null   object
 20  20      8124 non-null   object
 21  21      8124 non-null   object
 22  22      8124 non-null   

In [20]:
# Drop rows with missing values since only a sample of full dataset is used
mushroom_df = mushroom_df.dropna().reset_index(drop=True)

In [21]:
mushroom_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5644 entries, 0 to 5643
Data columns (total 23 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       5644 non-null   object
 1   1       5644 non-null   object
 2   2       5644 non-null   object
 3   3       5644 non-null   object
 4   4       5644 non-null   object
 5   5       5644 non-null   object
 6   6       5644 non-null   object
 7   7       5644 non-null   object
 8   8       5644 non-null   object
 9   9       5644 non-null   object
 10  10      5644 non-null   object
 11  11      5644 non-null   object
 12  12      5644 non-null   object
 13  13      5644 non-null   object
 14  14      5644 non-null   object
 15  15      5644 non-null   object
 16  16      5644 non-null   object
 17  17      5644 non-null   object
 18  18      5644 non-null   object
 19  19      5644 non-null   object
 20  20      5644 non-null   object
 21  21      5644 non-null   object
 22  22      5644 non-null   

In [22]:
# Retrieve a sample of the dataset for classification
mushroom_df = mushroom_df.sample(n=200, random_state=27).reset_index(drop=True)

In [23]:
# Assign column names according to UCI Mushroom description
# https://archive.ics.uci.edu/ml/dataset/73/mushroom

mushroom_df.rename({0:'edible', 1: 'cap-shape',
                2:'cap-surface',  3: 'cap-color',
                4: 'bruises', 5: 'odor', 6: 'gill-attachment',
                7: 'gill-spacing', 8: 'gill-size', 9: 'gill-color',
                10: 'stalk-shape', 11: 'stalk-root', 12: 'stalk-surface-above-ring',
                13: 'stalk-surface-below-ring', 14: 'stalk-color-above-ring', 
                15: 'stalk-color-below-ring', 16: 'veil-type', 17: 'veil-color',
                18: 'ring-number', 19: 'ring-type', 20: 'spore-print-color', 
                21: 'population', 22: 'habitat'}, axis=1, inplace=True)
mushroom_df.head()

Unnamed: 0,edible,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,e,x,s,w,f,n,f,w,b,k,...,f,w,w,p,w,o,e,n,a,g
1,p,x,s,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
2,e,x,s,n,f,n,f,w,b,k,...,s,w,w,p,w,o,e,k,a,g
3,p,x,s,g,f,c,f,w,n,g,...,s,w,w,p,w,o,p,n,s,d
4,e,x,y,w,t,l,f,c,b,g,...,s,w,w,p,w,o,p,n,n,g


In [24]:
# Swap edible (Target column) to be the final column

columns = list(mushroom_df.columns)
edible_index, habitat_index = columns.index('edible'), columns.index('habitat')
columns[habitat_index], columns[edible_index] = columns[edible_index], columns[habitat_index]
mushroom_df = mushroom_df[columns]
mushroom_df.head()

Unnamed: 0,habitat,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,edible
0,g,x,s,w,f,n,f,w,b,k,...,f,w,w,p,w,o,e,n,a,e
1,u,x,s,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,p
2,g,x,s,n,f,n,f,w,b,k,...,s,w,w,p,w,o,e,k,a,e
3,d,x,s,g,f,c,f,w,n,g,...,s,w,w,p,w,o,p,n,s,p
4,g,x,y,w,t,l,f,c,b,g,...,s,w,w,p,w,o,p,n,n,e


In [25]:
# Replace categorical strings with numerical representations
mushroom_df = mushroom_df[mushroom_df.columns].apply(lambda x: pd.factorize(x)[0])

In [26]:
data_columns = list(mushroom_df.columns)

In [27]:
# Seperate the feature columns
data_columns.remove('edible')
data_columns

['habitat',
 'cap-shape',
 'cap-surface',
 'cap-color',
 'bruises',
 'odor',
 'gill-attachment',
 'gill-spacing',
 'gill-size',
 'gill-color',
 'stalk-shape',
 'stalk-root',
 'stalk-surface-above-ring',
 'stalk-surface-below-ring',
 'stalk-color-above-ring',
 'stalk-color-below-ring',
 'veil-type',
 'veil-color',
 'ring-number',
 'ring-type',
 'spore-print-color',
 'population']

In [28]:
mushroom_df

Unnamed: 0,habitat,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,edible
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,1,1,0,1,1,1,...,1,0,0,0,0,0,1,1,1,1
2,0,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
3,2,0,0,2,0,2,0,0,1,2,...,1,0,0,0,0,0,1,0,1,1
4,0,0,1,0,1,3,0,1,0,2,...,1,0,0,0,0,0,1,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,3,0,1,3,1,3,0,1,0,1,...,3,0,0,0,0,0,1,1,1,0
196,0,1,2,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
197,2,1,2,1,1,0,0,1,0,6,...,1,1,3,0,0,0,1,1,3,0
198,0,0,2,3,0,4,0,1,0,3,...,2,2,4,0,0,0,2,2,4,1


In [29]:
# Replace the edible or poisonous strings with 1 and -1 for classification
mushroom_df['edible'] = mushroom_df['edible'].replace(['e', 'p'], [1, -1])

In [30]:
mushroom_target_df = mushroom_df['edible']

In [31]:
mushroom_target_df.head()

0    0
1    1
2    0
3    1
4    0
Name: edible, dtype: int64

In [32]:
mushroom_data_df = mushroom_df.drop(columns='edible')

In [33]:
mushroom_data_df.head()

Unnamed: 0,habitat,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,1,1,0,1,1,1,...,0,1,0,0,0,0,0,1,1,1
2,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
3,2,0,0,2,0,2,0,0,1,2,...,0,1,0,0,0,0,0,1,0,1
4,0,0,1,0,1,3,0,1,0,2,...,0,1,0,0,0,0,0,1,0,2


In [34]:
# Check if the dataset is balanced
mushroom_target_df.value_counts()

edible
0    123
1     77
Name: count, dtype: int64

In [35]:
# Balance using seeded random under-sampling
undersampler = RandomUnderSampler(sampling_strategy='majority', random_state=27)

mushroom_data_df, mushroom_target_df = undersampler.fit_resample(mushroom_data_df,
                                                             mushroom_target_df)

In [36]:
mushroom_target_df.value_counts()

edible
0    77
1    77
Name: count, dtype: int64

In [37]:
# Output to data folder
mushroom_df = pd.concat([mushroom_data_df, mushroom_target_df], axis=1)
mushroom_df.to_csv('data/mushroom.csv', index=False)

#### Car Acceptability

In [38]:
# Load in raw Car Acceptability data from assets folde
car_df = pd.read_csv('assets/car.csv', header=None)

In [39]:
car_df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [40]:
# Rename columns according to UCI ML repository
car_df.rename({0:'Buying_Price', 1: 'Maintenance_Price',
                2:'No_of_Doors',  3: 'Person_Capacity',
                4: 'Size_of_Luggage', 5: 'Safety',
                6: 'Car_Acceptability'}, axis=1, inplace=True)
car_df.head()

Unnamed: 0,Buying_Price,Maintenance_Price,No_of_Doors,Person_Capacity,Size_of_Luggage,Safety,Car_Acceptability
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [41]:
# Reduce dataset sample size to be able to run classificaition
car_df = car_df.sample(n=375, random_state=27).reset_index(drop=True)

In [42]:
car_df.head()

Unnamed: 0,Buying_Price,Maintenance_Price,No_of_Doors,Person_Capacity,Size_of_Luggage,Safety,Car_Acceptability
0,high,med,3,more,med,low,unacc
1,med,med,3,more,big,low,unacc
2,low,low,2,2,small,med,unacc
3,vhigh,high,4,2,med,med,unacc
4,high,low,2,4,small,high,acc


In [43]:
# Drop classes so only unacceptable and acceptable are left 
# (Making it a binary classification problem)

car_df = car_df[car_df['Car_Acceptability'] != 'good']
car_df = car_df[car_df['Car_Acceptability'] != 'vgood']
car_df['Car_Acceptability'].value_counts()

Car_Acceptability
unacc    275
acc       70
Name: count, dtype: int64

In [44]:
# Replace acceptable and unacceptable with -1 and 1 for classification
car_df['Car_Acceptability'] = car_df['Car_Acceptability'].replace(['unacc', 'acc'], [-1,1])

In [45]:
# Numerically encode the features
car_df['Buying_Price'] = car_df['Buying_Price'].replace(['vhigh', 'high', 'med', 'low'], [3, 2, 1, 0])
car_df['Maintenance_Price'] = car_df['Maintenance_Price'].replace(['vhigh', 'high', 'med', 'low'], [3, 2, 1, 0])
car_df['No_of_Doors'] = car_df['No_of_Doors'].replace('5more', 5)
car_df['Person_Capacity'] = car_df['Person_Capacity'].replace('more', 5)
car_df['Size_of_Luggage'] = car_df['Size_of_Luggage'].replace(['small', 'med', 'big'], [0, 1, 2])
car_df['Safety'] = car_df['Safety'].replace(['low', 'med', 'high'], [0, 1, 2])


In [46]:
car_df.head()

Unnamed: 0,Buying_Price,Maintenance_Price,No_of_Doors,Person_Capacity,Size_of_Luggage,Safety,Car_Acceptability
0,2,1,3,5,1,0,-1
1,1,1,3,5,2,0,-1
2,0,0,2,2,0,1,-1
3,3,2,4,2,1,1,-1
4,2,0,2,4,0,2,1


In [47]:
car_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 345 entries, 0 to 374
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Buying_Price       345 non-null    int64 
 1   Maintenance_Price  345 non-null    int64 
 2   No_of_Doors        345 non-null    object
 3   Person_Capacity    345 non-null    object
 4   Size_of_Luggage    345 non-null    int64 
 5   Safety             345 non-null    int64 
 6   Car_Acceptability  345 non-null    int64 
dtypes: int64(5), object(2)
memory usage: 21.6+ KB


In [48]:
# Seperate features
car_df_data = car_df.drop(columns='Car_Acceptability')

In [49]:
# Seperate targets
car_df_target = car_df['Car_Acceptability']

In [50]:
# Check if target class distribution is balanced
car_df_target.value_counts()

Car_Acceptability
-1    275
 1     70
Name: count, dtype: int64

In [51]:
# Balance using random under-sampling
undersampler = RandomUnderSampler(sampling_strategy='majority', random_state=27)

car_df_data, car_df_target = undersampler.fit_resample(car_df_data,
                                                        car_df_target)

In [52]:
car_df_target.value_counts()

Car_Acceptability
-1    70
 1    70
Name: count, dtype: int64

In [53]:
# Output to data folder
car_df = pd.concat([car_df_data, car_df_target], axis=1)
car_df.to_csv('data/car.csv', index=False)

#### Forest Cover binary

In [54]:
# Download and load Forest Cover dataset
cov_data_df, cov_target_df = fetch_covtype(random_state=27, return_X_y=True, as_frame=True)

In [55]:
cov_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 581012 entries, 0 to 581011
Data columns (total 54 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   Elevation                           581012 non-null  float64
 1   Aspect                              581012 non-null  float64
 2   Slope                               581012 non-null  float64
 3   Horizontal_Distance_To_Hydrology    581012 non-null  float64
 4   Vertical_Distance_To_Hydrology      581012 non-null  float64
 5   Horizontal_Distance_To_Roadways     581012 non-null  float64
 6   Hillshade_9am                       581012 non-null  float64
 7   Hillshade_Noon                      581012 non-null  float64
 8   Hillshade_3pm                       581012 non-null  float64
 9   Horizontal_Distance_To_Fire_Points  581012 non-null  float64
 10  Wilderness_Area_0                   581012 non-null  float64
 11  Wilderness_Area_1         

In [56]:
cov_target_df.info()

<class 'pandas.core.series.Series'>
RangeIndex: 581012 entries, 0 to 581011
Series name: Cover_Type
Non-Null Count   Dtype
--------------   -----
581012 non-null  int32
dtypes: int32(1)
memory usage: 2.2 MB


In [57]:
# Obtain the number of instances for each class
cov_target_df.value_counts()

Cover_Type
2    283301
1    211840
3     35754
7     20510
6     17367
5      9493
4      2747
Name: count, dtype: int64

In [58]:
# Conbine feature and target dataframes
cov_df = pd.concat([cov_data_df, cov_target_df], axis=1)

# 1 and 2 most common types - create binary dataset from these
cov_df = cov_df[cov_df["Cover_Type"] < 3]

# Sample examples to ensure kernel can run in practical time
cov_df = cov_df.sample(n=250, random_state=27).reset_index(drop=True)
cov_data_df = cov_df.drop(columns='Cover_Type')
cov_target_df = cov_df['Cover_Type']

In [59]:
# Check if the target class distirbution is balanced
cov_target_df.value_counts()

Cover_Type
2    143
1    107
Name: count, dtype: int64

In [60]:
# Balance using random under-sampling
undersampler = RandomUnderSampler(sampling_strategy='majority', random_state=27)

cov_data_df, cov_target_df = undersampler.fit_resample(cov_data_df,
                                                             cov_target_df)

In [61]:
cov_target_df.value_counts()

Cover_Type
1    107
2    107
Name: count, dtype: int64

In [62]:
# Replace 1 and 2 with -1 and 1 for classification
cov_target_df.replace([1, 2], [-1, 1], inplace=True)

In [63]:
cov_target_df.value_counts()

Cover_Type
-1    107
 1    107
Name: count, dtype: int64

In [64]:
# Output to data folder
cov_df = pd.concat([cov_data_df, cov_target_df], axis=1)
cov_df.to_csv('data/forest_cov_2.csv', index=False)

### Original Datasets used

Alpaydin, E., Kanyak, C.K. 1998. *Optical Recognition of Handwritten Digits* \[Online]. UCI Machine Learning Repository. Available from: https://doi.org/10.24432/C50P49 [Accessed 3 December 2023].

Blackard, J., 1998. *Covertype* \[Online]. UCI Machine Learning Repository. Available from: https://doi.org/10.24432/C50K5N [Accessed 25 December 2023].

Bohanec, M., 1988. *Car Evaluation* \[Online]. UCI Machine Learning Repository. Available from: https://doi.org/10.24432/C5JP48 [Accessed 27 June 2023].

The Audubon Society Field Guide to North American Mushrooms. 1987. *Mushroom* \[Online]. UCI Machine Learning Repository. https://doi.org/10.24432/C5959T. [Accessed 20 July 2023].