# Classification: Prediction of sex from possum body measures

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

## Load and explore dataset

In [2]:
possum = pd.read_csv('possum.csv')

In [3]:
print(possum.head())

   case  site  Pop sex  age  hdlngth  skullw  totlngth  taill  footlgth  \
0     1     1  Vic   m  8.0     94.1    60.4      89.0   36.0      74.5   
1     2     1  Vic   f  6.0     92.5    57.6      91.5   36.5      72.5   
2     3     1  Vic   f  6.0     94.0    60.0      95.5   39.0      75.4   
3     4     1  Vic   f  6.0     93.2    57.1      92.0   38.0      76.1   
4     5     1  Vic   f  2.0     91.5    56.3      85.5   36.0      71.0   

   earconch   eye  chest  belly  
0      54.5  15.2   28.0   36.0  
1      51.2  16.0   28.5   33.0  
2      51.9  15.5   30.0   34.0  
3      52.2  15.2   28.0   34.0  
4      53.2  15.1   28.5   33.0  


In [4]:
print(possum.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   case      104 non-null    int64  
 1   site      104 non-null    int64  
 2   Pop       104 non-null    object 
 3   sex       104 non-null    object 
 4   age       102 non-null    float64
 5   hdlngth   104 non-null    float64
 6   skullw    104 non-null    float64
 7   totlngth  104 non-null    float64
 8   taill     104 non-null    float64
 9   footlgth  103 non-null    float64
 10  earconch  104 non-null    float64
 11  eye       104 non-null    float64
 12  chest     104 non-null    float64
 13  belly     104 non-null    float64
dtypes: float64(10), int64(2), object(2)
memory usage: 11.5+ KB
None


In [5]:
print(possum.describe())

             case        site         age     hdlngth      skullw    totlngth  \
count  104.000000  104.000000  102.000000  104.000000  104.000000  104.000000   
mean    52.500000    3.625000    3.833333   92.602885   56.883654   87.088462   
std     30.166206    2.349086    1.909244    3.573349    3.113426    4.310549   
min      1.000000    1.000000    1.000000   82.500000   50.000000   75.000000   
25%     26.750000    1.000000    2.250000   90.675000   54.975000   84.000000   
50%     52.500000    3.000000    3.000000   92.800000   56.350000   88.000000   
75%     78.250000    6.000000    5.000000   94.725000   58.100000   90.000000   
max    104.000000    7.000000    9.000000  103.100000   68.600000   96.500000   

            taill    footlgth    earconch         eye       chest       belly  
count  104.000000  103.000000  104.000000  104.000000  104.000000  104.000000  
mean    37.009615   68.459223   48.130769   15.046154   27.000000   32.586538  
std      1.959518    4.395306 

In [6]:
print(possum.value_counts(possum['site']))

site
1    33
7    18
2    13
5    13
6    13
3     7
4     7
dtype: int64


## Drop columns that are not relevant in this context

In [7]:
possum = possum.drop(['case', 'site', 'Pop', 'age'], axis=1)

In [8]:
possum.head()

Unnamed: 0,sex,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
0,m,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
1,f,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
2,f,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
3,f,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
4,f,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0


## Handle missing data

### Count missing values

In [9]:
print(possum.isna().sum())

sex         0
hdlngth     0
skullw      0
totlngth    0
taill       0
footlgth    1
earconch    0
eye         0
chest       0
belly       0
dtype: int64


### Calculate threshold for dropping observations with missing values

In [10]:
treshold = len(possum) * 0.05
print(treshold)

5.2


### Drop all rows with missing values for columns below treshold

We just have to deal with footlgth with one missing value which is below treshold - drop missing value in this column.

In [11]:
possum.dropna(subset=['footlgth'], inplace=True)
print(possum.isna().sum())

sex         0
hdlngth     0
skullw      0
totlngth    0
taill       0
footlgth    0
earconch    0
eye         0
chest       0
belly       0
dtype: int64


## Get X and y from dataframe

In [12]:
X = possum.iloc[:, 1:-1].values
y = possum.iloc[:, 0].values

In [13]:
print(X)
print(y)

[[ 94.1  60.4  89.   36.   74.5  54.5  15.2  28. ]
 [ 92.5  57.6  91.5  36.5  72.5  51.2  16.   28.5]
 [ 94.   60.   95.5  39.   75.4  51.9  15.5  30. ]
 [ 93.2  57.1  92.   38.   76.1  52.2  15.2  28. ]
 [ 91.5  56.3  85.5  36.   71.   53.2  15.1  28.5]
 [ 93.1  54.8  90.5  35.5  73.2  53.6  14.2  30. ]
 [ 95.3  58.2  89.5  36.   71.5  52.   14.2  30. ]
 [ 94.8  57.6  91.   37.   72.7  53.9  14.5  29. ]
 [ 93.4  56.3  91.5  37.   72.4  52.9  15.5  28. ]
 [ 91.8  58.   89.5  37.5  70.9  53.4  14.4  27.5]
 [ 93.3  57.2  89.5  39.   77.2  51.3  14.9  31. ]
 [ 94.9  55.6  92.   35.5  71.7  51.   15.3  28. ]
 [ 95.1  59.9  89.5  36.   71.   49.8  15.8  27. ]
 [ 95.4  57.6  91.5  36.   74.3  53.7  15.1  28. ]
 [ 92.9  57.6  85.5  34.   69.7  51.8  15.7  28. ]
 [ 91.6  56.   86.   34.5  73.   51.4  14.4  28. ]
 [ 94.7  67.7  89.5  36.5  73.2  53.2  14.7  29. ]
 [ 93.5  55.7  90.   36.   73.7  55.4  15.3  28. ]
 [ 94.4  55.4  90.5  35.   73.4  53.9  15.2  28. ]
 [ 94.8  56.3  89.   38.   73.8

## Encode the dependent variable

In [14]:
le = LabelEncoder()
y = le.fit_transform(y)
print(y)

[1 0 0 0 0 0 1 0 0 0 0 0 1 1 1 1 0 1 0 0 0 1 0 1 1 1 0 1 0 0 1 0 1 1 1 1 0
 1 0 0 1 0 1 1 1 1 1 1 0 0 1 0 1 1 1 0 1 1 0 1 0 0 0 0 0 1 1 1 0 1 1 1 0 1
 1 1 1 1 1 1 0 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 0]


## Split data into training and test set

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [16]:
print(X_train)
print(X_test)
print(y_train)
print(y_test)

[[ 93.6  56.2  84.   36.   62.8  42.9  16.2  25. ]
 [ 93.2  57.1  92.   38.   76.1  52.2  15.2  28. ]
 [ 93.8  58.1  89.   38.   66.2  45.6  16.9  26. ]
 [ 98.5  60.7  93.   41.5  71.7  46.8  15.   26. ]
 [ 92.4  56.8  89.   41.   64.5  46.4  17.8  26. ]
 [ 95.3  58.2  89.5  36.   71.5  52.   14.2  30. ]
 [ 90.1  54.8  89.   37.5  66.   45.5  15.   25. ]
 [ 93.2  68.6  84.   35.   65.6  44.3  14.5  28.5]
 [ 86.7  52.6  84.   38.   62.3  44.8  15.   23.5]
 [ 96.9  56.5  89.5  38.5  63.   45.1  17.1  25.5]
 [ 88.4  54.6  80.5  36.   62.6  43.6  16.3  25. ]
 [ 97.8  59.6  89.   38.   65.5  48.   15.   26. ]
 [ 91.8  57.6  84.   35.5  64.2  45.1  14.4  29. ]
 [ 92.   56.4  88.5  38.   64.1  46.3  15.2  25.5]
 [ 93.8  56.8  87.   34.5  73.2  53.   15.3  27. ]
 [ 94.4  55.4  90.5  35.   73.4  53.9  15.2  28. ]
 [ 90.7  56.3  85.   37.   67.6  46.8  14.5  25.5]
 [ 95.7  59.   86.   38.   63.1  44.9  15.   26.5]
 [ 88.2  53.2  86.5  38.5  60.3  43.7  13.6  26. ]
 [ 93.6  59.9  89.   40.   67.6

## Scale data

In [17]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [18]:
print(X_train)
print(X_test)

[[ 2.46790877e-01 -2.08049250e-01 -6.58917100e-01 -4.86178457e-01
  -1.21345656e+00 -1.26346458e+00  1.09948709e+00 -1.02313638e+00]
 [ 1.34519854e-01  9.96468352e-02  1.22863582e+00  4.68710968e-01
   1.73923359e+00  1.06387380e+00  1.69064579e-01  5.06903436e-01]
 [ 3.02926388e-01  4.41531374e-01  5.20803472e-01  4.68710968e-01
  -4.58633515e-01 -5.87785696e-01  1.75078285e+00 -5.13123110e-01]
 [ 1.62211091e+00  1.33043118e+00  1.46457993e+00  2.13976746e+00
   7.62403765e-01 -2.87483970e-01 -1.70199240e-02 -5.13123110e-01]
 [-9.00221922e-02 -2.91852655e-03  5.20803472e-01  1.90104510e+00
  -8.36045038e-01 -3.87584545e-01  2.58816311e+00 -5.13123110e-01]
 [ 7.23942724e-01  4.75719828e-01  6.38775529e-01 -4.86178457e-01
   7.18002410e-01  1.01382351e+00 -7.61357935e-01  1.52692998e+00]
 [-7.35580574e-01 -6.86687605e-01  5.20803472e-01  2.29988611e-01
  -5.03034871e-01 -6.12810840e-01 -1.70199240e-02 -1.02313638e+00]
 [ 1.34519854e-01  4.03131903e+00 -6.58917100e-01 -9.63623169e-01
  -

   7.18002410e-01  1.51432639e+00 -3.89188929e-01 -3.10983703e-03]]


## Create DecisionTree model on training data

In [19]:
classifier = DecisionTreeClassifier(criterion='entropy')
classifier.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy')

## Predict Test set

In [20]:
y_pred = classifier.predict(X_test)

In [21]:
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[0 0]
 [1 0]
 [0 0]
 [1 0]
 [1 1]
 [0 1]
 [1 0]
 [1 1]
 [0 1]
 [1 0]
 [1 1]
 [1 1]
 [0 1]
 [0 1]
 [0 0]
 [0 1]
 [1 0]
 [0 1]
 [1 1]
 [0 0]
 [0 1]]


## Print Confusion Matrix

In [22]:
print(confusion_matrix(y_test, y_pred))

[[4 5]
 [7 5]]


In [23]:
print(accuracy_score(y_test, y_pred) * 100)

42.857142857142854


## Computing the accuracy with k-Fold Cross Validation

In [24]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 57.36 %
Standard Deviation: 9.28 %


## => bad model