# Classification: Prediction of population from possum body measures (exclude case, site, sex and age)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

## Load and explore dataset

In [None]:
# Read the possum data set from the CSV file in the working directory.
possum = pd.read_csv(filepath_or_buffer='possum.csv')

In [None]:
# Preview the first rows of the frame (5 is the default of head()).
print(possum.head(n=5))

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
possum.info()

In [None]:
# Descriptive statistics as produced by DataFrame.describe().
summary_stats = possum.describe()
print(summary_stats)

In [None]:
# Number of observations per distinct value of the 'site' column.
# Series.value_counts() is the documented way to count one column;
# DataFrame.value_counts() expects column label(s) as `subset`, not a Series.
print(possum['site'].value_counts())

In [None]:
# Number of observations per distinct value of the 'Pop' column (the target).
# Series.value_counts() is the documented way to count one column;
# DataFrame.value_counts() expects column label(s) as `subset`, not a Series.
print(possum['Pop'].value_counts())

## Use all features except case, site, sex and age

In [None]:
# Remove the columns that are not to be used for the prediction.
excluded_columns = ['case', 'site', 'sex', 'age']
possum = possum.drop(columns=excluded_columns)

In [None]:
# Display the first rows of the reduced frame (5 is the default of head()).
possum.head(n=5)

## Handle missing data

### Count missing values

In [None]:
# Per-column count of missing (NaN) entries.
missing_per_column = possum.isna().sum()
print(missing_per_column)

### Calculate threshold for dropping observations with missing values

In [None]:
# 5 % of the number of rows in the frame.
# The identifier's spelling is kept as-is so existing references keep working.
row_count = possum.shape[0]
treshold = 0.05 * row_count
print(treshold)

### Drop all rows with missing values for columns below the threshold

Only `footlgth` has missing data: a single missing value, which is below the threshold, so we drop the row with the missing value in this column.

In [None]:
# Keep only the rows that have a 'footlgth' value, then re-check the
# per-column missing counts.
possum = possum.dropna(subset=['footlgth'])
remaining_missing = possum.isna().sum()
print(remaining_missing)

## Get X and y from dataframe

In [None]:
# 'Pop' is the target; every other remaining column is a predictor.
# Selecting by name instead of by position keeps all predictors: the former
# positional slice `1:-1` silently left out the last column of the frame,
# and it also depended on 'Pop' being the first column.
y = possum['Pop'].values
X = possum.drop(columns='Pop').values

In [None]:
# Inspect the feature matrix and the raw target values.
for values in (X, y):
    print(values)

## Encode the dependent variable

In [None]:
# Learn the distinct class labels in y, then replace them by integer codes.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)
print(y)

## Split data into training and test set

In [None]:
# Hold out 20 % of the samples as test set; the fixed seed makes the split
# reproducible.
split = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split

In [None]:
# Inspect the four arrays produced by the split.
for part in (X_train, X_test, y_train, y_test):
    print(part)

## Scale data

In [None]:
# Fit the scaler on the training features only, then apply the same
# transformation to both the training and the test features.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Inspect the scaled feature matrices.
for scaled in (X_train, X_test):
    print(scaled)

## Create K-nearest-neighbors model on training data

In [None]:
# k-nearest-neighbours classifier; n_neighbors=5 is scikit-learn's default,
# spelled out here for readability.
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X=X_train, y=y_train)

## Predict Test set

In [None]:
# Predict the encoded class for every sample of the test set.
y_pred = classifier.predict(X=X_test)

In [None]:
# Show predictions (left column) next to the true labels (right column).
# y_pred is already computed in the preceding cell, so it is not predicted a
# second time here; np.column_stack builds the same two-column array as the
# former reshape + concatenate.
print(np.column_stack((y_pred, y_test)))

## Print Confusion Matrix

In [None]:
# Confusion matrix of the test-set predictions (scikit-learn puts the true
# classes in the rows and the predicted classes in the columns).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

In [None]:
# Test-set accuracy expressed in percent.
accuracy_pct = 100 * accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy_pct)

## Computing the accuracy with k-Fold Cross Validation

In [None]:
# cross_val_score is already imported at the top of the notebook; only the
# pipeline helper is new, imported locally as this cell did before.
from sklearn.pipeline import make_pipeline

# Scale inside each fold: X_train was standardised with statistics of the
# whole training set, so evaluating `classifier` directly would let every
# validation fold leak into the scaling. Re-fitting a StandardScaler on the
# training part of each fold removes that leak (standardising already
# standardised data gives the same result as standardising the raw data).
# cross_val_score clones the estimator, so the fitted `classifier` above is
# left untouched.
cv_model = make_pipeline(StandardScaler(), classifier)
accuracies = cross_val_score(estimator=cv_model, X=X_train, y=y_train, cv=10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

## => The model appears to be perfect. Is this result plausible? It should be double-checked (e.g. for data leakage) before it is trusted.