In [10]:
%matplotlib inline
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn import preprocessing

In [3]:
original = pd.read_csv("train.csv")
training = pd.read_csv("train.csv")

# Data fields - Info pulled from Kaggle

- id - id of the creature
- bone_length - average length of bone in the creature, normalized between 0 and 1
- rotting_flesh - percentage of rotting flesh in the creature
- hair_length - average hair length, normalized between 0 and 1
- has_soul - percentage of soul in the creature
- color - dominant color of the creature: 'white','black','clear','blue','green','blood'
- type - target variable: 'Ghost', 'Goblin', and 'Ghoul'

In [4]:
training.head()

Unnamed: 0,id,bone_length,rotting_flesh,hair_length,has_soul,color,type
0,0,0.354512,0.350839,0.465761,0.781142,clear,Ghoul
1,1,0.57556,0.425868,0.531401,0.439899,green,Goblin
2,2,0.467875,0.35433,0.811616,0.791225,black,Ghoul
3,4,0.776652,0.508723,0.636766,0.884464,black,Ghoul
4,5,0.566117,0.875862,0.418594,0.636438,green,Ghost


## Missing Data Check

In [5]:
training.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 371 entries, 0 to 370
Data columns (total 7 columns):
id               371 non-null int64
bone_length      371 non-null float64
rotting_flesh    371 non-null float64
hair_length      371 non-null float64
has_soul         371 non-null float64
color            371 non-null object
type             371 non-null object
dtypes: float64(4), int64(1), object(2)
memory usage: 20.4+ KB


In [6]:
training.describe()

Unnamed: 0,id,bone_length,rotting_flesh,hair_length,has_soul
count,371.0,371.0,371.0,371.0,371.0
mean,443.67655,0.43416,0.506848,0.529114,0.471392
std,263.222489,0.132833,0.146358,0.169902,0.176129
min,0.0,0.061032,0.095687,0.1346,0.009402
25%,205.5,0.340006,0.414812,0.407428,0.348002
50%,458.0,0.434891,0.501552,0.538642,0.466372
75%,678.5,0.517223,0.603977,0.647244,0.60061
max,897.0,0.817001,0.932466,1.0,0.935721


### The returned information indicates that there are no null values.

#### K-fold; K = 2

In [8]:
features = training[['bone_length','rotting_flesh','hair_length','has_soul','color']].copy()
classifier = training['type'].copy()
print features.columns

 Index([u'bone_length', u'rotting_flesh', u'hair_length', u'has_soul',
       u'color'],
      dtype='object')


### I want to normalize my percentage data first

In [19]:
rotting_flesh_max = features['rotting_flesh'].max()
rotting_flesh_min = features['rotting_flesh'].min()
rotting_flesh_difference = rotting_flesh_max - rotting_flesh_min
features['rotting_flesh_norm'] = features['rotting_flesh'].apply(lambda x: (x - rotting_flesh_min)/(rotting_flesh_difference))

In [21]:
has_soul_max = features['has_soul'].max()
has_soul_min = features['has_soul'].min()
has_soul_difference = has_soul_max - has_soul_min
features['has_soul_norm'] = features['has_soul'].apply(lambda x: (x - has_soul_min)/(has_soul_difference))

In [22]:
features.head()

Unnamed: 0,bone_length,rotting_flesh,hair_length,has_soul,color,rotting_flesh_norm,has_soul_norm
0,0.354512,0.350839,0.465761,0.781142,clear,0.304922,0.833125
1,0.57556,0.425868,0.531401,0.439899,green,0.394586,0.464739
2,0.467875,0.35433,0.811616,0.791225,black,0.309094,0.84401
3,0.776652,0.508723,0.636766,0.884464,black,0.493602,0.944665
4,0.566117,0.875862,0.418594,0.636438,green,0.932355,0.676911


Now that I have normalized rotting_flesh and has_soul, I want to drop the unnormalize columns from the data set. 
I'm uncertain that the percentage data needed to be normalize, so I'm copy for later use. 

In [28]:
features_copy = features.copy()
features.drop(features.columns[[1,3]], axis = 1, inplace= True)
features.head()

Unnamed: 0,bone_length,hair_length,color,rotting_flesh_norm,has_soul_norm
0,0.354512,0.465761,clear,0.304922,0.833125
1,0.57556,0.531401,green,0.394586,0.464739
2,0.467875,0.811616,black,0.309094,0.84401
3,0.776652,0.636766,black,0.493602,0.944665
4,0.566117,0.418594,green,0.932355,0.676911


In [29]:
fTrain, fTest, cTrain, cTest = train_test_split(features, classifier, test_size = .25)

In [40]:
cTrain.name = 'monster'
cTest.name = 'monster'
cTrain.head()

322    Ghost
52     Ghost
30     Ghoul
180    Ghost
282    Ghoul
Name: monster, dtype: object

I need to convert the string labels into int so that knn can preprocess the data. 

In [43]:
le_color = preprocessing.LabelEncoder()
le_type = preprocessing.LabelEncoder()
fTrain.color = le_color.fit_transform(fTrain.color)
cTrain = le_type.fit_transform(cTrain)
fTest.color = le_color.fit_transform(fTest.color)
cTest = le_type.fit_transform(cTest)

In [44]:
knn = KNeighborsClassifier(n_neighbors = 4)
knn.fit(fTrain,cTrain)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=4, p=2,
           weights='uniform')

In [46]:
prediction = knn.predict(fTest)

In [47]:
metrics.accuracy_score(cTest,prediction)

0.68817204301075274