In [74]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn import preprocessing
import scipy.stats as stats
from matplotlib import pyplot as plt
from pandas.tools.plotting import scatter_matrix
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [75]:
# Load the raw CSV once; `original` is kept unmodified for reference.
original = pd.read_csv('train.csv')
# Working frame: everything except the `id` column. `drop` returns a new
# frame, so `original` is not touched.
training = original.drop('id', axis=1)

In [76]:
# Show the column labels and the (rows, columns) shape of the working frame.
training.columns, training.shape

(Index([u'bone_length', u'rotting_flesh', u'hair_length', u'has_soul', u'color',
        u'type'],
       dtype='object'), (371, 6))

In [77]:
# Preview the first five rows of the working frame.
training.head()

Unnamed: 0,bone_length,rotting_flesh,hair_length,has_soul,color,type
0,0.354512,0.350839,0.465761,0.781142,clear,Ghoul
1,0.57556,0.425868,0.531401,0.439899,green,Goblin
2,0.467875,0.35433,0.811616,0.791225,black,Ghoul
3,0.776652,0.508723,0.636766,0.884464,black,Ghoul
4,0.566117,0.875862,0.418594,0.636438,green,Ghost


In [78]:
# Per-column dtype and non-null count, plus the frame's memory footprint.
training.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 371 entries, 0 to 370
Data columns (total 6 columns):
bone_length      371 non-null float64
rotting_flesh    371 non-null float64
hair_length      371 non-null float64
has_soul         371 non-null float64
color            371 non-null object
type             371 non-null object
dtypes: float64(4), object(2)
memory usage: 17.5+ KB


# Data fields - Info pulled from Kaggle

- bone_length - average length of bone in the creature, normalized between 0 and 1
- rotting_flesh - percentage of rotting flesh in the creature
- hair_length - average hair length, normalized between 0 and 1
- has_soul - percentage of soul in the creature
- color - dominant color of the creature: 'white','black','clear','blue','green','blood'
- type - target variable: 'Ghost', 'Goblin', and 'Ghoul'

## Type of Variable
    Predictor Variable (features)
       - bone_length
       - rotting_flesh
       - hair_length
       - has_soul
       - color
    Target Variable (target)
       - type

## Data Type
    Numeric
    - bone_length (BL)
    - rotting_flesh (RF)
    - hair_length (HL)
    - has_soul (HS)
    
    Character
    - color (C)
    - type (T)
    
## Variable Category
    Continuous
    - bone_length
    - rotting_flesh
    - hair_length
    - has_soul
    Categorical
    - color
    - type    

In [79]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
training.describe()

Unnamed: 0,bone_length,rotting_flesh,hair_length,has_soul
count,371.0,371.0,371.0,371.0
mean,0.43416,0.506848,0.529114,0.471392
std,0.132833,0.146358,0.169902,0.176129
min,0.061032,0.095687,0.1346,0.009402
25%,0.340006,0.414812,0.407428,0.348002
50%,0.434891,0.501552,0.538642,0.466372
75%,0.517223,0.603977,0.647244,0.60061
max,0.817001,0.932466,1.0,0.935721


In [80]:
# Pairwise correlation between the numeric features.
# `training` still contains the object-typed `color` and `type` columns; they
# are excluded explicitly because newer pandas releases raise on non-numeric
# columns in `corr()` instead of silently skipping them.
training.select_dtypes(include=[np.number]).corr()

Unnamed: 0,bone_length,rotting_flesh,hair_length,has_soul
bone_length,1.0,-0.041716,0.353881,0.381675
rotting_flesh,-0.041716,1.0,-0.220353,-0.132051
hair_length,0.353881,-0.220353,1.0,0.474835
has_soul,0.381675,-0.132051,0.474835,1.0


In [81]:
# Split the working frame into predictors and target. Both are copies, so
# edits made to them later do not propagate back into `training`.
training_features = training.loc[
    :, ['bone_length', 'rotting_flesh', 'hair_length', 'has_soul', 'color']
].copy()
training_type = training.loc[:, 'type'].copy()

In [82]:
# help(StratifiedKFold)

In [83]:
# Two-fold stratified splitter (default settings otherwise).
skf = StratifiedKFold(n_splits=2)
# Display the number of folds the splitter will produce for this data.
skf.get_n_splits(X=training_features, y=training_type)

2

In [84]:
# `skf.split` yields one (train_positions, test_positions) pair per fold.
# Unpacking the generator directly would bind `train_index` to fold 0's pair
# and `test_index` to fold 1's pair, which mislabels the arrays and only runs
# when there are exactly two folds. Transposing with `zip(*...)` gives:
#   train_index[k] -> training row positions of fold k
#   test_index[k]  -> held-out row positions of fold k
train_index, test_index = zip(*skf.split(training_features, training_type))

In [85]:
# Training rows for each model. The splitter returns row *positions*, so they
# are selected with `.iloc`; the `.ix` indexer used previously is deprecated
# and was removed in pandas 1.0. `.copy()` gives every fold its own frame so
# later column edits neither alter nor warn about `training_features`.
features_training1 = training_features.iloc[train_index[0]].copy()
class_training1 = training_type.iloc[train_index[0]].copy()
features_training2 = training_features.iloc[train_index[1]].copy()
class_training2 = training_type.iloc[train_index[1]].copy()

In [86]:
# Held-out rows for each model. The splitter returns row *positions*, so they
# are selected with `.iloc`; the `.ix` indexer used previously is deprecated
# and was removed in pandas 1.0. `.copy()` gives every fold its own frame so
# later column edits neither alter nor warn about `training_features`.
features_test1 = training_features.iloc[test_index[0]].copy()
class_test1 = training_type.iloc[test_index[0]].copy()
features_test2 = training_features.iloc[test_index[1]].copy()
class_test2 = training_type.iloc[test_index[1]].copy()

In [87]:
# The second model uses only bone_length, rotting_flesh and has_soul.
# This cell rebinds its own inputs, so a plain `drop` raises when the cell is
# run a second time (the columns are already gone); `errors='ignore'` makes
# the cell safe to re-run.
features_training2 = features_training2.drop(
    ['color', 'hair_length'], axis=1, errors='ignore')
features_test2 = features_test2.drop(
    ['color', 'hair_length'], axis=1, errors='ignore')
features_training2.columns

Index([u'bone_length', u'rotting_flesh', u'has_soul'], dtype='object')

In [88]:
# Two independent logistic-regression classifiers with default settings,
# one per train/test pairing.
lr1, lr2 = LogisticRegression(), LogisticRegression()

In [89]:
# Encode `color` as integer codes for the first model's train and test frames.
# (`preprocessing` is already imported in the notebook's first cell.)
#
# The encoder is fitted exactly once, on the full `color` column, and is only
# *applied* to each split. Calling `fit_transform` on the test split refits
# the encoder, so the same color can receive a different integer in train and
# test whenever the two splits do not contain the same set of colors.
lu_color = preprocessing.LabelEncoder().fit(training_features['color'])

for fold_frame in (features_training1, features_test1):
    # Always encode from the untouched source strings (looked up by row
    # label) so that re-running this cell does not try to encode integers.
    raw_colors = training_features.loc[fold_frame.index, 'color']
    fold_frame['color'] = lu_color.transform(raw_colors)

In [90]:
# Train each classifier on its own training frame and labels.
lr1.fit(X=features_training1, y=class_training1)
# Last expression of the cell: the fitted second model is what gets displayed.
lr2.fit(X=features_training2, y=class_training2)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [91]:
# Predicted class labels for each model's held-out rows.
prediction1, prediction2 = (
    lr1.predict(features_test1),
    lr2.predict(features_test2),
)

In [93]:
# Fraction of held-out rows the first model (all five features) labels correctly.
metrics.accuracy_score(y_true=class_test1, y_pred=prediction1)

0.71657754010695185

In [94]:
# Fraction of held-out rows the second model (color and hair_length dropped)
# labels correctly.
metrics.accuracy_score(y_true=class_test2, y_pred=prediction2)

0.63586956521739135