In [1]:
# What are features?

# Numerical Features: each sample contains a list of numbers corresponding to features
from sklearn.datasets import load_iris
iris=load_iris()
print(iris.data.shape)

(150L, 4L)


In [2]:
# Categorical Features ex: [red, blue, purple]

#Use DictVectorizer to encode categorical features

measurements = [
    {'city': 'Dubai', 'temperature': 33.},
    {'city': 'London', 'temperature': 12.},
    {'city': 'San Francisco', 'temperature': 18.}
]

In [3]:
from sklearn.feature_extraction import DictVectorizer

vec = DictVectorizer()
vec

DictVectorizer(dtype=<type 'numpy.float64'>, separator='=', sort=True,
        sparse=True)

In [4]:
vec.fit_transform(measurements).toarray()

array([[  1.,   0.,   0.,  33.],
       [  0.,   1.,   0.,  12.],
       [  0.,   0.,   1.,  18.]])

In [5]:
vec.get_feature_names()

['city=Dubai', 'city=London', 'city=San Francisco', 'temperature']

In [7]:
# Combining Numerical and Categorical Features

# As an example, we will perform survival prediction for the passengers of the HMS Titanic

import os
import pandas as pd

titanic = pd.read_csv(os.path.join('datasets', 'titanic3.csv'))
print titanic.columns

Index([u'pclass', u'survived', u'name', u'sex', u'age', u'sibsp', u'parch',
       u'ticket', u'fare', u'cabin', u'embarked', u'boat', u'body',
       u'home.dest'],
      dtype='object')


In [8]:
titanic.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [9]:
labels = titanic.survived.values
features = titanic[['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']]

In [11]:
features.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked
0,1.0,female,29.0,0.0,0.0,211.3375,S
1,1.0,male,0.9167,1.0,2.0,151.55,S
2,1.0,female,2.0,1.0,2.0,151.55,S
3,1.0,male,30.0,1.0,2.0,151.55,S
4,1.0,female,25.0,1.0,2.0,151.55,S


In [12]:
pd.get_dummies(features).head()

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
0,1.0,29.0,0.0,0.0,211.3375,1.0,0.0,0.0,0.0,1.0
1,1.0,0.9167,1.0,2.0,151.55,0.0,1.0,0.0,0.0,1.0
2,1.0,2.0,1.0,2.0,151.55,1.0,0.0,0.0,0.0,1.0
3,1.0,30.0,1.0,2.0,151.55,0.0,1.0,0.0,0.0,1.0
4,1.0,25.0,1.0,2.0,151.55,1.0,0.0,0.0,0.0,1.0


In [13]:
features_dummies = pd.get_dummies(features, columns = ['pclass', 'sex', 'embarked'])
features_dummies.head(n=16)

Unnamed: 0,age,sibsp,parch,fare,pclass_1.0,pclass_2.0,pclass_3.0,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
0,29.0,0.0,0.0,211.3375,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.9167,1.0,2.0,151.55,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,2.0,1.0,2.0,151.55,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,30.0,1.0,2.0,151.55,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,25.0,1.0,2.0,151.55,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
5,48.0,0.0,0.0,26.55,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
6,63.0,1.0,0.0,77.9583,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
7,39.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
8,53.0,2.0,0.0,51.4792,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
9,71.0,0.0,0.0,49.5042,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [14]:
data = features_dummies.values

In [15]:
import numpy as np
np.isnan(data).any()

True

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer

train_data, test_data, train_labels, test_labels = train_test_split(data,
                                                                   labels,
                                                                   random_state=0)
imp = Imputer()
imp.fit(train_data)
train_data_finite = imp.transform(train_data)
test_data_finite = imp.transform(test_data)

In [20]:
from sklearn.dummy import DummyClassifier

clf = DummyClassifier('most_frequent')
clf.fit(train_data_finite, train_labels)
#clf.score(test_data_finite, test_labels)

ValueError: Can't handle mix of continuous and binary