In [None]:
# working through chapter 04 of Python Machine Learning
# available from PACKT Publishing

# This chapter deals with data preprocessing (essentially munging)


In [6]:
# Build a small CSV with missing values: an empty field in the second data
# row (column C) and a short final row (no value for column D).
import pandas as pd
from io import StringIO

# The u-prefix hands io.StringIO the unicode text it requires on Python 2
# and is a no-op on Python 3, so no Python 2-only unicode() call is needed.
csv_data = u'''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0'''
df = pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1,2,3.0,4.0
1,5,6,,8.0
2,10,11,12.0,


In [7]:
# count the missing (NaN) entries in each column
pd.isnull(df).sum()

A    0
B    0
C    1
D    1
dtype: int64

In [16]:
# test different ways to drop NAs
# (print(...) with a single argument behaves the same on Python 2 and 3,
# unlike the Python 2-only print statement)
print(df.dropna())              # drop every row that contains a NaN
print(df.dropna(axis=1))        # drop every column that contains a NaN
print(df.dropna(how='all'))     # drop only rows in which all values are NaN
print(df.dropna(thresh=4))      # keep rows with at least 4 non-NaN values
print(df.dropna(subset=['C']))  # drop rows whose column C is NaN

   A  B  C  D
0  1  2  3  4
    A   B
0   1   2
1   5   6
2  10  11
    A   B   C   D
0   1   2   3   4
1   5   6 NaN   8
2  10  11  12 NaN
   A  B  C  D
0  1  2  3  4
    A   B   C   D
0   1   2   3   4
2  10  11  12 NaN


In [21]:
# Mean imputation: fill each missing value with the mean of the observed
# values found in the same column.

from sklearn.preprocessing import Imputer
# axis=0 uses the column means; fit() returns the fitted imputer itself
imr = Imputer(missing_values='NaN', strategy='mean', axis=0).fit(df)
imputed_data = imr.transform(df.values)
imputed_data

array([[  1. ,   2. ,   3. ,   4. ],
       [  5. ,   6. ,   7.5,   8. ],
       [ 10. ,  11. ,  12. ,   6. ]])

In [39]:
# Toy data set for the categorical-data examples: color, size, price
# and a class label for three items.
import pandas as pd
df = pd.DataFrame(
    [
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class2'],
        ['blue', 'XL', 15.3, 'class1'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In [40]:
# Encode the ordinal 'size' feature as integers that keep its ordering
# (M < L < XL), using a plain lookup dict.
size_mapping = dict(M=1, L=2, XL=3)
# NOTE: Series.map yields NaN for values that are not keys of the dict, so
# running this cell a second time on the already-encoded column wipes it out.
df['size'] = df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [41]:
# Build a label -> integer lookup for the class labels; np.unique returns
# the distinct labels sorted, and enumerate numbers them from 0.
import numpy as np
class_mapping = dict(
    (label, idx)
    for idx, label in enumerate(np.unique(df['classlabel']))
)
class_mapping

{'class1': 0, 'class2': 1}

In [42]:
# Replace the string class labels with their integer codes from
# class_mapping (in place: the 'classlabel' column of df is overwritten).
df['classlabel'] = df['classlabel'].map(class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,0
1,red,2,13.5,1
2,blue,3,15.3,0


In [50]:
# use one-hot encoding on nominal features
from sklearn.preprocessing import LabelEncoder
X = df[['color', 'size', 'price']].values
color_le = LabelEncoder()
# overwrite the color column of X in place with the encoder's integer codes
X[:, 0] = color_le.fit_transform(X[:, 0])

# stopping there would treat the colors as ordinal, when
# they are really nominal (unordered) categories, so the integer
# codes are expanded into one binary indicator column per color

from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(categorical_features=[0])
# NOTE(review): this is not the last expression of the cell, so the
# resulting array is computed but never displayed
ohe.fit_transform(X).toarray()

# pandas shortcut: get_dummies one-hot encodes the string column (color)
# and passes the numeric columns (price, size) through unchanged
pd.get_dummies(df[['price', 'color', 'size']])

Unnamed: 0,price,size,color_blue,color_green,color_red
0,10.1,1,0,1,0
1,13.5,2,0,0,1
2,15.3,3,1,0,0


In [57]:
# Load the Wine data set from the UC Irvine ML repository. The file is read
# with header=None, so the column names are attached afterwards.
df_wine = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
    header=None,
)
df_wine.columns = [
    'Class label',
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]
# show which class labels occur, then preview the first ten rows
print('Class labels', np.unique(df_wine['Class label']))
df_wine.head(n=10)

('Class labels', array([1, 2, 3], dtype=int64))


Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735
5,1,14.2,1.76,2.45,15.2,112,3.27,3.39,0.34,1.97,6.75,1.05,2.85,1450
6,1,14.39,1.87,2.45,14.6,96,2.5,2.52,0.3,1.98,5.25,1.02,3.58,1290
7,1,14.06,2.15,2.61,17.6,121,2.6,2.51,0.31,1.25,5.05,1.06,3.58,1295
8,1,14.83,1.64,2.17,14.0,97,2.8,2.98,0.29,1.98,5.2,1.08,2.85,1045
9,1,13.86,1.35,2.27,16.0,98,2.98,3.15,0.22,1.85,7.22,1.01,3.55,1045


In [61]:
# Split the wine data into 70% training / 30% test rows.
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18
# and sklearn.cross_validation was removed in 0.20, so prefer the new
# location and fall back to the old one on older installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# column 0 holds the class label; every remaining column is a feature
X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)



In [62]:
# Min-max scaling (normalization): the scaler is fitted on the training
# split only and the same fitted scaler is then applied to both splits
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler().fit(X_train)
X_train_norm = mms.transform(X_train)
X_test_norm = mms.transform(X_test)

In [63]:
# Standardization: the scaler is fitted on the training split only and the
# same fitted scaler is then applied to both splits
from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler().fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

In [71]:
# use L1 regularization to train a logistic regression
# with the just-created (standardized) inputs
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# show the default hyper-parameters of an L1-penalized model
print(LogisticRegression(penalty='l1'))

# solver='liblinear' is spelled out: it is the solver this configuration
# already used, and newer scikit-learn releases default to a solver that
# rejects the L1 penalty
lr = LogisticRegression(penalty='l1', C=0.1, solver='liblinear')
lr.fit(X_train_std, y_train)
# the score is computed on the held-out test split, so it is labelled as
# test accuracy; a single formatted string prints the same on Python 2 and 3
print('Test Accuracy: {0}'.format(lr.score(X_test_std, y_test)))

# just checking: the same figure recomputed with accuracy_score, which
# takes its arguments in (y_true, y_pred) order
y_valid = lr.predict(X_test_std)
print(accuracy_score(y_test, y_valid))


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
('Training Accuracy: ', 0.98148148148148151)


0.98148148148148151

In [79]:
# create an implementation of Sequential Backward Selection (SBS)
# for feature reduction

from sklearn.base import clone
from itertools import combinations
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score

class SBS(object):
    """Sequential Backward Selection (SBS) for feature reduction.

    Starting from the full feature set, ``fit`` repeatedly removes the one
    feature whose removal gives the best score on an internal hold-out
    split, until only ``k_features`` features remain.

    Parameters
    ----------
    estimator : object
        Model exposing ``fit(X, y)`` and ``predict(X)``. A clone is stored,
        so the instance passed in is not fitted by this class.
    k_features : int
        Number of features to keep (must be at least 1).
    scoring : callable, default ``accuracy_score``
        Called as ``scoring(y_true, y_pred)``; higher values are better.
    test_size : float, default 0.25
        Passed to ``train_test_split`` for the internal hold-out split.
    random_state : int, default 1
        Passed to ``train_test_split`` for the internal hold-out split.

    Attributes (set by ``fit``)
    ---------------------------
    indices_ : tuple of int
        Column indices of the finally selected features.
    subsets_ : list of tuple
        Best feature subset found at each step, starting with all features.
    scores_ : list
        Hold-out score of each entry in ``subsets_``.
    k_score_ : score
        Score of the final subset (last entry of ``scores_``).
    """

    def __init__(self, estimator, k_features
                 , scoring=accuracy_score
                 , test_size=0.25
                 , random_state=1
                 ):
        self.scoring = scoring
        self.estimator = clone(estimator)
        self.k_features = k_features
        self.test_size = test_size
        self.random_state = random_state

    def fit(self, X, y):
        """Run the backward selection on ``X`` / ``y`` and return ``self``.

        Raises
        ------
        ValueError
            If ``k_features`` is smaller than 1 (the search would otherwise
            end up scoring an empty feature set).
        """
        if self.k_features < 1:
            raise ValueError(
                'k_features must be at least 1, got %r' % (self.k_features,))

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state)

        dim = X_train.shape[1]
        # Fix: this used to be stored as ``self.indices`` while every later
        # access reads ``self.indices_``, so fit() raised AttributeError.
        self.indices_ = tuple(range(dim))
        self.subsets_ = [self.indices_]
        score = self._calc_score(X_train, y_train,
                                 X_test, y_test, self.indices_)
        self.scores_ = [score]

        while dim > self.k_features:
            scores = []
            subsets = []

            # try every subset that drops exactly one remaining feature
            for p in combinations(self.indices_, r=dim - 1):
                score = self._calc_score(X_train, y_train,
                                         X_test, y_test, p)
                scores.append(score)
                subsets.append(p)

            # keep the best-scoring subset (first one wins on ties)
            best = np.argmax(scores)
            self.indices_ = subsets[best]
            self.subsets_.append(self.indices_)
            dim -= 1

            self.scores_.append(scores[best])
        self.k_score_ = self.scores_[-1]

        return self

    def transform(self, X):
        """Return only the selected feature columns of ``X``."""
        return X[:, self.indices_]

    def _calc_score(self, X_train
                    , y_train, X_test, y_test, indices):
        """Fit the estimator on the given columns and score it on the hold-out.

        The stored estimator is refitted on every call, using only the
        columns listed in ``indices``.
        """
        self.estimator.fit(X_train[:, indices], y_train)
        y_pred = self.estimator.predict(X_test[:, indices])
        score = self.scoring(y_test, y_pred)
        return score
