In [2]:
# Dealing with missing data

In [25]:
import pandas as pd
from io import StringIO

# Small in-memory CSV with two deliberately empty cells (one in column C,
# one in column D) so that pandas parses them as NaN.
csv_data = '''
A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,
'''

# Wrap the string in a file-like buffer so read_csv can consume it.
csv_buffer = StringIO(csv_data)
df = pd.read_csv(csv_buffer)
df

In [26]:
# Count the missing values in each column
df.isna().sum()

In [27]:
# Get the underlying NumPy array from the DataFrame
df.to_numpy()

In [28]:
# Drop every row that contains at least one NaN.
df.dropna(axis='index')

# Alternatives, kept commented out so that only the result above is displayed:
#
# drop columns (instead of rows) that contain NaN
# df.dropna(axis=1)
#
# only drop rows where all columns are NaN
# df.dropna(how='all')
#
# drop rows that have fewer than 4 real values
# df.dropna(thresh=4)
#
# only drop rows where NaN appear in specific columns
# df.dropna(subset=['C'])

In [29]:
# Imputing missing values

from sklearn.impute import SimpleImputer
import numpy as np

# Mean imputation: fit() hands back the imputer itself, so construction and
# fitting are chained; the fitted imputer then fills the NaN cells.
feature_matrix = df.values
imr = SimpleImputer(missing_values=np.nan, strategy='mean').fit(feature_matrix)
imputed_data = imr.transform(feature_matrix)
imputed_data

In [30]:
# pandas-only alternative to SimpleImputer: fill NaNs with the column means
column_means = df.mean()
df.fillna(column_means)

In [31]:
# Handling categorical data

import pandas as pd
# Toy dataset: a nominal feature (color), an ordinal feature (size), a
# numeric feature (price) and a class label.
df = pd.DataFrame(
    [
        ['green', 'M', 10.1, 'class2'],
        ['red', 'L', 13.5, 'class1'],
        ['blue', 'XL', 15.3, 'class2'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df

In [32]:
# Learning algorithms need ordinal features as numbers, so the size strings
# are converted to integers that preserve the order M < L < XL.
size_mapping = {'XL': 3, 'L': 2, 'M': 1}

# NOTE: map() yields NaN for values that are not keys of size_mapping, so
# running this cell a second time on the already-mapped column wipes it out.
ordinal_sizes = df['size'].map(size_mapping)
df['size'] = ordinal_sizes
df

In [33]:
# Invert the mapping (integer -> size label) to recover the original strings
inv_size_mapping = {code: label for label, code in size_mapping.items()}
df['size'].map(inv_size_mapping)

In [34]:
# Encoding class labels

import numpy as np

# Class labels are nominal, so which integer each one gets does not matter;
# every unique label is simply numbered by its position in np.unique's output.
unique_labels = np.unique(df['classlabel'])
class_mapping = {label: idx for idx, label in enumerate(unique_labels)}
class_mapping

In [35]:
# Replace the string class labels with their integer codes
encoded_labels = df['classlabel'].map(class_mapping)
df['classlabel'] = encoded_labels
df

In [36]:
# Undo the encoding: flip class_mapping and map the integers back to strings
inv_class_mapping = dict(zip(class_mapping.values(), class_mapping.keys()))
decoded_labels = df['classlabel'].map(inv_class_mapping)
df['classlabel'] = decoded_labels
df

In [37]:
# Scikit way
from sklearn.preprocessing import LabelEncoder

# LabelEncoder performs the string -> integer encoding of the class labels
# in a single fit_transform call.
class_le = LabelEncoder()
label_values = df['classlabel'].values
y = class_le.fit_transform(label_values)
y

In [38]:
# Map the integer codes in y back to the original string class labels
class_le.inverse_transform(y)

In [41]:
# Caveat: label-encoding a nominal feature such as color produces integers
# that a model could interpret as ordered, which would make the model worse.
X = df[['color', 'size', 'price']].values
color_le = LabelEncoder()

# Overwrite the first column (color) with its integer codes.
color_codes = color_le.fit_transform(X[:, 0])
X[:, 0] = color_codes
X

In [42]:
# Scikit one-hot encoding on nominal features
from sklearn.preprocessing import OneHotEncoder

X = df[['color', 'size', 'price']].values
color_ohe = OneHotEncoder()
# Encode only the first column (color), reshaped into a single-column
# 2-D array, and densify the result for display.
color_column = X[:, 0].reshape(-1, 1)
color_ohe.fit_transform(color_column).toarray()

In [43]:
# To selectively transform columns
from sklearn.compose import ColumnTransformer

X = df[['color', 'size', 'price']].values
# One-hot encode column 0 (color) and pass columns 1 and 2 (size, price)
# through untouched.
transformer_steps = [
    ('onehot', OneHotEncoder(), [0]),
    ('nothing', 'passthrough', [1, 2]),
]
c_transf = ColumnTransformer(transformer_steps)
c_transf.fit_transform(X).astype(float)

In [44]:
# pandas way: get_dummies builds the dummy columns directly from a DataFrame
dummy_input = df[['price', 'color', 'size']]
pd.get_dummies(dummy_input)

In [45]:
# Same encoding, but drop_first=True drops the first dummy column of each
# encoded feature, since it is implied by the remaining ones.
pd.get_dummies(df[['price', 'color', 'size']], drop_first=True)

In [46]:
# Column transformer whose one-hot encoder drops the first category
# (drop='first') to avoid a redundant dummy column.
color_ohe = OneHotEncoder(categories='auto', drop='first')
c_transf = ColumnTransformer(
    [('onehot', color_ohe, [0]), ('nothing', 'passthrough', [1, 2])]
)
c_transf.fit_transform(X).astype(float)

In [47]:
# Encoding ordinal features
# Rebuild the toy dataset so that 'size' holds its original string values.
df = pd.DataFrame(
    [
        ['green', 'M', 10.1, 'class2'],
        ['red', 'L', 13.5, 'class1'],
        ['blue', 'XL', 15.3, 'class2'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df

In [49]:
# Threshold encoding of the ordinal size feature: one 0/1 column per
# "greater than" comparison.
df['x > M'] = df['size'].apply(lambda x: int(x in {'L', 'XL'}))
df['x > L'] = df['size'].apply(lambda x: int(x == 'XL'))

# The raw size column is no longer needed once it has been encoded.
df = df.drop(columns='size')
df

In [51]:
# Wine dataset

# The file is read with header=None, so the column names are assigned
# manually afterwards.
wine_url = (
    'https://archive.ics.uci.edu/'
    'ml/machine-learning-databases/'
    'wine/wine.data'
)
df_wine = pd.read_csv(wine_url, header=None)

wine_columns = [
    'Class label', 'Alcohol', 'Malic acid', 'Ash',
    'Alcalinity of ash', 'Magnesium', 'Total phenols', 'Flavanoids',
    'Nonflavanoid phenols', 'Proanthocyanins', 'Color intensity', 'Hue',
    'OD280/OD315 of diluted wines', 'Proline',
]
df_wine.columns = wine_columns

print('Class labels', np.unique(df_wine['Class label']))
df_wine.head()

In [52]:
# Randomly partition dataset into separate test and training datasets

from sklearn.model_selection import train_test_split

# Column 0 is the class label; every remaining column is a feature.
X = df_wine.iloc[:, 1:].values
y = df_wine.iloc[:, 0].values

# Hold out 30% of the samples for testing, stratified on the class labels,
# with a fixed seed for reproducibility.
partitions = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)
X_train, X_test, y_train, y_test = partitions

In [53]:
# Normalization
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
# Learn the per-feature min/max from the training data only ...
X_train_norm = mms.fit_transform(X_train)
# ... and reuse those same parameters for the test set. Re-fitting on X_test
# would scale the two partitions differently and leak test-set statistics
# into the preprocessing step.
X_test_norm = mms.transform(X_test)

In [54]:
# Side-by-side illustration of standardization and min-max normalization
ex = np.array([0, 1, 2, 3, 4, 5])

# Standardization: center on the mean and divide by the standard deviation.
ex_standardized = (ex - ex.mean()) / ex.std()
# Min-max normalization: rescale to the [0, 1] range.
ex_normalized = (ex - ex.min()) / (ex.max() - ex.min())

print('standardized:', ex_standardized)
print('normalized:', ex_normalized)

In [55]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training data only, then apply that same fitted
# scaler to both partitions.
stdsc = StandardScaler().fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

In [57]:
from sklearn.linear_model import LogisticRegression

# L1-penalized logistic regression fit on the standardized wine features.
# (A duplicate estimator that was constructed and immediately discarded has
# been removed; it had no effect on the results.)
# NOTE(review): newer scikit-learn releases deprecate the `multi_class`
# argument -- confirm against the installed version.
lr = LogisticRegression(
    penalty='l1',
    C=1.0,
    solver='liblinear',
    multi_class='ovr'
)
lr.fit(X_train_std, y_train)

print('Training accuracy:', lr.score(X_train_std, y_train))
print('Test accuracy:', lr.score(X_test_std, y_test))

In [58]:
# Intercept term(s) of the fitted one-vs-rest logistic regression model
lr.intercept_

In [60]:
import matplotlib.pyplot as plt

# Regularization path: refit the L1-penalized model over a range of C values
# and plot how the weight of every feature changes for one class.
fig, ax = plt.subplots()

colors = [
    'blue', 'green', 'red', 'cyan',
    'magenta', 'yellow', 'black',
    'pink', 'lightgreen', 'lightblue',
    'gray', 'indigo', 'orange'
]
weights, params = [], []

# C = 10^-4 ... 10^5; a smaller C means stronger regularization.
for c in np.arange(-4., 6.):
    lr = LogisticRegression(
        penalty='l1',
        C=10. ** c,
        solver='liblinear',
        multi_class='ovr',
        random_state=0
    )
    lr.fit(X_train_std, y_train)
    # Track the coefficient row at index 1 (one row of coef_ per class).
    weights.append(lr.coef_[1])
    params.append(10 ** c)

weights = np.array(weights)

# One line per feature; the column offset of 1 skips the class-label column.
for column, color in zip(range(weights.shape[1]), colors):
    ax.plot(
        params,
        weights[:, column],
        label=df_wine.columns[column + 1],
        color=color
    )

ax.axhline(0, color='black', linestyle='--', linewidth=3)
ax.set_xlim([10 ** (-5), 10 ** 5])
ax.set_ylabel('Weight coefficient')
ax.set_xlabel('C (inverse regularization strength)')
ax.set_xscale('log')
# Single legend, anchored outside the axes on the right. (An earlier
# 'upper left' legend call was removed because this one replaced it.)
ax.legend(
    loc='upper center',
    bbox_to_anchor=(1.38, 1.03),
    ncol=1,
    fancybox=True
)
plt.show()

In [61]:
from sklearn.base import clone
from itertools import combinations
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


class SBS:
    """Sequential Backward Selection of features.

    Starting from the full feature set, repeatedly drops the single feature
    whose removal gives the best score on an internal hold-out split, until
    only ``k_features`` features remain.

    Attributes set by ``fit``:
        indices_: tuple of column indices of the final feature subset.
        subsets_: list of the best subset found at each size, largest first.
        scores_: hold-out score of each entry in ``subsets_``.
        k_score_: score of the final subset.
    """

    def __init__(self, estimator, k_features, scoring=accuracy_score,
                 test_size=0.25, random_state=1):
        """Store the configuration; the estimator is cloned, not reused.

        Args:
            estimator: estimator providing ``fit`` and ``predict``.
            k_features: number of features at which the search stops.
            scoring: callable ``scoring(y_true, y_pred)`` returning a score.
            test_size: passed to ``train_test_split`` for the internal split.
            random_state: seed passed to ``train_test_split``.
        """
        self.scoring = scoring
        self.estimator = clone(estimator)
        self.k_features = k_features
        self.test_size = test_size
        self.random_state = random_state

    def fit(self, X, y):
        """Run the backward search on ``X``/``y`` and return ``self``."""
        X_fit, X_val, y_fit, y_val = train_test_split(
            X, y,
            test_size=self.test_size,
            random_state=self.random_state,
        )

        # Baseline: score the complete feature set first.
        n_remaining = X_fit.shape[1]
        self.indices_ = tuple(range(n_remaining))
        self.subsets_ = [self.indices_]
        self.scores_ = [
            self._calc_score(X_fit, y_fit, X_val, y_val, self.indices_)
        ]

        while n_remaining > self.k_features:
            # Every way of dropping exactly one of the remaining features.
            candidates = list(
                combinations(self.indices_, r=n_remaining - 1)
            )
            candidate_scores = [
                self._calc_score(X_fit, y_fit, X_val, y_val, subset)
                for subset in candidates
            ]

            # Keep the best-scoring candidate (first one wins on ties).
            winner = np.argmax(candidate_scores)
            self.indices_ = candidates[winner]
            self.subsets_.append(self.indices_)
            self.scores_.append(candidate_scores[winner])
            n_remaining -= 1

        self.k_score_ = self.scores_[-1]
        return self

    def transform(self, X):
        """Return the columns of ``X`` selected by the last ``fit``."""
        return X[:, self.indices_]

    def _calc_score(self, X_train, y_train, X_test, y_test, indices):
        """Fit on the given feature columns and score the hold-out set."""
        self.estimator.fit(X_train[:, indices], y_train)
        predictions = self.estimator.predict(X_test[:, indices])
        return self.scoring(y_test, predictions)

In [62]:
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)

# Run the backward selection all the way down to a single feature so that
# the accuracy can be plotted for every subset size.
sbs = SBS(knn, k_features=1)
sbs.fit(X_train_std, y_train)

# Subset size on the x-axis, hold-out accuracy of that subset on the y-axis.
k_feat = list(map(len, sbs.subsets_))
plt.plot(k_feat, sbs.scores_, marker='o')
plt.ylim([0.7, 1.02])
plt.xlabel('Number of features')
plt.ylabel('Accuracy')
plt.grid()
plt.tight_layout()
plt.show()

In [63]:
# Smallest feature subset (k=3) which yielded good performance.
# subsets_[0] is the full 13-feature set and every later entry has one
# feature fewer, so index 10 holds the 3-feature subset.
k3 = list(sbs.subsets_[10])
# columns[1:] skips the class-label column so that positions line up with
# the feature indices stored by SBS.
print(df_wine.columns[1:][k3])

In [64]:
# Baseline: KNN trained on all standardized features
knn.fit(X_train_std, y_train)
train_acc_all = knn.score(X_train_std, y_train)
test_acc_all = knn.score(X_test_std, y_test)
print('Training accuracy:', train_acc_all)
print('Test accuracy:', test_acc_all)

In [65]:
# Same classifier restricted to the three features selected by SBS
X_train_k3 = X_train_std[:, k3]
X_test_k3 = X_test_std[:, k3]

knn.fit(X_train_k3, y_train)
print('Training accuracy:', knn.score(X_train_k3, y_train))
print('Test accuracy:', knn.score(X_test_k3, y_test))

In [66]:
# Assessing feature importance with random forests

from sklearn.ensemble import RandomForestClassifier

# Feature names, skipping the class-label column.
feat_labels = df_wine.columns[1:]

# The forest is fit on the unscaled training features.
forest = RandomForestClassifier(n_estimators=500, random_state=1)
forest.fit(X_train, y_train)

importances = forest.feature_importances_
# Feature indices ordered from most to least important.
indices = np.argsort(importances)[::-1]

n_features = X_train.shape[1]
for rank, feat_idx in enumerate(indices[:n_features], start=1):
    print("%2d) %-*s %f" % (
        rank,
        30,
        feat_labels[feat_idx],
        importances[feat_idx]
    ))

# Bar chart of the importances in the same descending order.
bar_positions = range(n_features)
plt.title('Feature importance')
plt.bar(bar_positions, importances[indices], align='center')
plt.xticks(bar_positions, feat_labels[indices], rotation=90)
plt.xlim([-1, n_features])
plt.tight_layout()
plt.show()

In [67]:
from sklearn.feature_selection import SelectFromModel

# Select features using the already-fitted forest (prefit=True) and an
# importance threshold of 0.1.
sfm = SelectFromModel(forest, threshold=0.1, prefit=True)
X_selected = sfm.transform(X_train)

n_selected = X_selected.shape[1]
print(
    'Number of features that meet this threshold',
    'criterion:',
    n_selected
)

# indices is sorted by descending importance, so its first n_selected
# entries are the features reported here.
for rank, feat_idx in enumerate(indices[:n_selected], start=1):
    print("%2d) %-*s %f" % (
        rank,
        30,
        feat_labels[feat_idx],
        importances[feat_idx]
    ))