# Volume 3: Sklearn Guide
    Matthew Schaelling
    Math 406
    March 1, 2018

In [54]:
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.datasets import load_iris, load_breast_cancer
import sklearn.model_selection as ms 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy import stats
import pandas as pd

## Problem 1

Take your Naive Bayes classifier from your homework and rewrite it as a class that inherits from `BaseEstimator` and `ClassifierMixin`.
Implement `__init__()`, `fit()`, and `predict()` in a way that matches `sklearn` conventions.

Test your model on the iris dataset.

In [10]:
class NaiveBayes(BaseEstimator, ClassifierMixin):
    """A simple naive bayes"""
    
    def __init__(self):
        self.mu = []
        self.sigma = []
    
    def fit(self, X, y):
        '''
        this function accepts training data x and y
        then creates a new function that will classify
        any new data x. Assumes each feature is normally
        distributed.

        (n - number of observations)
        (d - number of features)
        
        Arguments:
        x is an nxd array of normally distributed features
        y is an nx1 array of integer class labels starting at 0
        '''
        
        # iterate through labels and calculate the mean and
        # standard deviations for each
        self.labels = np.unique(y)
        self.label_probs = []
        for label in self.labels:
            x_temp = X[y==label]
            self.mu.append([])
            self.sigma.append([])
            for i in range(len(x_temp[0])):
                self.mu[-1].append(x_temp[:,i].mean())
                self.sigma[-1].append(x_temp[:,i].std())
            self.label_probs.append((y==label).mean())
        self.label_probs = np.array(self.label_probs)
        
        return self
    
    def predict(self, X):
        """Build classifying function and predict classes"""
        # create classifying fuction
        def classifier(Xtest):
            yhat = []
            for x in Xtest:
                probabilities = []
                for label in self.labels:
                    logprob = 0
                    for k in range(len(x)):
                        logprob += np.log(stats.norm.pdf(x[k],self.mu[label][k], 
                                                         self.sigma[label][k]))
                    probabilities.append(np.exp(logprob)*self.label_probs[label])
                yhat.append(np.argmax(probabilities))
            return np.array(yhat)
        
        self.classifier = classifier
        return self.classifier(X)

In [14]:
iris = load_iris()
Xtrain, Xtest, ytrain, ytest = ms.train_test_split(iris.data, iris.target)

naivebayes = NaiveBayes().fit(Xtrain, ytrain)
print("First 6 predictions on test set")
print(naivebayes.predict(Xtest)[:6])
print("First 6 labels on test set")
print(ytest[:6])
print("\nAccuracy: {}".format(naivebayes.score(Xtest, ytest)))

First 6 predictions on test set
[1 0 1 2 0 2]
First 6 labels on test set
[1 0 1 2 0 2]

Accuracy: 0.9736842105263158


## Problem 2

Write a transformer class where the `fit()` and `transform()` methods takes in $X$ as a pandas Data Frame.
For each numerical column, replace any `nan` entries with the mean of the column.
Drop string columns.
Return the data as a NumPy array.

In [35]:
class CleaningTransformer(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        self.x = X._get_numeric_data()
        return self
    
    def transform(self, X):
        for col in self.x:
            X[col].fillna(self.x[col].mean(), inplace=True)
        return X.select_dtypes(exclude=['object']).values

In [39]:
df = pd.DataFrame({'numbers':[1,2,3,np.nan,4,5], 
                   'more_numbers':[np.nan, .1,.9,-.3,.5,-.1], 
                   'fruits':['apple','banana','clementine','date','elderberry','fig']})
print("BEFORE PROCESSING\n")
print(df)
ct = CleaningTransformer()
ct.fit(df)
ct.transform(df)

BEFORE PROCESSING

       fruits  more_numbers  numbers
0       apple           NaN      1.0
1      banana           0.1      2.0
2  clementine           0.9      3.0
3        date          -0.3      NaN
4  elderberry           0.5      4.0
5         fig          -0.1      5.0


array([[ 0.22,  1.  ],
       [ 0.1 ,  2.  ],
       [ 0.9 ,  3.  ],
       [-0.3 ,  3.  ],
       [ 0.5 ,  4.  ],
       [-0.1 ,  5.  ]])

## Problem 3

Use `cross_validate()` to score your class from Problem 1 on the iris dataset.
Do the same for a `LogisticRegressionClassifier`.

In [41]:
ms.cross_validate(naivebayes, iris.data, iris.target, cv=5)



{'fit_time': array([0.00057769, 0.00049686, 0.00052285, 0.00048375, 0.0005157 ]),
 'score_time': array([0.03569698, 0.0329442 , 0.0294857 , 0.0296936 , 0.02948833]),
 'test_score': array([0.93333333, 0.96666667, 0.93333333, 0.93333333, 1.        ]),
 'train_score': array([0.96666667, 0.95833333, 0.95833333, 0.96666667, 0.95833333])}

In [43]:
logit = LogisticRegression()
ms.cross_validate(logit, iris.data, iris.target, cv=5)



{'fit_time': array([0.015239  , 0.00067544, 0.00060678, 0.00061584, 0.0006032 ]),
 'score_time': array([0.00157142, 0.00016832, 0.00016356, 0.0001626 , 0.00016594]),
 'test_score': array([1.        , 0.96666667, 0.93333333, 0.9       , 1.        ]),
 'train_score': array([0.95      , 0.96666667, 0.96666667, 0.975     , 0.95833333])}

## Problem 4

Take the cancer data set (`datasets.load_breast_cancer()`) and do a grid search on an SVM (`sklearn.linear.svm`) with the parameter `C` as .01, .1, or 1, and the parameter `kernel` as `"linear"`, `"poly"`, `"rbf"`, and `"sigmoid"`.

What is the best choice of parameters?
How well does the corresponding model do?

In [52]:
svc = SVC()
clf = ms.GridSearchCV(svc, {'C':[.01,.1,1], 
                            'kernel':['linear','poly','rbf','sigmoid']},
                      n_jobs=6)
clf.fit(iris.data, iris.target)
print("Best Features:\t{}".format(clf.best_params_))
print("Best Score:\t{}".format(clf.best_score_))

Best Features:	{'C': 1, 'kernel': 'linear'}
Best Score:	0.98


It looks like the model does pretty well, 98% accuracy!

## Problem 5

Make a pipeline of your transformer from Problem 2, a normalizing scaler transformer (`preprocessing.StandardScaler`), a PCA transformer (`decomposition.PCA`), and an SVM classifier (`svm.SVC`).
Using the titanic dataset (read in as a pandas DataFrame), do a grid search for the best model, varying your parameters however you see fit.

What is your best choice of parameters?
How well does the corresponding model do?

**Extra credit** to the student with the very best model!
To compete, pick your best parameters, do a cross validation with 10 folds, and take the average of the test scores.

In [56]:
ct = CleaningTransformer()
scaler = StandardScaler()
pca = PCA()
svc = SVC()
pipe = Pipeline(steps=[('clean',ct), 
                       ('scale', scaler),
                       ('pca', pca),
                       ('svc',svc)])

titanic = pd.read_csv('titanic.csv')

In [63]:
estimator = GridSearchCV(pipe,{'pca__n_components':[2,4,6],
                               'svc__C':[.01,.1,1], 
                               'svc__kernel':['linear','poly','rbf','sigmoid']})
estimator.fit(titanic.drop(['Survived'],axis=1), titanic.Survived)
print("Best Features:\t{}".format(estimator.best_params_))
print("Best Score:\t{}".format(estimator.best_score_))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').