In [1]:
# Examples from https://machinelearningmastery.com/metrics-evaluate-machine-learning-algorithms-python/

import pandas as pd
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
names = ['preg', 'plas', 'pres', 'skin', 
         'test', 'mass', 'pedi', 'age', 'class']

### Read in the file __`data/pima-indians-diabetes.data.csv`__ and set the __`names`__ argument to the column names above

In [2]:
data = pd.read_csv('data/pima-indians-diabetes.data.csv')
data.columns = names

### Examine the data

In [3]:
data.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,1,85,66,29,0,26.6,0.351,31,0
1,8,183,64,0,0,23.3,0.672,32,1
2,1,89,66,23,94,28.1,0.167,21,0
3,0,137,40,35,168,43.1,2.288,33,1
4,5,116,74,0,0,25.6,0.201,30,0


### Put all of the rows of data for the first 8 columns into a variable __`X`__ and put all of the labels into __`Y`__ from the 9th column 

In [4]:
X = data.iloc[:, :-1]
y = data.iloc[:, -1]
X.head(10)

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age
0,1,85,66,29,0,26.6,0.351,31
1,8,183,64,0,0,23.3,0.672,32
2,1,89,66,23,94,28.1,0.167,21
3,0,137,40,35,168,43.1,2.288,33
4,5,116,74,0,0,25.6,0.201,30
5,3,78,50,32,88,31.0,0.248,26
6,10,115,0,0,0,35.3,0.134,29
7,2,197,70,45,543,30.5,0.158,53
8,8,125,96,0,0,0.0,0.232,54
9,4,110,92,0,0,37.6,0.191,30


In [5]:
y.head(10)

0    0
1    1
2    0
3    1
4    0
5    1
6    0
7    1
8    1
9    0
Name: class, dtype: int64

### Create a KFold instance with 10 splits and a __`random_state`__of 7

In [6]:
kf = model_selection.KFold(n_splits=10, random_state=7)

### Execute the __`model_selection.cross_val_score()`__ function on a LogisticRegression instance with your __`X`__ and __`Y`__ data, the KFold instance and the *accuracy* scoring method

In [7]:
mod = LogisticRegression()
res = model_selection.cross_val_score(mod, X, y, cv=kf, scoring='accuracy')

In [8]:
res.mean(), res.std()

(0.766678058783322, 0.049985245693915575)

### Re-run the __`model_selection.cross_val_score()`__ method with a *precision* scoring

In [9]:
res = model_selection.cross_val_score(mod, X, y, cv=kf, scoring='precision')

### Print out the precision mean and std for the results

In [10]:
res.mean(), res.std()

(0.7183142433761629, 0.0889066870951792)

### Re-run the __`model_selection.cross_val_score()`__ method with *recall* scoring

In [11]:
res = model_selection.cross_val_score(mod, X, y, cv=kf, scoring='recall')

### Print out the recall mean and std for the results

In [12]:
res.mean(), res.std()

(0.5409077961748946, 0.05795547001375973)

### Execute the __`model_selection.cross_val_score()`__ function on a RandomForest instance with your __`X`__ and __`Y`__ data, the KFold instance and the *precision* scoring method

In [13]:
from sklearn.ensemble import RandomForestClassifier
mod = RandomForestClassifier()
res = model_selection.cross_val_score(mod, X, y, cv=kf, scoring='precision')

  from numpy.core.umath_tests import inner1d


### Print out the recall mean and std for the results

In [14]:
res.mean(), res.std()

(0.6737607501194457, 0.1122177795052304)

### Now we will use a __`confusion_matrix`__

In [15]:
from sklearn.metrics import confusion_matrix

### Create training and test data using __`model_selection.train_test_split()`__ with a test size of 0.33 and a random seed of 7
### Fit your training data to a LogisticRegression model and predict the results of your test values

In [16]:
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, y, test_size=0.33, random_state=7)
mod = LogisticRegression()
mod.fit(X_train, Y_train)
predicted = mod.predict(X_test)

### Compare the actual test data results with the predicted results by invoking a confusion matrix

In [17]:
mat = confusion_matrix(Y_test, predicted)
mat

array([[149,  20],
       [ 39,  46]])

### Now we are going to use a classification_report

In [18]:
from sklearn.metrics import classification_report

### Generate a classification report with the actual test data results and the predicted results by creating a __`classification_report`__

In [19]:
rep = classification_report(Y_test, predicted)
print(rep)

             precision    recall  f1-score   support

          0       0.79      0.88      0.83       169
          1       0.70      0.54      0.61        85

avg / total       0.76      0.77      0.76       254



### Regression models have different scoring options. Read the Boston housing data back in.

In [20]:
df = pd.read_csv('data/Boston.csv')

### Put the values for the DataFrame into an array (or use .iloc indexing on the DataFrame). Create your feature matrix from the first 12 columns. Create your target matrix from the 13th column.

In [21]:
X = df.iloc[:, :12]
y = df.iloc[:, 12]

### Create a LinearRegression model and generate a __`cross_val_score`__ with a scoring type of *neg_mean_absolute_error*

In [22]:
from sklearn.linear_model import LinearRegression

mod = LinearRegression()
res = model_selection.cross_val_score(mod, X, y, cv=kf, scoring='neg_mean_absolute_error')

### Print out the MAE mean and std for the results

In [23]:
res.mean(), res.std()

(-4.019835086410423, 1.8610307550593788)

### Create a LinearRegression model and generate a __`cross_val_score`__ with a scoring type of *neg_mean_squared_error*

In [24]:
mod = LinearRegression()
res = model_selection.cross_val_score(mod, X, y, cv=kf, scoring='neg_mean_squared_error')

### Print out the MSE mean and std for the results

In [25]:
res.mean(), res.std()

(-34.00650832111653, 40.507893991184936)