In [None]:
import warnings
from operator import itemgetter

import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

# Supervised Learning
## Classification 

### Cancer Dataset

In [None]:
# Load scikit-learn's bundled breast cancer dataset and list the fields it exposes.
cancer = load_breast_cancer()
print("cancer.keys():\n", cancer.keys())



cancer.keys():
 dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])


In [None]:
# Print the shape of the feature matrix, cancer.data.
print("Shape of cancer data:", cancer.data.shape)

Shape of cancer data: (569, 30)


### Question 1
Print the counts for the different values of cancer.target

In [None]:
# Count how often each target value occurs and label every count with the
# class name found at the same position in cancer.target_names.
class_counts = dict(zip(cancer.target_names, np.bincount(cancer.target)))
print("Sample counts per class:\n", class_counts)

Sample counts per class:
 {'malignant': 212, 'benign': 357}


In [None]:
# Print the names of all feature columns in the dataset.
print("Feature names:\n", cancer.feature_names)

Feature names:
 ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


Different approach: Convert the Cancer dataset into a pandas dataframe

In [None]:
# Build one DataFrame holding every feature column plus the label as a final
# 'target' column. Stacking the labels beside the features in a single array
# means the labels end up stored as floats (1.0 / 0.0).
features_and_target = np.column_stack((cancer['data'], cancer['target']))
column_names = [*cancer['feature_names'], 'target']
df = pd.DataFrame(features_and_target, columns=column_names)

In [None]:
# Count the rows per target value in the DataFrame.
df['target'].value_counts()

1.0    357
0.0    212
Name: target, dtype: int64

### Question 2
Create a train/test split using cancer.data and cancer.target.


In [None]:
# Stratify on the labels so both subsets keep the class proportions;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=66)


### Question 3
Create a KNeighborsClassifier for the above train and test data, with weights set to "distance", and 5 neighbors. Print the training and test accuracy. 

In [None]:
# Fit a 5-nearest-neighbour classifier with distance-based vote weighting,
# then report its accuracy on both splits.
knn = KNeighborsClassifier(weights='distance', n_neighbors=5)
knn.fit(X_train, y_train)

knn_train_accuracy = knn.score(X_train, y_train)
knn_test_accuracy = knn.score(X_test, y_test)

print('Decision KNeighborsClassifier, Cancer dataset, weights set to "distance", and 5 neighbors')
print(f'Accuracy on the training set: {knn_train_accuracy:.3f}')
print(f'Accuracy on test set: {knn_test_accuracy:.3f}')

Decision KNeighborsClassifier, Cancer dataset, weights set to "distance", and 5 neighbors
Accuracy on the training set: 1.000
Accuracy on test set: 0.930


### Question 4
Create a loop that builds kNN classifiers with either distance or uniform weighting, with numbers of neighbors varying between 1 and 20. What is the best combination? Produce a list consisting of test accuracy, training accuracy, number of neighbors and weighting choice. The list should be sorted by test accuracy.

In [None]:
# Using lists: collect one (test accuracy, train accuracy, weighting, k) tuple
# for every combination of weighting scheme and neighbour count from 1 to 20.
res = []
for weighting in ('uniform', 'distance'):
    for k in range(1, 21):
        knn_model = KNeighborsClassifier(n_neighbors=k, weights=weighting)
        knn_model.fit(X_train, y_train)
        train_acc = knn_model.score(X_train, y_train)
        test_acc = knn_model.score(X_test, y_test)
        res.append((test_acc, train_acc, weighting, k))

# Best test accuracy first; ties keep the order in which the models were fitted.
for test_acc, train_acc, weighting, k in sorted(res, key=itemgetter(0), reverse=True):
    print("{} neighbors  Train: {}  Test: {}  Weight: {}  ".format(k, train_acc, test_acc, weighting))

In [None]:
# Using dictionaries: both dicts are keyed by "<k> <weighting>", e.g. "6 uniform".
training_accuracy = {}
test_accuracy = {}

# Try n_neighbors from 1 to 20 with both weighting schemes.
neighbors_settings = range(1, 21)

for n_neighbors in neighbors_settings:
    for w in ('distance', 'uniform'):
        key = f"{n_neighbors} {w}"

        # Build and fit the model for this combination.
        clf = KNeighborsClassifier(weights=w, n_neighbors=n_neighbors)
        clf.fit(X_train, y_train)

        # Record the accuracy on both splits under the same key.
        training_accuracy[key] = clf.score(X_train, y_train)
        test_accuracy[key] = clf.score(X_test, y_test)

print("test   train     k  weighting")
# Highest test accuracy first; equal scores stay in insertion order.
ranked_keys = sorted(test_accuracy, key=test_accuracy.get, reverse=True)
for key in ranked_keys:
    print(f"{test_accuracy[key]:.4f}, {training_accuracy[key]:.4f}, {key}")

test   train     k  weighting
0.9371, 0.9460, 6 uniform
0.9301, 1.0000, 5 distance
0.9301, 1.0000, 6 distance
0.9301, 1.0000, 7 distance
0.9301, 0.9437, 7 uniform
0.9301, 0.9413, 8 uniform
0.9231, 1.0000, 3 distance
0.9231, 0.9577, 3 uniform
0.9231, 0.9554, 4 uniform
0.9231, 0.9484, 5 uniform
0.9231, 1.0000, 8 distance
0.9231, 1.0000, 9 distance
0.9231, 1.0000, 13 distance
0.9231, 0.9366, 15 uniform
0.9231, 1.0000, 16 distance
0.9231, 0.9390, 16 uniform
0.9231, 1.0000, 17 distance
0.9231, 0.9413, 17 uniform
0.9231, 1.0000, 18 distance
0.9231, 0.9413, 18 uniform
0.9231, 1.0000, 19 distance
0.9231, 0.9390, 19 uniform
0.9231, 1.0000, 20 distance
0.9231, 0.9390, 20 uniform
0.9161, 1.0000, 4 distance
0.9161, 0.9343, 9 uniform
0.9161, 1.0000, 10 distance
0.9161, 0.9390, 10 uniform
0.9161, 1.0000, 11 distance
0.9161, 0.9343, 11 uniform
0.9161, 1.0000, 12 distance
0.9161, 0.9343, 12 uniform
0.9161, 0.9413, 13 uniform
0.9161, 1.0000, 14 distance
0.9161, 0.9413, 14 uniform
0.9161, 1.0000, 15 dis

## Linear models for classification

### Question 5
Produce a LogisticRegression classifier with default settings for the cancer dataset. Print the training and test accuracy.

In [None]:
# NOTE(review): this filter has no category, so it silences every warning for
# all later cells as well; any solver warnings (e.g. about non-convergence)
# will not be shown. Confirm this is intended.
warnings.filterwarnings('ignore')

# Fit a logistic regression classifier with default settings on the training split.
logireg = LogisticRegression()
logireg.fit(X_train, y_train)

logireg_train_accuracy = logireg.score(X_train, y_train)
logireg_test_accuracy = logireg.score(X_test, y_test)

print('Breast cancer dataset')
print(f'Accuracy of Logistic regression classifier on training set: {logireg_train_accuracy:.2f}')
print(f'Accuracy of Logistic regression classifier on test set: {logireg_test_accuracy:.2f}')

Breast cancer dataset
Accuracy of Logistic regression classifier on training set: 0.96
Accuracy of Logistic regression classifier on test set: 0.94


### Question 6
Print the three features with the highest coefficients, and the three features with the lowest coefficients. On each line, print the feature name, followed by its coefficient value.


In [None]:
# Take the first row of coef_, pair each value with its feature name, and
# order the pairs from the largest coefficient to the smallest.
coefs = logireg.coef_[0]
named_coefs = zip(cancer.feature_names, coefs)
sorted_coefs = sorted(named_coefs, key=itemgetter(1), reverse=True)

# The tail of the descending list holds the lowest coefficients, the head the highest.
print(f"Three features with the lowest coefficients: {sorted_coefs[-3:]}")
print(f"\nThree features with the highest coefficients: {sorted_coefs[:3]}")

Three features with the lowest coefficients: [('worst texture', -0.47653394004370375), ('worst compactness', -0.9714036132324955), ('worst concavity', -1.1744126447444956)]

Three features with the highest coefficients: [('worst radius', 1.7574383180186872), ('mean radius', 1.6486457546589277), ('texture error', 0.5986920401108508)]


### Question 7
Try different values of C for both Logistic Regression and LinearSVC. Give results sorted by test accuracy. Each line of the output should include test accuracy, training accuracy, model (Logistic Regression or LinearSVC), and C value. At which values is there underfitting? At which values is there overfitting? Explain. 

In [None]:
print("Accuracy of classifier on different values of C for both Logistic Regression and LinearSVC:\n")

# One (estimator class, report line) pair per model. The LinearSVC line ends
# with an extra newline so the two results for each C print as one paragraph.
report_lines = (
    (LogisticRegression,
     "Logistic regression classifier accuracy test set: {:.2f}, Training Set: {:.2f}, C = {:.3f}"),
    (LinearSVC,
     "Linear SVC classifier accuracy test set: {:.2f}, Training Set: {:.2f}, C = {:.3f}\n"),
)

for c_value in [0.1, 1, 0.01]:
    for estimator_cls, report_line in report_lines:
        clf = estimator_cls(C=c_value).fit(X_train, y_train)
        print(report_line.format(clf.score(X_test, y_test), clf.score(X_train, y_train), c_value))

Accuracy of classifier on different values of C for both Logistic Regression and LinearSVC:

Logistic regression classifier accuracy test set: 0.92, Training Set: 0.96, C = 0.100
Linear SVC classifier accuracy test set: 0.92, Training Set: 0.93, C = 0.100

Logistic regression classifier accuracy test set: 0.94, Training Set: 0.96, C = 1.000
Linear SVC classifier accuracy test set: 0.93, Training Set: 0.93, C = 1.000

Logistic regression classifier accuracy test set: 0.92, Training Set: 0.94, C = 0.010
Linear SVC classifier accuracy test set: 0.90, Training Set: 0.88, C = 0.010



Using a nested for-loop to iterate over the different models and values for C

In [None]:
# Fit every (model, C) combination and keep one
# [model label, C, train accuracy, test accuracy] row per fit.
estimators = {'log': LogisticRegression, 'SVC': LinearSVC}

scores = []
for model, estimator_cls in estimators.items():
    for c in (0.01, 0.1, 1, 10, 100):
        clf = estimator_cls(C=c).fit(X_train, y_train)
        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append([model, c, train_score, test_score])

In [None]:
# Rank the rows by test accuracy (the value at index 3), best first, and print them.
scores = sorted(scores, key=itemgetter(3), reverse=True)
for score in scores:
    print("Model: {}, C-value: {} Train accuracy: {}, Test accuracy: {}".format(*score))

Model: log, C-value: 1 Train accuracy: 0.960093896713615, Test accuracy: 0.9440559440559441
Model: log, C-value: 10 Train accuracy: 0.9671361502347418, Test accuracy: 0.9370629370629371
Model: log, C-value: 100 Train accuracy: 0.9671361502347418, Test accuracy: 0.9370629370629371
Model: SVC, C-value: 0.1 Train accuracy: 0.931924882629108, Test accuracy: 0.9370629370629371
Model: log, C-value: 0.1 Train accuracy: 0.9553990610328639, Test accuracy: 0.9230769230769231
Model: log, C-value: 0.01 Train accuracy: 0.9389671361502347, Test accuracy: 0.916083916083916
Model: SVC, C-value: 100 Train accuracy: 0.9366197183098591, Test accuracy: 0.916083916083916
Model: SVC, C-value: 0.01 Train accuracy: 0.8708920187793427, Test accuracy: 0.8951048951048951
Model: SVC, C-value: 1 Train accuracy: 0.863849765258216, Test accuracy: 0.8601398601398601
Model: SVC, C-value: 10 Train accuracy: 0.8427230046948356, Test accuracy: 0.8251748251748252


## Regression

In [None]:
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on older releases. Confirm which scikit-learn
# version this notebook is pinned to.
from sklearn.datasets import load_boston
boston = load_boston()
print("Data shape:", boston.data.shape)

Data shape: (506, 13)


In [None]:
# List the fields available on the loaded Boston dataset object.
boston.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])

In [None]:
# Show only the first 1400 characters of the dataset description.
print(boston.DESCR[:1400])

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

### Question 8
Build a LinearRegression model using the Boston dataset. 

In [None]:
# Split the Boston data and fit an ordinary least-squares model on the training part.
# NOTE: this rebinds X_train / X_test / y_train / y_test, which held the cancer
# split until now. Re-run the cancer train/test split cell before re-running
# any of the cancer cells.
X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target, random_state=0)

linreg = LinearRegression().fit(X_train, y_train)
    


### Question 9
Which features have the highest coefficients in the above model? Which features have the lowest coefficients? Does this make intuitive sense, in terms of what you think would most influence house prices? Explain.

In [None]:
# Pair each feature name with its fitted coefficient and order the pairs from
# the largest coefficient to the smallest.
sorted_coefs = sorted(zip(boston.feature_names, linreg.coef_), key=itemgetter(1), reverse=True)

top_name, top_value = sorted_coefs[0]
bottom_name, bottom_value = sorted_coefs[-1]
print(f'Features with the highest coefficient is: {top_name} with a value of: {top_value:.3f}')
print(f'Features with the lowest coefficient is: {bottom_name} with a value of: {bottom_value:.3f}')

Features with the highest coefficient is: RM with a value of: 3.769
Features with the lowest coefficient is: NOX with a value of: -15.589


### Question 10
Build Lasso Regression models for the Boston dataset. Try at least 10 different values for the alpha parameter (you may also have to change max_iter). What is the best model? For this model, give the alpha value, and the training and test accuracy. 

In [None]:
# Fit one Lasso model per alpha and report how many coefficients stay non-zero
# together with R^2 on the training and test splits.
# NOTE(review): the scikit-learn Lasso documentation advises against alpha = 0
# for numerical reasons (LinearRegression covers that case). Confirm the
# alpha = 0 entry is wanted here.
lasso_report = ('Alpha = {:.2f}\nFeatures kept: {}, r-squared training: {:.2f}, '
                'r-squared test: {:.2f}\n')

for alpha in [0, 0.1, 0.5, 1, 2, 4, 6, 5, 10, 20, 30, 50]:
    linlasso = Lasso(alpha=alpha, max_iter=10000)
    linlasso.fit(X_train, y_train)
    r2_train = linlasso.score(X_train, y_train)
    r2_test = linlasso.score(X_test, y_test)
    n_features_kept = np.count_nonzero(linlasso.coef_)

    print(lasso_report.format(alpha, n_features_kept, r2_train, r2_test))


Alpha = 0.00
Features kept: 13, r-squared training: 0.77, r-squared test: 0.64

Alpha = 0.10
Features kept: 12, r-squared training: 0.76, r-squared test: 0.61

Alpha = 0.50
Features kept: 11, r-squared training: 0.75, r-squared test: 0.59

Alpha = 1.00
Features kept: 11, r-squared training: 0.72, r-squared test: 0.55

Alpha = 2.00
Features kept: 9, r-squared training: 0.66, r-squared test: 0.49

Alpha = 4.00
Features kept: 6, r-squared training: 0.61, r-squared test: 0.46

Alpha = 6.00
Features kept: 5, r-squared training: 0.59, r-squared test: 0.44

Alpha = 5.00
Features kept: 5, r-squared training: 0.60, r-squared test: 0.44

Alpha = 10.00
Features kept: 4, r-squared training: 0.56, r-squared test: 0.40

Alpha = 20.00
Features kept: 4, r-squared training: 0.46, r-squared test: 0.30

Alpha = 30.00
Features kept: 4, r-squared training: 0.32, r-squared test: 0.14

Alpha = 50.00
Features kept: 3, r-squared training: 0.28, r-squared test: 0.12

