In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.linear_model import Lasso
import warnings
from operator import itemgetter

# Supervised Learning
## Classification 

### Cancer Dataset

In [2]:
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer(as_frame=True)
print("cancer.keys():\n", cancer.keys())

cancer.keys():
 dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])


In [3]:
print("Shape of cancer data:", cancer.data.shape)

Shape of cancer data: (569, 30)


In [4]:
print("Feature names:\n", cancer.feature_names)

Feature names:
 ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


### Question 1
Print the counts for the different values of cancer.target

In [5]:
import pandas as pd
cancer['target'].value_counts()

1    357
0    212
Name: target, dtype: int64

### Question 2
Create a test train split using cancer.data and cancer.target


In [6]:
X_train, X_test, y_train, y_test = train_test_split(cancer['data'], cancer['target'], stratify=cancer.target)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(426, 30) (143, 30) (426,) (143,)


### Question 3
Create a KNeighborsClassifier for the above train and test data, with weights set to "distance", and 5 neighbors. Print the training and test accuracy. 

In [7]:
knn = KNeighborsClassifier(n_neighbors = 5, weights = 'distance')
knn.fit(X_train, y_train)
print('Training acc.: {} Test acc.: {}'.format((knn.score(X_train, y_train)), (knn.score(X_test, y_test))))

Training acc.: 1.0 Test acc.: 0.9230769230769231


### Question 4
Create a loop that builds kNN classifiers with either distance or uniform weighting, with numbers of neighbors varying between 1 and 20. What is the best combination? Produce a list consisting of test accuracy, training accuracy, number of neighbors and weighting choice. The list should be sorted by test accuracy.

In [8]:
neighbours = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20] 
distances = ['uniform', 'distance']
accuracy_output = []
for neighbour in neighbours: 
    for distance in distances: 
        knn = KNeighborsClassifier(n_neighbors=neighbour) 
        knn.fit(X_train, y_train)
        training = knn.score(X_train, y_train)
        tests = knn.score(X_test, y_test)
        accuracy_output.append((neighbour, distance, tests, training))
for key in sorted (accuracy_output, reverse=True, key=lambda e:e[2]):
    print(f" # of neighbours {key[0]}, weight used {key[1]}, test accuracy: {key[2]}, train accuracy: {key[3]} \n")
    #print(test_results[key], train_results[key])

 # of neighbours 3, weight used uniform, test accuracy: 0.9440559440559441, train accuracy: 0.9553990610328639 

 # of neighbours 3, weight used distance, test accuracy: 0.9440559440559441, train accuracy: 0.9553990610328639 

 # of neighbours 5, weight used uniform, test accuracy: 0.9230769230769231, train accuracy: 0.9553990610328639 

 # of neighbours 5, weight used distance, test accuracy: 0.9230769230769231, train accuracy: 0.9553990610328639 

 # of neighbours 7, weight used uniform, test accuracy: 0.9230769230769231, train accuracy: 0.9436619718309859 

 # of neighbours 7, weight used distance, test accuracy: 0.9230769230769231, train accuracy: 0.9436619718309859 

 # of neighbours 8, weight used uniform, test accuracy: 0.9230769230769231, train accuracy: 0.9460093896713615 

 # of neighbours 8, weight used distance, test accuracy: 0.9230769230769231, train accuracy: 0.9460093896713615 

 # of neighbours 9, weight used uniform, test accuracy: 0.9230769230769231, train accuracy: 

## Linear models for classification

### Question 5
Produce a LogisticRegression classifier with default settings for the cancer dataset. Print the training and test accuracy.

In [9]:
#getting an error, I don't know what the issue is
logreg = LogisticRegression().fit(X_train, y_train)
print("Training set score: {:.3f}".format(logreg.score(X_train, y_train)))
print("Test set score: {:.3f}".format(logreg.score(X_test, y_test)))

AttributeError: 'str' object has no attribute 'decode'

### Question 6
Print the three features with the highest coefficients, and the three features with the lowest coefficients. On each line, print the feature name, followed by its coefficient value.


### Question 7
Try different values of C for both Logistic Regression and LinearSVC. Give results sorted by test accuracy. Each line of the output should include test accuracy, training accuracy, model (Logistic Regression or LinearSVC), and C value. At which values is there underfitting? At which values is there overfitting? Explain. 

###  Regression

In [10]:
from sklearn.datasets import load_boston
boston = load_boston()
print("Data shape:", boston.data.shape)

Data shape: (506, 13)


In [11]:
boston.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])

In [12]:
print(boston.DESCR[:1400])

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

### Question 8
Build a LinearRegression model using the Boston dataset. 

In [13]:
from sklearn.linear_model import LinearRegression

X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target)
linreg = LinearRegression().fit(X_train, y_train)
print("Training set score: {:.3f}".format(linreg.score(X_train, y_train)))
print("Test set score: {:.3f}".format(linreg.score(X_test, y_test)))


Training set score: 0.712
Test set score: 0.827


### Question 9
Which features have the highest coefficients in the above model? Which features have the lowest coefficients? Does this make intuitive sense, in terms of what you think would most influence house prices? Explain.

### Question 10
Build Lasso Regression models for the Boston dataset. Try at least 10 different values for the alpha parameter (you may also have to change max_iter). What is the best model? For this model, give the alpha value, and the training and test accuracy. 

In [14]:
lasso = Lasso().fit(X_train, y_train)
print("Training set score: {:.2f}".format(lasso.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lasso.score(X_test, y_test)))

Training set score: 0.65
Test set score: 0.77
