<a href="https://colab.research.google.com/github/miyoko-shimura/Udacity-Introduction-To-Machine-Learning/blob/master/AWS_01_exercise_starter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exercise: Linear Models

In this exercise, we'll explore two types of linear models: one for regression and one for classification. While regression is what you typically think of for a linear model, linear models can also be used effectively in classification problems.

You're tasked with completing the following steps:
1. Load in the wine dataset from scikit-learn.
2. For the wine dataset, create a train and test split, 80% train / 20% test.
3. Create a LogisticRegression model with these hyperparameters: random_state=0, max_iter=10000
4. Evaluate the model with the test dataset
5. Load the diabetes dataset from scikit-learn
6. For the diabetes dataset, create a train and test split, 80% train / 20% test.
7. Create an SGDRegressor model with these hyperparameters: random_state=0, max_iter=10000
8. Evaluate the model with the test dataset

In [None]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDRegressor

## Linear Classifier

In [None]:
# Load the wine dataset through `datasets.load_wine()`. The cells that follow
# read its "data", "feature_names" and "target" entries.
wine = datasets.load_wine()

In [None]:
# Assemble the wine table in one expression: the feature matrix becomes the
# columns named by `feature_names`, and the labels are appended as `target`.
df = pd.DataFrame(data=wine["data"], columns=wine["feature_names"]).assign(
    target=wine["target"]
)

In [None]:
# Check your dataframe by `.head()`
# Leaving the call as the cell's last expression (instead of wrapping it in
# `print`) lets the notebook render the frame as a table.
df.head()

   alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
0    14.23        1.71  2.43               15.6      127.0           2.80   
1    13.20        1.78  2.14               11.2      100.0           2.65   
2    13.16        2.36  2.67               18.6      101.0           2.80   
3    14.37        1.95  2.50               16.8      113.0           3.85   
4    13.24        2.59  2.87               21.0      118.0           2.80   

   flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity   hue  \
0        3.06                  0.28             2.29             5.64  1.04   
1        2.76                  0.26             1.28             4.38  1.05   
2        3.24                  0.30             2.81             5.68  1.03   
3        3.49                  0.24             2.18             7.80  0.86   
4        2.69                  0.39             1.82             4.32  1.04   

   od280/od315_of_diluted_wines  proline  target  
0          

In [None]:
# Split your data with these ratios: train: 0.8 | test: 0.2
# `random_state=0` fixes the shuffle so the split is the same on every run.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=0)

In [None]:
# Train the classifier with the hyperparameters from the requirements
# (random_state=0, max_iter=10000) and score it on the held-out test split.
# Try the default parameters too and compare: is there an improvement?

# Separate the feature columns from the `target` label for each split.
wine_X_train = df_train.drop(columns="target")
wine_y_train = df_train["target"]
wine_X_test = df_test.drop(columns="target")
wine_y_test = df_test["target"]

clf = LogisticRegression(random_state=0, max_iter=10000)
clf.fit(wine_X_train, wine_y_train)
clf.score(wine_X_test, wine_y_test)

0.9722222222222222

## Linear Regression

In [None]:
# Load the diabetes dataset through `datasets.load_diabetes()`. The cells that
# follow read its "data", "feature_names" and "target" entries.
diabetes = datasets.load_diabetes()

In [None]:
# Assemble the diabetes table in one expression: the feature matrix becomes the
# columns named by `feature_names`, and the response is appended as `target`.
dfd = pd.DataFrame(data=diabetes["data"], columns=diabetes["feature_names"]).assign(
    target=diabetes["target"]
)

In [None]:
# Check your dataframe by `.head()`
# Leaving the call as the cell's last expression (instead of wrapping it in
# `print`) lets the notebook render the frame as a table.
dfd.head()

        age       sex       bmi        bp        s1        s2        s3  \
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   
2  0.085299  0.050680  0.044451 -0.005670 -0.045599 -0.034194 -0.032356   
3 -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   
4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   

         s4        s5        s6  target  
0 -0.002592  0.019907 -0.017646   151.0  
1 -0.039493 -0.068332 -0.092204    75.0  
2 -0.002592  0.002861 -0.025930   141.0  
3  0.034309  0.022688 -0.009362   206.0  
4 -0.002592 -0.031988 -0.046641   135.0  


In [None]:
# Split your data with these ratios: train: 0.8 | test: 0.2
# The whole frame (features plus `target`) is split at once; the seed is fixed
# with `random_state=0`.
dfd_train, dfd_test = train_test_split(dfd, test_size=0.2, random_state=0)

In [None]:
# Train the regressor with the hyperparameters from the requirements
# (random_state=0, max_iter=10000) and score it on the held-out test split.
# Try the default parameters too and compare: is there an improvement?

# Separate the feature columns from the `target` response for each split.
diabetes_X_train = dfd_train.drop(columns="target")
diabetes_y_train = dfd_train["target"]
diabetes_X_test = dfd_test.drop(columns="target")
diabetes_y_test = dfd_test["target"]

reg = SGDRegressor(random_state=0, max_iter=10000)
reg.fit(diabetes_X_train, diabetes_y_train)
reg.score(diabetes_X_test, diabetes_y_test)

0.3484895912801911

# Machine Learning Concepts

In [None]:
import numpy as np
import pandas as pd
import sklearn
from sklearn import datasets

In [None]:
# Load the iris dataset through `datasets.load_iris()`. The cells that follow
# read its "data", "feature_names" and `target` entries.
iris = datasets.load_iris()

In [None]:
# Assemble the iris table: feature columns named by `feature_names`, plus the
# species label as `target`.
# NOTE: this rebinds `df`, which held the wine data earlier in the notebook.
df = pd.DataFrame(data=iris["data"], columns=iris["feature_names"]).assign(
    target=iris["target"]
)

In [None]:
# Preview the first rows of the iris dataframe (features plus `target`)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [None]:
# Show the true target values as an array; this is the reference to compare the
# supervised predictions and the unsupervised cluster labels against
df["target"].to_numpy()

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

## Supervised ML

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Create a linear regression model, then fit it to the four iris feature
# columns with the class label as the regression target.
# NOTE: this rebinds `reg`, which held the SGDRegressor earlier in the notebook.
iris_features = df[iris["feature_names"]]
reg = LinearRegression()
reg.fit(iris_features, df["target"])

In [None]:
# Score the linear regression model. The number is slightly deceiving: iris is a
# classification problem, not a regression one, and the model is scored here on
# the same rows it was fitted on.
reg.score(df[iris["feature_names"]], df["target"])

0.9303939218549564

In [None]:
# The regression model outputs floating-point numbers rather than the integer
# class labels stored in `target`
reg.predict(df[iris["feature_names"]])

array([-8.25493616e-02, -4.01284476e-02, -4.86276768e-02,  1.22998627e-02,
       -7.53667248e-02,  5.82910066e-02,  3.83367194e-02, -4.44863248e-02,
        1.98324281e-02, -8.21970989e-02, -1.01272512e-01,  7.59348686e-04,
       -8.98630676e-02, -1.02503649e-01, -2.26652208e-01, -4.10494982e-02,
       -3.31670043e-02, -2.16241562e-02, -3.21980063e-02, -1.07834994e-02,
       -4.35196609e-02,  5.41496547e-02, -1.22062394e-01,  1.76835660e-01,
        6.93528569e-02, -5.59002750e-03,  1.00228589e-01, -7.08754443e-02,
       -8.97319983e-02,  1.99658314e-02,  1.27831946e-02,  3.26017444e-02,
       -1.55848342e-01, -1.55367344e-01, -2.12718935e-02, -1.05063936e-01,
       -1.50176206e-01, -1.25101345e-01, -7.04002332e-03, -5.56769102e-02,
       -3.32980735e-02,  7.07502372e-02, -1.50559206e-02,  2.18071051e-01,
        1.41599717e-01,  3.19873432e-02, -4.88442021e-02, -1.45725887e-02,
       -9.00819270e-02, -6.33428789e-02,  1.20248442e+00,  1.28482413e+00,
        1.32433716e+00,  

## Unsupervised ML

In [None]:
from sklearn.cluster import KMeans

In [None]:
# The number of clusters is known up front (three), so pass it to the model.
# Only the feature columns are used for fitting; `target` is left out.
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(df[iris["feature_names"]])



In [None]:
# Show the cluster label assigned to each row. The cluster numbers need not
# match the numbering used in `target`, so compare groupings rather than values.
kmeans.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2,
       2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2,
       2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0], dtype=int32)

In [None]:
# Print the labels to see what value is in what cluster
# NOTE(review): this cell repeats the previous cell verbatim and shows the same
# output; consider removing it or replacing it with a comparison against
# `df["target"]`.
kmeans.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2,
       2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2,
       2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0], dtype=int32)