In [4]:
# Make Google Drive available to this Colab runtime under /content/drive,
# which is where the data file is read from later in the notebook.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Task

Your task is to build the best model possible using [this dataset](https://archive.ics.uci.edu/ml/datasets/abalone). Your goal is to predict the Sex.

In [3]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report

In [5]:
# Read the raw abalone file from the mounted Drive folder.
# header=None: no row of the file is treated as column labels, so the
# columns come back numbered and are named in the next cell.
df = pd.read_csv(
    '/content/drive/MyDrive/Data/abalone.data',
    header=None,
)

In [6]:
#Rename columns with a dictionary
# The file was read with header=None, so the columns are labelled 0-8.
# The names follow the attribute list published with the UCI Abalone dataset;
# column 5 is the shucked (meat-only) weight.
df = df.rename(columns={0: 'Sex',
                        1: 'Length',
                        2: 'Diameter',
                        3: 'Height',
                        4: 'Whole Weight',
                        5: 'Shucked Weight',
                        6: 'Viscera Weight',
                        7: 'Shell Weight',
                        8: 'Rings'})

In [7]:
# check for duplicates
# duplicated() flags every row that repeats an earlier row; summing the
# boolean flags gives the number of duplicate rows in the frame.
df.duplicated().sum()

0

In [8]:
# Check the info of the data.
# info() prints the index range, each column's non-null count and dtype,
# and the frame's memory usage - a quick way to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Sex              4177 non-null   object 
 1   Length           4177 non-null   float64
 2   Diameter         4177 non-null   float64
 3   Height           4177 non-null   float64
 4   Whole Weight     4177 non-null   float64
 5   Shgucked Weight  4177 non-null   float64
 6   Viscera Weight   4177 non-null   float64
 7   Shell Weight     4177 non-null   float64
 8   Rings            4177 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB


In [9]:
# Check the class balance
# value_counts() tallies how many rows fall into each category of Sex,
# the column this notebook is trying to predict.
df['Sex'].value_counts()

M    1528
I    1342
F    1307
Name: Sex, dtype: int64

In [10]:
# Separate the target (Sex) from the features, then hold out a test set.
# Passing stratify=y keeps the class proportions the same in the train and
# test sets; random_state=42 makes the split repeatable.
y = df['Sex']
X = df.drop(columns='Sex')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [11]:
# instantiate scaler and one hot encoder
scaler = StandardScaler()

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the current spelling first and fall back to
# the legacy one on older installs; either way the encoder returns a dense array.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [15]:
# Column selectors: each one picks feature columns by dtype.
# Sex is the only object column and it is the target (dropped from X), so
# cat_sel is expected to match no feature columns in this dataset.
cat_sel = make_column_selector(dtype_include='object')
num_sel = make_column_selector(dtype_include='number')

In [16]:
# create tuples and put them in a column transformer
# Each tuple pairs a transformer with the selector for the columns it handles.
num_tup = (scaler, num_sel)
cat_tup = (encoder, cat_sel)

# Combine both tuples into one preprocessing step for the pipeline built in
# the following cells. Columns matched by neither selector are dropped
# (make_column_transformer's default remainder='drop').
preprocessor = make_column_transformer(num_tup, cat_tup)

In [None]:
# create and fit a KNN model
# use a pipeline with the column transformer and the model


In [None]:
# get the parameters out of the pipeline


In [None]:
# If we wanted to stop here, we could now get the accuracy


# GridSearchCV
Since we have only tried the default model so far, we should tune its hyperparameters to optimize our results.
To understand your options, use the documentation; the link is provided below.
It is beyond the scope of this course to discuss all of the hyperparameters, but you can try them out and see how each one affects the model's performance.

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [None]:
# Using the documentation as your guide, define a dictionary of the parameters
# you want to tune and the values you want to try out


In [None]:
# Instantiate a grid search with the model you want to use and your hyperparameter dictionary
# Depending on how many parameters you are testing and how many options you are trying, this may take a while!


In [None]:
# Check the optimal combinations of hyperparameters


In [None]:
# extract the best version of the model from the gridsearch object


In [None]:
# Run a classification report on both train and test datasets for the best model. 
# How is your model doing?  Did the performance improve? How is the overfitting?


In [None]:
# print the accuracy score for both the training and testing sets.


In [None]:
# Your turn! Try creating a pipeline with gridsearchCV using two other classification models! 
# Remember to explore the hyperparameter options in the documentation for the model