In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [2]:
#Mounting Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
filename = '/content/drive/Othercomputers/My Laptop/data-science-assignments/05 - Intro to Machine Learning/abalone.data'

In [4]:
df = pd.read_csv(filename,header=0)
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             4177 non-null   object 
 1   Length          4177 non-null   float64
 2   Diameter        4177 non-null   float64
 3   Height          4177 non-null   float64
 4   Whole_weight    4177 non-null   float64
 5   Shucked_weight  4177 non-null   float64
 6   Viscera_weight  4177 non-null   float64
 7   Shell_weight    4177 non-null   float64
 8   Rings           4177 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB


In [6]:
print(df.isnull().sum())
print('\n')
print(df.isna().sum())

Sex               0
Length            0
Diameter          0
Height            0
Whole_weight      0
Shucked_weight    0
Viscera_weight    0
Shell_weight      0
Rings             0
dtype: int64


Sex               0
Length            0
Diameter          0
Height            0
Whole_weight      0
Shucked_weight    0
Viscera_weight    0
Shell_weight      0
Rings             0
dtype: int64


In [7]:
#Creating the dataframes for X and y
y = df['Rings']
X = df.drop(columns=['Rings'])

In [8]:
#Creating the Train Test Split
#random_state of 42 is overfitting the model
#but the instructions said to use 42, keeping the random_state value at 42
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [9]:
#Making column selectors
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')

num_columns = num_selector(X_train)
cat_columns = cat_selector(X_train)

print('numeric columns are', num_columns)
print('categorical columns are', cat_columns)

numeric columns are ['Length', 'Diameter', 'Height', 'Whole_weight', 'Shucked_weight', 'Viscera_weight', 'Shell_weight']
categorical columns are ['Sex']


In [10]:
#Instantiating transformers
#median_imputer = SimpleImputer(strategy = 'median',copy=False)
#freq_imputer = SimpleImputer(strategy = 'most_frequent',copy=False)
#one_hot_encoder = OneHotEncoder(sparse = False, handle_unknown='error',drop='first')

median_imputer = SimpleImputer(strategy = 'median')

one_hot_encoder = OneHotEncoder(sparse = False, handle_unknown='ignore')

In [11]:
#Instantiate the scaler
scaler = StandardScaler()

In [12]:
#Matching transformers with columns
#impute median of numeric columns
median_tuple = (median_imputer, num_selector)

#impute scaler tuple
scaler_tuple = (scaler, num_selector)

#impute most frequent of categorical columns
#freq_tuple = (freq_imputer, cat_selector)

#one-hot encode categorical columns
ohe_tuple = (one_hot_encoder, cat_selector)

In [13]:
column_transformer = make_column_transformer(median_tuple,scaler_tuple,ohe_tuple)

In [14]:
lin_reg = LinearRegression()

In [15]:
pipe = make_pipeline(column_transformer, lin_reg)

In [16]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('simpleimputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='median',
                                                                verbose=0),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f...
                                                  <sklearn.compose._column_tran

In [17]:
train_score = r2_score(y_train, pipe.predict(X_train))
test_score = r2_score(y_test, pipe.predict(X_test))
print('Train R2: {}'.format(train_score))
print('Test R2: {}'.format(test_score))
if test_score > train_score:
    print('Appears over-fitting has occured')

Train R2: 0.5344809048076
Test R2: 0.545451060625584
Appears over-fitting has occured
