**Essential Imports**

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt


from sklearn.preprocessing import  StandardScaler
from sklearn.model_selection import  train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error

from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import  RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import  GradientBoostingRegressor
from sklearn.linear_model import  Ridge
from sklearn.svm import SVR
from xgboost import XGBRegressor

from sklearn.model_selection import  GridSearchCV

**Reading the dataset**

In [None]:
# Load the abalone measurements from the project-relative CSV file.
dataframe = pd.read_csv(filepath_or_buffer="dataset/abalone.csv")

View First 5 rows

In [None]:
# Preview the first five rows of the freshly loaded data.
dataframe.head(n=5)

Set the age to be the number of rings multiplied by 1.5 (the trained model will predict the age of the abalone)

In [None]:
# Derive the regression target from the ring count, then remove the raw
# 'Rings' column so it is not part of the feature matrix built later.
# NOTE(review): the UCI Abalone description gives age as Rings + 1.5 years;
# this multiplies by 1.5 instead — confirm intent. The outlier thresholds
# further down are expressed on this multiplied scale.
dataframe['age'] = dataframe['Rings'].mul(1.5)
dataframe.drop(columns="Rings", inplace=True)

In [None]:
# Check that 'age' has been added and 'Rings' removed.
dataframe.head(n=5)

Print out the shape of the dataset

In [None]:
# Report the dataset dimensions in a human-readable sentence.
n_rows, n_cols = dataframe.shape
print(f"""There are {n_rows} rows (observations) with {n_cols} columns (features)""")

Get summary statistics and info about the dataset (datatypes and missing values)

In [None]:
# Summary statistics of the dataset's columns.
dataframe.describe()

In [None]:
# Column names, dtypes and non-null counts.
dataframe.info()

Encoding sex to integer values

In [None]:
# List the distinct labels in 'Sex' before they are integer-encoded below.
dataframe['Sex'].unique()

In [None]:
def encode_gender(gender):
    """Map a sex label to its integer code.

    Parameters
    ----------
    gender : str
        One of "F", "I" or "M".

    Returns
    -------
    int
        0 for "F", 1 for "I", 2 for "M".

    Raises
    ------
    KeyError
        If ``gender`` is not one of the three known labels.
    """
    codes = dict(F=0, I=1, M=2)
    return codes[gender]

# Replace the 'Sex' labels with their integer codes, element by element.
dataframe['Sex'] = dataframe['Sex'].apply(encode_gender)

In [None]:
# Check that 'Sex' now holds integer codes.
dataframe.head(n=5)

Calculate skewness of the features

In [None]:
# Skewness of every column ('Sex' has already been integer-encoded above).
dataframe.skew()

Checking missing values

In [None]:
# Count the missing entries in each column.
dataframe.isna().sum()

Visualize data using Swarmplot and Violinplot with respect to sex and age

In [None]:
# Overlay a swarm plot and a violin plot of age per sex on the same axes.
plt.figure(figsize=(20, 7))
sex_vs_age = dict(x='Sex', y='age', data=dataframe)
sns.swarmplot(hue='Sex', **sex_vs_age)
sns.violinplot(**sex_vs_age)

Group data by sex and use boxplot to check for anomalies

In [None]:
# Analysis per sex: mean of every other column within each sex group,
# ordered by mean age. `columns` (all columns except 'Sex') is reused by the
# plotting cells further down.
columns = [col for col in dataframe.columns if col != "Sex"]
per_sex_means = dataframe.groupby(['Sex'])[columns].mean()
per_sex_means.sort_values(by='age')

In [None]:
# One box per column, with rotated tick labels, to eyeball anomalies.
dataframe.boxplot(figsize=(20, 5), rot=90)

Use a pairplot to show the relationships between combinations of columns

In [None]:
# Pairwise scatter plots of every column except 'Sex'.
sns.pairplot(data=dataframe[columns])

Use Heatmap to illustrate correlations between columns in the dataset

In [None]:
# Annotated heatmap of the pairwise correlations between the non-'Sex' columns.
plt.figure(figsize=(20, 7))
correlations = dataframe[columns].corr()
sns.heatmap(correlations, annot=True)

Use scatter plots to manually identify and drop outliers in the dataset

In [None]:
# Show the column names (everything except 'Sex') inspected one by one below.
columns

In [None]:
# Scatter Length against age to spot outliers by eye. `column` is read again
# by the next cell, which drops the outliers.
column = 'Length'
plt.scatter(dataframe[column], dataframe['age'])
plt.grid(True)

In [None]:
# Drop, in place, rows that match any of three hand-picked outlier regions for
# the feature named by `column` (set in the preceding cell). The conditions are
# row-wise, so one combined mask removes the same rows as successive drops.
outlier_mask = (
    ((dataframe[column] < 0.1) & (dataframe['age'] < 5))
    | ((dataframe[column] < 0.8) & (dataframe['age'] > 25))
    | ((dataframe[column] >= 0.8) & (dataframe['age'] < 25))
)
dataframe.drop(index=dataframe[outlier_mask].index, inplace=True)

In [None]:
# Scatter Diameter against age to spot outliers by eye. `column` is read again
# by the next cell, which drops the outliers.
column = 'Diameter'
plt.scatter(dataframe[column], dataframe['age'])
plt.grid(True)

In [None]:
# Drop, in place, rows where the feature named by `column` (set in the
# preceding cell) exceeds 0.6 while age exceeds 25.
outlier_mask = (dataframe[column] > 0.6) & (dataframe['age'] > 25)
dataframe.drop(index=dataframe[outlier_mask].index, inplace=True)


In [None]:
# Scatter Height against age to spot outliers by eye. `column` is read again
# by the next cell, which drops the outliers.
column = 'Height'
plt.scatter(dataframe[column], dataframe['age'])
plt.grid(True)

In [None]:
# Drop, in place, rows that match either hand-picked outlier region for the
# feature named by `column` (set in the preceding cell). The conditions are
# row-wise, so one combined mask removes the same rows as successive drops.
outlier_mask = (
    ((dataframe[column] > 0.4) & (dataframe['age'] < 15))
    | ((dataframe[column] > 0.5) & (dataframe['age'] == 15))
)
dataframe.drop(index=dataframe[outlier_mask].index, inplace=True)


In [None]:
# Preview the first five rows after the outlier removal performed so far.
dataframe.head(n=5)

In [None]:
# Scatter Shucked weight against age to spot outliers by eye. `column` is read
# again by the next cell, which drops the outliers.
column = 'Shucked weight'
plt.scatter(dataframe[column], dataframe['age'])
plt.grid(True)

In [None]:
# Drop, in place, rows where the feature named by `column` (set in the
# preceding cell) exceeds 1.4 while age exceeds 15.
outlier_mask = (dataframe[column] > 1.4) & (dataframe['age'] > 15)
dataframe.drop(index=dataframe[outlier_mask].index, inplace=True)

In [None]:
# Scatter Viscera weight against age to spot outliers by eye. `column` is read
# again by the next cell, which drops the outliers.
column = 'Viscera weight'
plt.scatter(dataframe[column], dataframe['age'])
plt.grid(True)

In [None]:
# Drop, in place, rows where the feature named by `column` (set in the
# preceding cell) exceeds 0.6 while age exceeds 15.
outlier_mask = (dataframe[column] > 0.6) & (dataframe['age'] > 15)
dataframe.drop(index=dataframe[outlier_mask].index, inplace=True)

In [None]:
# Scatter Shell weight against age to spot outliers by eye. `column` is read
# again by the next cell, which drops the outliers.
column = 'Shell weight'
plt.scatter(dataframe[column], dataframe['age'])
plt.grid(True)

In [None]:
# Drop, in place, every row where the feature named by `column` (set in the
# preceding cell) exceeds 0.8, regardless of age.
outlier_mask = dataframe[column] > 0.8
dataframe.drop(index=dataframe[outlier_mask].index, inplace=True)

Separate the dataset columns into the independent columns X (features) and the dependent column y (target)

In [None]:
# Target is the derived 'age' column; every remaining column is a feature.
y = dataframe['age']
X = dataframe.drop(columns='age')

Scale the independent columns to standard form (zero mean, unit variance)

In [None]:
# Standardise every feature column.
# NOTE(review): the scaler is fitted on all rows of X, before the train/test
# split, so X_scaled carries statistics from rows that later become test data.
standardScaler = StandardScaler()
standardScaler.fit(X)
X_scaled = standardScaler.transform(X)

Split the dataset into training and testing datasets 

In [None]:
# Hold out 25% of the rows for testing.
# - random_state pins the split; the notebook's global seed is only set in a
#   later cell, so without it the split differs on every run.
# - The scaler is fitted on the training rows only and then applied to both
#   parts, so the held-out rows contribute nothing to the scaling statistics.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=123
)
standardScaler = StandardScaler().fit(X_train_raw)
X_train = standardScaler.transform(X_train_raw)
X_test = standardScaler.transform(X_test_raw)

Use the cross-validation provided by sklearn to compute the error of a group of models and find the best match

In [None]:
np.random.seed(123)
def rmse_cv(model, X_train, y):
    """Return the cross-validated root-mean-squared error of ``model``.

    Parameters
    ----------
    model : estimator
        Unfitted regressor passed to ``cross_val_score``.
    X_train : array-like
        Feature matrix used for cross-validation.
    y : array-like
        Target values aligned with ``X_train``.

    Returns
    -------
    numpy.ndarray
        One RMSE value per fold (5 folds), in the units of ``y``.
    """
    # The scorer reports *negative* MSE, so flip the sign before taking the
    # square root. The function used to return 100 * MSE, which is not an RMSE.
    fold_mse = -cross_val_score(model, X_train, y, scoring='neg_mean_squared_error', cv=5)
    return np.sqrt(fold_mse)

# Candidate regressors, each constructed with its default hyperparameters.
models = [
    LinearRegression(),
    Ridge(),
    SVR(),
    RandomForestRegressor(),
    GradientBoostingRegressor(),
    KNeighborsRegressor(),
    XGBRegressor(),
]

# Display labels, index-aligned with `models`.
names = [
    'Linear Regressor',
    'Ridge Regressor',
    'Support Vector Machine Regressor',
    'Random Forest Regressor',
    'Gradient Boosting Regressor',
    'K-Nearest Neighbors Regressor',
    'Extreme Gradient Boosting Regressor',
]

# Cross-validate every candidate on the training split and print the mean of
# its per-fold scores, rounded to two decimals.
for name, model in zip(names, models):
    score = rmse_cv(model, X_train, y_train)
    print(f"{name} Model:\n{round(score.mean(), 2)}")

Next, use grid search to try multiple hyperparameter combinations for the model with the lowest error

In [None]:
# Hyperparameter grid: 3 values for each of 4 parameters = 81 combinations,
# each evaluated with 3-fold cross-validation (expensive; runs on all cores).
parameters = {
    'learning_rate': [0.01, 0.001, 0.0001],
    'subsample': [0.75, 0.5, 0.25],
    'n_estimators': [100, 500, 1000],
    'max_depth': [2, 8, 32],
}
GBMGrid = GridSearchCV(
    # subsample < 1 makes the fit stochastic; a fixed random_state keeps the
    # search repeatable even when fits run in parallel workers.
    estimator=GradientBoostingRegressor(random_state=123),
    param_grid=parameters,
    # Select on the same error measure used elsewhere in the notebook rather
    # than the estimator's default score. best_score_ is therefore a
    # *negative* MSE (closer to zero is better).
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
)
GBMGrid.fit(X_train, y_train)
GBMGrid.best_params_, GBMGrid.best_score_

Training the best model with the best params according to Grid Search

In [None]:
# Build the final model from whatever the grid search actually selected,
# instead of hand-copied values that go stale whenever the search is re-run.
# random_state keeps the stochastic (subsampled) fit repeatable.
model = GradientBoostingRegressor(**GBMGrid.best_params_, random_state=123)
model.fit(X_train, y_train)

Calculate the error of the model on the test set using mean squared error

In [None]:
# Evaluate the fitted model on the held-out test rows.
y_pred = model.predict(X_test)
score = mean_squared_error(y_test, y_pred)
# Make the metric the cell's output; previously it was computed but never shown.
score