In [175]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
from sklearn.linear_model import Lasso, LinearRegression, Ridge, SGDRegressor
from sklearn.metrics import max_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler, PowerTransformer, RobustScaler

In [138]:
# Read the abalone measurements into a DataFrame
DATA_PATH = "../input/abalone-dataset/abalone.csv"
data = pd.read_csv(DATA_PATH)

## EDA

In [139]:
# Preview the first rows of the raw table
data.head()

In [140]:
# Derive the target: age is the ring count plus 1.5. The new "Age" column is
# appended and the original "Rings" column is removed in the same step.
data = data.assign(Age=data["Rings"] + 1.5).drop(columns="Rings")

In [141]:
# Count the missing (NaN) entries in each column
data.isna().sum()

In [142]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
data.describe()

The height of an abalone cannot be 0, so rows with a value of 0 in "Height" will be treated as missing values.

In [143]:
# Number of rows and columns in the table
data.shape

In [144]:
# Snapshot of the column labels at this point of the notebook; it is taken before
# the one-hot encoding step, so it still contains the raw "Sex" column.
data_col = data.columns

In [145]:
# Report how many distinct values each column holds
for column_name, unique_count in data[data_col].nunique().items():
    print(f"{column_name} has {unique_count} unique values")

### Visualization

In [146]:
# Columns with more than 10 distinct values are treated as continuous features;
# the target "Age" is deliberately kept out of that list.
continuous_col = [
    col for col in data_col
    if col != "Age" and data[col].nunique() > 10
]
# Everything that is not a continuous feature.
# NOTE(review): because "Age" is excluded above, it ends up in this list as well.
discrete_col = [col for col in data_col if col not in continuous_col]

In [147]:
# Draw one distribution plot per continuous feature
for feature in continuous_col:
    sns.displot(data[feature])

In [148]:
# One box plot per continuous feature; points beyond the whiskers are outlier candidates
for feature in continuous_col:
    sns.boxplot(x=data[feature])
    plt.show()

We will have to deal with the outliers in the Feature Engineering section.

In [149]:
# Correlation of every numeric column with the target "Age", weakest first.
# "Sex" is still a string column at this point, so the frame is restricted to
# numeric columns explicitly: DataFrame.corr() raises on non-numeric columns in
# pandas >= 2.0 instead of silently dropping them.
corr_cols = (
    data.select_dtypes(include="number")
    .corr()["Age"]
    .sort_values(ascending=True)
)
# Correlation graph
plt.xticks(rotation=45)
plt.title("Correlation Table")
plt.bar(x=corr_cols.index, height=corr_cols.values)

In [150]:
# Number of samples per "Sex" category
sex_counts = data["Sex"].value_counts()
plt.bar(x=sex_counts.index, height=sex_counts.values)
plt.show()

In [151]:
# Median age for each "Sex" category
median_age_by_sex = data.groupby("Sex")["Age"].median()
x, height = median_age_by_sex.index, median_age_by_sex.values
plt.bar(x=x, height=height)

## Feature Engineering

In [152]:
# A height of 0 is treated as missing: overwrite those entries with the column median
height_median = data["Height"].median()
data["Height"] = data["Height"].mask(data["Height"] == 0, height_median)

In [153]:
# One-hot encode "Sex" (first category dropped, columns prefixed with "sex_") and
# swap the encoded columns in for the original string column.
sex_col = pd.get_dummies(data["Sex"], prefix="sex", prefix_sep="_", drop_first=True)
data = pd.concat([data.drop(columns="Sex"), sex_col], axis=1)

In [154]:
def model_eval(x_train, x_test, y_train, y_test, model):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    x_train, y_train : training features and target passed to ``model.fit``.
    x_test, y_test : held-out features and target used for scoring.
    model : estimator exposing ``fit`` and ``predict``; it is fitted in place.

    Returns
    -------
    tuple
        ``(mse, r2)``: mean squared error and R^2 of the test predictions.
    """
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    scores = (
        mean_squared_error(y_test, predictions),
        r2_score(y_test, predictions),
    )
    return scores

In [155]:
def test_models(x_train, x_test, y_train, y_test, random_state=42):
    """Fit a fixed set of regressors and print each one's test MSE and R^2.

    Parameters
    ----------
    x_train, y_train : training features and target.
    x_test, y_test : held-out features and target used for scoring.
    random_state : seed handed to the stochastic estimators (random forest and
        SGD) so that repeated calls on the same data print the same scores.

    Returns
    -------
    None
        Results are printed, one block per model.
    """
    models = [
        RandomForestRegressor(random_state=random_state),
        LinearRegression(),
        Ridge(),
        SGDRegressor(random_state=random_state),
    ]
    for model in models:
        print("======================")
        mse, r2 = model_eval(x_train, x_test, y_train, y_test, model)
        print(model)
        print("mse:", mse)
        print("r2:", r2)
        print("======================")
    

In [156]:
# Split the table into the feature matrix (everything but "Age") and the target
X, Y = data.drop(columns="Age"), data["Age"]

In [157]:
# Hold out 20% of the rows for testing. The fixed seed keeps the split, and
# therefore every score reported below, identical across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [158]:
# Baseline comparison on the untouched features, currently disabled. The
# feature-selection loop further down calls test_models with k="all", which
# evaluates the same models on the full feature set.
# test_models(x_train, x_test, y_train, y_test)

### Feature Selection


In [159]:
def select_features(x_train, y_train, x_test, n):
    """Keep the ``n`` highest-scoring features according to a univariate F-test.

    The selector is fitted on the training split only and then applied to both
    splits. ``f_regression`` is the score function because the target is a
    numeric quantity; ``f_classif`` would treat every distinct target value as
    a separate class label.

    Parameters
    ----------
    x_train, y_train : training features and target used to score the features.
    x_test : held-out features, reduced to the same selected columns.
    n : int or "all"
        Number of features to keep, forwarded as ``k`` to ``SelectKBest``.

    Returns
    -------
    tuple
        ``(x_train_fs, x_test_fs)``: the reduced training and test features.
    """
    fs = SelectKBest(score_func=f_regression, k=n)
    fs.fit(x_train, y_train)
    x_train_fs = fs.transform(x_train)
    x_test_fs = fs.transform(x_test)
    return x_train_fs, x_test_fs

In [160]:
# Evaluate every model on the k best features, for k = 2..8 and for all features
n_select = [*range(2, 9), "all"]
for n in n_select:
    x_train_fs, x_test_fs = select_features(x_train, y_train, x_test, n)
    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Features: ", n)
    print("+++++++++++++++++")
    test_models(x_train_fs, x_test_fs, y_train, y_test)
    print("+++++++++++++++++")

Reducing the number of features does not have any effect on the models.

## Feature Transformation

### Scaling Input Data

In [161]:

# RobustScaler is chosen because the dataset contains outliers. Its statistics
# are learned from the training split only and then reused on the test split.
scaler = RobustScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [162]:
# Re-run the model comparison on the robust-scaled inputs (target still unscaled)
test_models(x_train_scaled, x_test_scaled, y_train, y_test)

We can see some improvement in the models.

### Scaling Target Data

In [163]:
# Turn the training target into an (n_samples, 1) column, the 2-D layout the
# scaler below is fitted on.
y_train = np.array(y_train).reshape(-1, 1)

In [164]:
# Same (n_samples, 1) column layout for the test target
y_test = np.array(y_test).reshape(-1, 1)

In [165]:
# Min-max scale the target; the range is learned from the training target only
target_scaler = MinMaxScaler()
y_train_scaled = target_scaler.fit_transform(y_train)
y_test_scaled = target_scaler.transform(y_test)

In [166]:
# Flatten the scaled targets from (n_samples, 1) back to 1-D vectors
y_train_scaled = y_train_scaled.reshape(-1)
y_test_scaled = y_test_scaled.reshape(-1)


In [167]:
# Let's test the models again, now with both the inputs and the target scaled
test_models(x_train_scaled, x_test_scaled, y_train_scaled, y_test_scaled)

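There is very little improvement. Note that the MSE is now measured in scaled target units, so compare r2 rather than MSE with the earlier sections.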

### Normalizing the input data distribution

In [168]:
# Wrap the scaled arrays in DataFrames that carry the original feature names
x_train_data_transformed, x_test_data_transformed = (
    pd.DataFrame(scaled, columns=X.columns)
    for scaled in (x_train_scaled, x_test_scaled)
)

In [169]:
# Yeo-Johnson power transform, fitted on the training features and then applied
# unchanged to both splits.
power_transformer = PowerTransformer(method='yeo-johnson').fit(x_train_data_transformed)
x_train_data_normalized, x_test_data_normalized = (
    power_transformer.transform(frame)
    for frame in (x_train_data_transformed, x_test_data_transformed)
)

In [170]:
# Model comparison on the power-transformed inputs with the scaled target
test_models(x_train_data_normalized, x_test_data_normalized, y_train_scaled, y_test_scaled)

### Dimensionality Reduction

In [171]:
# Reshape the scaled training target to a column and display the resulting shape
y_train_scaled = y_train_scaled.reshape(-1, 1)
y_train_scaled.shape

In [172]:
# Back to a flat 1-D target vector for the estimators
y_train_scaled = y_train_scaled.reshape(-1)

In [173]:
# Try every possible number of principal components, from 1 up to the number of
# input features (derived from the data rather than hard-coded).
features = list(range(1, x_train_data_normalized.shape[1] + 1))
for i in features:
    pca = PCA(n_components=i)
    # Fit on the same power-transformed representation that is projected below;
    # fitting on a differently preprocessed matrix would yield components that
    # do not describe the data being transformed.
    pca.fit(x_train_data_normalized)
    x_train_reduced = pca.transform(x_train_data_normalized)
    x_test_reduced = pca.transform(x_test_data_normalized)
    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>", i)
    print("################################################")
    test_models(x_train_reduced, x_test_reduced, y_train_scaled, y_test_scaled)
    print("################################################")
# After the loop, x_train_reduced / x_test_reduced hold the projection from the
# last iteration (all components); the tuning cells below rely on them.

The models tend to perform best when all 9 features are kept, so dimensionality reduction does not help here.

### Hyperparameter Tuning

In [186]:
# Grid of forest sizes and tree depths to search over. The forest is seeded so
# that the configuration picked by the search is repeatable between runs.
search1 = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid={
        "n_estimators": [10, 20, 40, 50, 60],
        "max_depth": [10, 20, 40],
    },
)

In [187]:
# Run the cross-validated grid search on the PCA-projected training data
search1.fit(x_train_reduced, y_train_scaled)

In [188]:
# Show the estimator that the grid search selected
search1.best_estimator_

In [189]:
# Build the final model from the hyperparameters the grid search selected,
# instead of hard-coding values copied from an earlier run, which go stale as
# soon as the search result changes. The seed keeps the fitted forest repeatable.
final_model = RandomForestRegressor(**search1.best_params_, random_state=42)
final_model.fit(x_train_reduced, y_train_scaled)

In [197]:
# Predict the (min-max scaled) target for the PCA-projected test rows
pred = final_model.predict(x_test_reduced)

In [198]:
# Sanity check: shape of the prediction array
pred.shape

In [200]:
# Score the final model against the scaled test target
# (the MSE is therefore expressed in scaled units)
mse, r2 = (
    mean_squared_error(y_test_scaled, pred),
    r2_score(y_test_scaled, pred),
)

In [201]:
# Report the final scores
for label, value in (("MSE:", mse), ("r2:", r2)):
    print(label, value)