# Crab Age Study

This notebook covers the EDA, the PCA and the model training (an ANN-MLP regressor, followed by an XGBoost regressor for the final submission) used to predict the age of a crab.

## Initial Data Extraction

In [None]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
#for PCA and ANN model
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf

#to extract the data from kaggle
import os

#Kaggle mounts the input data at the absolute path /kaggle/input (the same root
#the read_csv calls use); a relative 'kaggle/input' would not be found and the
#walk would silently list nothing.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
#Load the competition train and test splits into dataframes
crab_train, crab_test = (
    pd.read_csv(csv_path)
    for csv_path in ('/kaggle/input/playground-series-s3e16/train.csv',
                     '/kaggle/input/playground-series-s3e16/test.csv')
)

In [None]:
#Preview the first 10 rows of the training set
crab_train.head(10)

In [None]:
#Preview the first 10 rows of the test set
crab_test.head(10)

In [None]:
#Summary statistics (count, mean, std, min, quartiles, max) of the training set's numeric columns
crab_train.describe()

There are 74051 samples (rows) in the training set. There appear to be crabs with a recorded height of 0 in the dataset.

In [None]:
#Summary statistics (count, mean, std, min, quartiles, max) of the test set's numeric columns
crab_test.describe()

There are 49368 samples (rows) in the test set, but we will be focusing on the training set. Similarly, there are crabs with a recorded height of 0 in the test set.

In [None]:
#Looking at sex of the crabs, there needs to be some form of encoding done here.
#We will be using the labelencoder from sklearn
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

#Learn the category -> integer mapping from the training set only
crab_train['Sex'] = le.fit_transform(crab_train.Sex.values)

#The test set must reuse that same mapping: re-fitting the encoder on the test
#set could assign different integers to the same category if the two sets do not
#contain exactly the same labels. transform() raises on a label never seen in
#training instead of silently mis-encoding it.
crab_test['Sex'] = le.transform(crab_test.Sex.values)

In [None]:
#Summary statistics of the training set again, now that Sex has been label-encoded to integers
crab_train.describe()

In [None]:
#Summary statistics of the test set again, now that Sex has been label-encoded to integers
crab_test.describe()

# EDA of the crab dataset

In [None]:
#Starting EDA

#Predictor-only view of the training set (neither id nor Age is kept)
training_crab = crab_train.reindex(
    columns = ['Sex', 'Length', 'Diameter', 'Height', 'Weight', 'Shucked Weight', 'Viscera Weight', 'Shell Weight']
)

#Boxplot of the encoded sex and the three size measurements
df_crab = crab_train.reindex(columns = ['Sex', 'Length', 'Diameter', 'Height'])
df_crab.boxplot()
plt.show()

In [None]:
#Boxplot of the four weight measurements
df_crab2 = crab_train.reindex(columns = ['Weight','Shucked Weight', 'Viscera Weight', 'Shell Weight'])
df_crab2.boxplot()
plt.show()

There are quite a number of outliers, but we decided not to remove them: there are so many that dropping them might affect the accuracy of the model.

In [None]:
#We will plot out the scatterplots for each dataset comparing to age.
#One joint plot (Age on the x axis) per predictor; looping keeps the eight plots
#consistent and avoids the stray object repr a bare final call would print.
for feature in ['Sex', 'Length', 'Diameter', 'Height',
                'Weight', 'Shucked Weight', 'Viscera Weight', 'Shell Weight']:
    sb.jointplot(data = crab_train, x = 'Age', y = feature)

In [None]:
#Pair grid over every column of crab_train: histograms on the diagonal,
#scatter plots in every off-diagonal panel
joints = sb.PairGrid(crab_train).map_diag(sb.histplot)
joints.map_offdiag(sb.scatterplot)

The relationship between each variable and age is not very clear from these plots. Hence, we will look at the correlation of each variable with age.

In [None]:
#Correlation of data: pairwise Pearson correlation between all columns of crab_train
#(the id column is still part of crab_train at this point, so it appears in the matrix too)
crab_train.corr(method = 'pearson')

We can see that the age of the crab has a high correlation with the Shell Weight, Viscera Weight, Height, Diameter and Length of the crab (each correlation is above 0.6). However, let's run a PCA to derive a better set of inputs for the ANN-MLP model.

# PCA 

In [None]:
#Delete the ID number and the target (Age) from crab_train.
#The PCA must be computed on the predictors only: if Age stayed in, the principal
#components used as model inputs would already contain the value being predicted
#(target leakage), and they could not be reproduced for data where Age is unknown.
crab_train_pca = crab_train.drop(['id', 'Age'], axis=1)
crab_train_pca.head(10)

In [None]:
#Standardization of crab_train: learn each column's mean/std, then rescale the columns
scaler = StandardScaler().fit(crab_train_pca)
crab_train_scaled = scaler.transform(crab_train_pca)
print(crab_train_scaled)

In [None]:
#Fit a PCA (no limit on the number of components) on the standardized training data
pca = PCA().fit(crab_train_scaled)

#Principal component scores of the training rows
values = pca.transform(crab_train_scaled)
print(values)

In [None]:
#Wrap the component scores in a dataframe, one named column per component

n_train_components = len(crab_train_pca.columns)
df_pca = pd.DataFrame(
    values,
    columns = ["Principle component{}".format(i) for i in range(1, n_train_components + 1)],
)
df_pca.head(10)

In [None]:
#Calculate contribution: the fraction of the total variance explained by each principal component
ev_ratio = pca.explained_variance_ratio_
print(ev_ratio)

In [None]:
#Tabulate the contribution of each component, indexed by component name
train_component_labels = ["Principle component{}".format(i) for i in range(1, len(crab_train_pca.columns) + 1)]
df_evr = pd.DataFrame({"Contribution": ev_ratio}, index = train_component_labels)
print(df_evr)

In [None]:
#Cumulative contribution, with a leading 0 standing for "no components kept"
cc_ratio = np.concatenate(([0], np.cumsum(ev_ratio)))
print(cc_ratio)

In [None]:
#Making a graph for cumulative contribution
plt.plot(cc_ratio, "-o")
plt.xlabel("Number of principal components")
plt.ylabel("Cumulative contribution")
plt.grid()
#cc_ratio starts with a prepended 0, so position k holds the contribution of the
#first k components; each tick is therefore labelled with its own position
#(labelling position k as k+1 shifted every label by one component).
plt.xticks(range(len(cc_ratio)))
plt.show()

In [None]:
#100% ÷ 9 columns ≈ 11.1% (a ratio of about 0.111) is the average contribution per component.
#Principal components whose contribution is below this average cannot be said to concentrate information, so they are not important.

# ANN-MLP Model

In [None]:
#for Multiple Layer Perceptron
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

In [None]:
#Multiple Layer Perceptron
random = 0

#Dividing data: the first two principal components are the inputs, Age is the target;
#30% of the rows are held out for validation
pca_inputs = df_pca[['Principle component1', 'Principle component2']]
age_target = crab_train['Age']
x_train, x_test, y_train, y_test = train_test_split(
    pca_inputs, age_target, test_size = 0.3, random_state = random
)

print("x_train")
print(x_train.head())

In [None]:
#First rows of the hold-out inputs
print("x_test", x_test.head(), sep = "\n")

In [None]:
#First rows of the training targets
print("y_train", y_train.head(), sep = "\n")

In [None]:
#First rows of the hold-out targets
print("y_test", y_test.head(), sep = "\n")

In [None]:
#Fit an MLP regressor on the training split and predict the hold-out split so the
#training model can be validated with the mean squared error.
#random_state reuses the seed already used for the data split: without it the
#weight initialisation and batch shuffling differ on every run, so the reported
#error would not be reproducible.
clf = MLPRegressor(random_state = random)
clf.fit(x_train, y_train)

predicted = clf.predict(x_test)

print(predicted)

In [None]:
#Mean squared error between the hold-out ages and the model's predictions
mse = mean_squared_error(y_true = y_test, y_pred = predicted)
print(mse)

## Training against test model

In [None]:
#same steps as before but for test: remove the id column so only features remain
crab_test_pca = crab_test.drop(columns = ['id'])
crab_test_pca.head(10)

In [None]:
#Standardization of crab_test
#The mean/std must come from the training data (same feature columns as the test
#set), not from the test set itself: re-fitting on the test rows would scale them
#with different statistics than the rows a model is trained on.
scaler = StandardScaler()
scaler.fit(crab_train[crab_test_pca.columns])
crab_test_scaled = scaler.transform(crab_test_pca)
print(crab_test_scaled)

In [None]:
#Making a model
#The projection is learned from the standardized training features (the same
#columns the test set has). A PCA fitted on the test set defines its own axes -
#component directions and signs depend on the data it was fitted on - so its
#scores would not be comparable with scores derived from the training data.
train_features_scaled = StandardScaler().fit_transform(crab_train[crab_test_pca.columns])
pca_test = PCA()
pca_test.fit(train_features_scaled)

#Calculating principal component score of the test rows in that training-derived basis
values_test = pca_test.transform(crab_test_scaled)
print(values_test)

In [None]:
#Wrap the test component scores in a dataframe, one named column per component

n_test_components = len(crab_test_pca.columns)
df_pca_test = pd.DataFrame(
    values_test,
    columns = ["Principle component{}".format(i) for i in range(1, n_test_components + 1)],
)
df_pca_test.head(10)

In [None]:
#Calculate contribution: the fraction of the total variance explained by each component of pca_test
ev_ratio_test = pca_test.explained_variance_ratio_
print(ev_ratio_test)

In [None]:
#Tabulate the contribution of each pca_test component, indexed by component name
test_component_labels = ["Principle component{}".format(i) for i in range(1, len(crab_test_pca.columns) + 1)]
df_evr_test = pd.DataFrame({"Contribution": ev_ratio_test}, index = test_component_labels)
print(df_evr_test)

In [None]:
#Cumulative contribution, with a leading 0 standing for "no components kept"
cc_ratio_test = np.concatenate(([0], np.cumsum(ev_ratio_test)))
print(cc_ratio_test)

In [None]:
#Making a graph for cumulative contribution
plt.plot(cc_ratio_test, "-o")
plt.xlabel("Number of principal components")
plt.ylabel("Cumulative contribution")
plt.grid()
#cc_ratio_test starts with a prepended 0, so position k holds the contribution of
#the first k components; each tick is therefore labelled with its own position
#(labelling position k as k+1 shifted every label by one component).
plt.xticks(range(len(cc_ratio_test)))
plt.show()

In [None]:
#Model inputs for the test rows: the first two principal components only
test_set = df_pca_test.loc[:, ['Principle component1', 'Principle component2']]

## First submission (PCA + ANN)

In [None]:
#predicted_age = clf.predict(test_set)
#print(predicted_age)

In [None]:
#check size
#len(predicted_age)

In [None]:
#creating index column for test data
#submission_id = crab_test["id"]
#submission_id = submission_id.reset_index(drop=True)

In [None]:
#submission_id

In [None]:
#pred_age = pd.Series(predicted_age)
#pd.set_option("display.precision", 0)
#pred_age 

In [None]:
#submission_data = pd.concat([submission_id, pred_age], join = 'outer', axis = 1)
#submission_data = submission_data.rename(columns = {submission_data.columns[1]: "Age"})
#submission_data

In [None]:
#extract submission dataframe
#submission = submission_data.to_csv('submission_data.csv', index = False)

## Second submission (PCA + ANN)

In [None]:
#clf = MLPRegressor(activation='relu', alpha=0.001, hidden_layer_sizes=(50, 30), solver='adam')

#clf.fit(x_train, y_train)

#predicted_age = clf.predict(test_set)

#submission_id = crab_test["id"]
#submission_id = submission_id.reset_index(drop=True)

#pred_age = pd.Series(predicted_age)
#pd.set_option("display.precision", 0)

#submission_data = pd.concat([submission_id, pred_age], join = 'outer', axis = 1)
#submission_data = submission_data.rename(columns = {submission_data.columns[1]: "Age"})


In [None]:
#print(submission_data)
#len(submission_data)
#submission = submission_data.to_csv('submission_data.csv', index = False)

## Third submission (XGBoost)

In [None]:
import xgboost as xgb

#Gradient-boosted regressor trained on the same two-component training split
xgb_params = dict(
    colsample_bytree=0.8,
    learning_rate=0.01,
    max_depth=7,
    min_child_weight=5,
    n_estimators=1000,
    reg_alpha=1.0,
    reg_lambda=1.0,
    subsample=0.8,
)
xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(x_train, y_train)

#Predict the age of every test row
predicted_age = xgb_model.predict(test_set)

#Ids and predictions, both on a fresh 0..n-1 index so they line up row by row
submission_id = crab_test["id"].reset_index(drop=True)
pred_age = pd.Series(predicted_age)
pd.set_option("display.precision", 0)

#Two-column submission frame: id, Age
submission_data = pd.DataFrame({"id": submission_id, "Age": pred_age})


In [None]:
#Show the submission frame and its row count, then write it to disk.
print(submission_data)
#A bare len(...) in the middle of a cell is evaluated and discarded, so print it
print(len(submission_data))
#to_csv returns None when it is given a path, so there is nothing to keep or display
submission_data.to_csv('submission_data.csv', index = False)