# Import Libraries

In [5]:
# Import every library the notebook needs in one place.

# Data handling
import pandas as pd
import numpy as np

# Feature scaling and train/validation/test splitting
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# The classifiers used in the Classification section
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

print('The Libraries needed have been imported')

The Libraries needed have been imported


# Getting the Dataset

The number of rings is the value to predict.

In [6]:
# Read the abalone measurements from the CSV file into a DataFrame.
data = pd.read_csv("abalone.csv")

In [7]:
# Preview the loaded frame; the rendered output elides the middle rows.
data

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [8]:
# Show the first five rows.
data.head(n=5)

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [9]:
# Report the (rows, columns) dimensions of the dataset.
n_rows, n_cols = data.shape
print((n_rows, n_cols))

(4177, 9)


In [10]:
# Summarise each column: name, non-null count and dtype.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             4177 non-null   object 
 1   Length          4177 non-null   float64
 2   Diameter        4177 non-null   float64
 3   Height          4177 non-null   float64
 4   Whole weight    4177 non-null   float64
 5   Shucked weight  4177 non-null   float64
 6   Viscera weight  4177 non-null   float64
 7   Shell weight    4177 non-null   float64
 8   Rings           4177 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB


In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
mean,0.523992,0.407881,0.139516,0.828742,0.359367,0.180594,0.238831,9.933684
std,0.120093,0.09924,0.041827,0.490389,0.221963,0.109614,0.139203,3.224169
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,0.45,0.35,0.115,0.4415,0.186,0.0935,0.13,8.0
50%,0.545,0.425,0.14,0.7995,0.336,0.171,0.234,9.0
75%,0.615,0.48,0.165,1.153,0.502,0.253,0.329,11.0
max,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0


All features are continuous variables except for the Sex feature. In the Height feature, the minimum value is zero. A height of zero is not physically possible, so it most likely indicates a missing value, which we will handle below.

In [12]:
# The target is the Rings column: count the samples per ring value and
# list them ordered by ring value rather than by frequency.
ring_counts = data['Rings'].value_counts()
ring_counts.sort_index()

1       1
2       1
3      15
4      57
5     115
6     259
7     391
8     568
9     689
10    634
11    487
12    267
13    203
14    126
15    103
16     67
17     58
18     42
19     32
20     26
21     14
22      6
23      9
24      2
25      1
26      1
27      2
29      1
Name: Rings, dtype: int64

We can see that the target ranges from 1 to 29 (but there is no 28), so the classification we are going to do is a multi-class classification.

Data Preprocessing 

Dealing with missing values

We first check how many missing values are in the Height feature and which classes they are in.

In [13]:
# Number of rows whose recorded Height is exactly zero.
data['Height'].eq(0).sum()

2

In [14]:
# Show the rows whose recorded Height is exactly zero.
zero_height = data['Height'].eq(0)
data.loc[zero_height]

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
1257,I,0.43,0.34,0.0,0.428,0.2065,0.086,0.115,8
3996,I,0.315,0.23,0.0,0.134,0.0575,0.0285,0.3505,6


There are 2 missing values, and both belong to the infant (I) sex. We treat these zero values as missing and fill them with the average of the Height feature for infants.

In [15]:
# Mean Height per Sex category, used to impute the zero heights.
# The aggregation is named by the string 'mean' rather than the np.mean
# callable: pandas 2.x emits a FutureWarning for the callable form, and the
# string keeps the same result.
means = pd.pivot_table(data, index=['Sex'], aggfunc={'Height': 'mean'})
means

Unnamed: 0_level_0,Height
Sex,Unnamed: 1_level_1
F,0.158011
I,0.107996
M,0.151381


So we will fill in the missing value with 0.107996

In [16]:
# Impute the zero heights with the mean infant height from `means`
# (both zero-height rows are infants). The value is looked up instead of
# retyped to avoid a transcription slip such as 0.0107996 for 0.107996.
infant_mean_height = means.loc['I', 'Height']
data['Height'] = data['Height'].replace(to_replace=0, value=infant_mean_height)

# Encoding categorical features

As we have seen, the Sex feature is a categorical feature so we need to encode that feature. We’ll do one-hot-encoding for this.

In [17]:
# One-hot encode the categorical Sex column into Sex_F / Sex_I / Sex_M.
# dtype=int keeps the indicator columns as 0/1 integers on every pandas
# version (pandas >= 2.0 defaults to bool), so the feature matrix stays purely
# numeric instead of becoming an object array when converted with .values.
data = pd.get_dummies(data, dtype=int)

data

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_F,Sex_I,Sex_M
0,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15,0,0,1
1,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7,0,0,1
2,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9,1,0,0
3,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10,0,0,1
4,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
4172,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11,1,0,0
4173,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10,0,0,1
4174,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9,0,0,1
4175,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10,1,0,0


After the encoding, the number of columns increases to 11.

# Splitting data

We will separate the data into 3 parts: a training set, a validation set, and a test set. We do this because we cannot cross-validate the data: several target values occur only once.

In [18]:
# Separate the target column (Rings) from the remaining feature columns.
target = data['Rings']
feature = data.drop(columns='Rings')

# Convert both to plain NumPy arrays for scikit-learn.
x = feature.to_numpy()
y = target.to_numpy()

In [19]:
# First hold out a test set, then carve a validation set out of the remainder.
# Both calls use fixed seeds so the partition is repeatable.
trainval_test = train_test_split(x, y, random_state=0)
x_trainval, x_test, y_trainval, y_test = trainval_test

train_valid = train_test_split(x_trainval, y_trainval, random_state=1)
x_train, x_valid, y_train, y_valid = train_valid

size_report = ("Size of training set: {}  size of validation set: {}  size of test set:"
               " {}\n")
print(size_report.format(x_train.shape, x_valid.shape, x_test.shape))

Size of training set: (2349, 10)  size of validation set: (783, 10)  size of test set: (1045, 10)



Data standardization

To speed up learning for classification, we first rescale the features to the [0, 1] range using MinMaxScaler.


In [24]:
# Scaler for the parameter search: fitted on the training split only, so the
# validation split stays unseen while candidate models are compared.
search_scaler = MinMaxScaler()
search_scaler.fit(x_train)

x_train_scaled = search_scaler.transform(x_train)
x_valid_scaled = search_scaler.transform(x_valid)

# Scaler for the final models: fitted on train + validation, which is the data
# the selected model is refitted on, and then applied to the test split.
scaler = MinMaxScaler()
scaler.fit(x_trainval)

x_trainval_scaled = scaler.transform(x_trainval)
x_test_scaled = scaler.transform(x_test)


# Classification

We will classify the dataset using three classifiers, namely logistic regression, random forest, and SVM.
We will also determine the best parameters for each classifier. We do not use cross-validation for this because several target values occur only once; instead, we use a simple grid search evaluated on the validation set.

# Logistic regression

The parameters that we will set for classification using logistic regression are C and solver. Since this is a multiclass classification, we choose the solver from newton-cg, sag, saga, and lbfgs. We set multi_class to multinomial and penalty to l2.

In [26]:
# Grid search over C and solver for multinomial logistic regression.
# Candidates are fitted on the training split and compared on the validation
# split; the winner is then refitted on train + validation and scored on test.

# Settings shared by every candidate AND by the final refit. Keeping them in
# one place ensures the refit does not silently fall back to the estimator
# defaults (notably max_iter=100, which may stop before convergence).
fixed_parameters = {
    'max_iter': 5000,
    'multi_class': 'multinomial',
    'penalty': 'l2',
    'random_state': 0,  # makes the stochastic sag/saga solvers repeatable
}

best_score = 0

for C in [0.001,0.01,0.1,1,10,100]:
    for solver in ['newton-cg','sag','saga','lbfgs']:
        logreg = LogisticRegression(C=C, solver=solver, **fixed_parameters)
        logreg.fit(x_train_scaled, y_train)
        score = logreg.score(x_valid_scaled, y_valid)
        # Strict '>' keeps the first combination that reaches the top score.
        if score > best_score:
            best_score = score
            best_parameters = {'C':C, 'solver':solver}

# Refit the selected configuration with the same fixed settings as in the search.
logreg = LogisticRegression(**best_parameters, **fixed_parameters)
logreg.fit(x_trainval_scaled, y_trainval)
train_score = logreg.score(x_trainval_scaled, y_trainval)
test_score = logreg.score(x_test_scaled, y_test)

print("Best score on validation set: {:.2f}".format(best_score))
print("Best parameters: ", best_parameters)
print("Train set score with best parameters: {:.2f}".format(train_score))
print("Test set score with best parameters: {:.2f}".format(test_score))

Best score on validation set: 0.25
Best parameters:  {'C': 100, 'solver': 'newton-cg'}
Train set score with best parameters: 0.29
Test set score with best parameters: 0.25


Random Forest 

The parameters that we will set for classification using random forest are criterion, max_depth, and max_features.


In [27]:
# Grid search over criterion, max_depth and max_features for a random forest.
# Candidates are fitted on the training split and compared on the validation
# split; the winner is then refitted on train + validation and scored on test.
best_score = 0

for criterion in ['gini','entropy']:
    for max_depth in [1,2,3,4,5,6,7,8,9,10]:
        # 'auto' is omitted: for classifiers it was an alias of 'sqrt' and
        # newer scikit-learn releases no longer accept it.
        for max_features in ['sqrt','log2']:
            # A fixed seed makes the comparison between candidates (and the
            # reported scores) repeatable from run to run.
            rf = RandomForestClassifier(criterion=criterion,
                                      max_depth=max_depth,
                                      max_features=max_features,
                                      random_state=0)
            rf.fit(x_train_scaled, y_train)
            score = rf.score(x_valid_scaled, y_valid)

            # Strict '>' keeps the first combination that reaches the top score.
            if score > best_score:
                best_score = score
                best_parameters = {'criterion':criterion,
                                  'max_depth':max_depth,
                                  'max_features':max_features}

# Refit the selected configuration with the same seed as in the search.
rf= RandomForestClassifier(**best_parameters, random_state=0)
rf.fit(x_trainval_scaled, y_trainval)
train_score = rf.score(x_trainval_scaled, y_trainval)
test_score = rf.score(x_test_scaled, y_test)

print("Best score on validation set: {:.2f}".format(best_score))
print("Best parameter:", best_parameters)
print("Train set score with best parameters: {:.2f}".format(train_score))
# This line reports the test score, so it is labelled as such.
print("Test set score with best parameters: {:.2f}".format(test_score))

Best score on validation set: 0.27
Best parameter: {'criterion': 'entropy', 'max_depth': 3, 'max_features': 'log2'}
Train set score with best parameters: 0.29
Train set score with best parameters: 0.27


With the best parameters, we get a score for the training set of 0.29 and a testing set score of 0.27

Support Vector Machine
The parameters that we will set for classification using SVM are kernel, C, and gamma.

In [28]:
# Grid search over C, gamma and kernel for a support vector classifier.
# Candidates are fitted on the training split and compared on the validation
# split; the winner is then refitted on train + validation and scored on test.

# Enumerate the grid up front, in the order C -> gamma -> kernel.
candidates = [
    {'C': C, 'gamma': gamma, 'kernel': kernel}
    for C in [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
    for gamma in ['auto', 'scale']
    for kernel in ['linear', 'poly', 'rbf', 'sigmoid']
]

best_score = 0

for candidate in candidates:
    svc = SVC(**candidate)
    svc.fit(x_train_scaled, y_train)
    score = svc.score(x_valid_scaled, y_valid)

    # Only a strictly better score replaces the current best, so ties keep
    # the earliest combination.
    if score <= best_score:
        continue
    best_score = score
    best_parameters = candidate

# Refit the winning configuration on train + validation data.
svc = SVC(**best_parameters)
svc.fit(x_trainval_scaled, y_trainval)
train_score = svc.score(x_trainval_scaled, y_trainval)
test_score = svc.score(x_test_scaled, y_test)

print("Best score on validation set: {:.2f}".format(best_score))
print("Best parameters: ", best_parameters)
print("Train set score with best parameters: {:.2f}".format(train_score))
print("Test set score with best parameters: {:.2f}".format(test_score))

Best score on validation set: 0.27
Best parameters:  {'C': 10, 'gamma': 'scale', 'kernel': 'poly'}
Train set score with best parameters: 0.31
Test set score with best parameters: 0.26


We can observe that every model’s accuracy is below 0.3, which is relatively low and makes forecasting hard. This might be due to the large number of levels and the high imbalance in our target.

Based on the above analysis, the Random Forest model has the best accuracy among all classification models. However, it is not significantly higher than that of the other models.