#### This notebook consists code related to regression models


### first load the dataset

In [1]:
import pandas as pd 
import numpy as np 

In [2]:
dataset = pd.read_csv("../data/train.csv", index_col="id")
dataset.head(10)

Unnamed: 0_level_0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,F,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11
1,F,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11
2,I,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6
3,M,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10
4,I,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9
5,F,0.61,0.48,0.17,1.201,0.5335,0.3135,0.3085,10
6,M,0.415,0.325,0.11,0.3315,0.1655,0.0715,0.13,9
7,F,0.61,0.49,0.15,1.1165,0.4955,0.2945,0.295,9
8,I,0.205,0.15,0.04,0.046,0.0145,0.0105,0.01,4
9,I,0.565,0.425,0.125,0.651,0.3795,0.142,0.18,8


### coding categorical data to numemrical form

In [3]:
from sklearn.preprocessing import LabelEncoder

labelizer = LabelEncoder()
dataset['Sex'] = labelizer.fit_transform(dataset['Sex'])

dataset['Sex'].value_counts()
dataset.head()

Unnamed: 0_level_0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11
1,0,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11
2,1,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6
3,2,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10
4,1,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9


In [4]:
# applying pca on the dataset
# since all features are highly correlated with each other let's view unimporteant features
# to cater that we will use pca
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


# scaling dataset 

scaler = StandardScaler()
X_length = scaler.fit_transform(dataset[['Length', 'Diameter', 'Height']])
X_weight = scaler.fit_transform(dataset[['Whole weight', 'Whole weight.1','Whole weight.2', 'Shell weight']])


cov_matrix_length = np.cov(X_length.T)
cov_matrix_weight = np.cov(X_weight.T)


eiganValues_len, eiganVector_len = np.linalg.eig(cov_matrix_length)
eiganValues_weight, eiganVector_weight = np.linalg.eig(cov_matrix_weight)


# applying PCA
pca = PCA(n_components=2)
pca2 = PCA(n_components=2)
lenght_pca = pca.fit_transform(X_length)
weight_pca = pca2.fit_transform(X_weight)

print("Explained variance ratio of length: ", pca.explained_variance_ratio_)
print("Explained variance ratio of weight: ", pca2.explained_variance_ratio_)


Explained variance ratio of length:  [0.96134764 0.03524452]
Explained variance ratio of weight:  [0.96356954 0.0223365 ]


In [5]:
dataset['length_pca_X'] = lenght_pca[:,0]
dataset['weight_pca_X'] = weight_pca[:,0]
dataset['length_pca_y'] = lenght_pca[:,1]
dataset['weight_pca_y'] = weight_pca[:,1]

### Training gradiend boosting model

In [6]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [7]:
# splitting datasets
X_train, X_test, y_train, y_test = train_test_split(dataset[['Sex','Length','Diameter','Height','Whole weight','Whole weight.1','Whole weight.2','Shell weight']],dataset['Rings'], test_size=0.2)

In [8]:
# trianing model : gradient boosting
gbm = GradientBoostingClassifier()
gbm.fit(X_train, y_train)

In [9]:
# it tooks nearly 10 minutes to train let's store it for future use
import pickle

with open("../trained_models/gradientboosting1.pkl", "wb") as f:
    pickle.dump(gbm, f)

In [10]:
y_pred = gbm.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

# generate classification report
print("Classification Report : ", classification_report(y_test, y_pred))

Accuracy:  0.36263311813717375
Classification Report :                precision    recall  f1-score   support

           1       0.00      0.00      0.00         8
           2       0.00      0.00      0.00         3
           3       0.13      0.02      0.04        87
           4       0.50      0.61      0.55       295
           5       0.45      0.49      0.47       594
           6       0.43      0.39      0.41      1074
           7       0.42      0.50      0.46      1764
           8       0.45      0.47      0.46      2832
           9       0.39      0.57      0.46      3469
          10       0.25      0.27      0.26      2488
          11       0.35      0.31      0.32      1695
          12       0.17      0.01      0.02       990
          13       0.17      0.18      0.18       840
          14       0.18      0.05      0.08       490
          15       0.17      0.04      0.06       433
          16       0.15      0.11      0.13       278
          17       0.13  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
import pickle

with open("../trained_models/gradientboosting1.pkl", "rb") as f:
    model = pickle.load(f)



In [12]:
type(model)

sklearn.ensemble._gb.GradientBoostingClassifier

In [13]:
import pandas as pd 
import numpy as np


In [14]:
test_df = pd.read_csv("../data/test.csv")

In [15]:
# making pca
# since all features are highly correlated with each other let's view unimporteant features
# to cater that we will use pca
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


# scaling dataset 

scaler = StandardScaler()
X_length = scaler.fit_transform(test_df[['Length', 'Diameter', 'Height']])
X_weight = scaler.fit_transform(test_df[['Whole weight', 'Whole weight.1','Whole weight.2', 'Shell weight']])


cov_matrix_length = np.cov(X_length.T)
cov_matrix_weight = np.cov(X_weight.T)


eiganValues_len, eiganVector_len = np.linalg.eig(cov_matrix_length)
eiganValues_weight, eiganVector_weight = np.linalg.eig(cov_matrix_weight)


# applying PCA
pca = PCA(n_components=2)
pca2 = PCA(n_components=2)
lenght_pca = pca.fit_transform(X_length)
weight_pca = pca2.fit_transform(X_weight)

print("Explained variance ratio of length: ", pca.explained_variance_ratio_)
print("Explained variance ratio of weight: ", pca2.explained_variance_ratio_)

test_df['length_pca_X'] = lenght_pca[:,0]
test_df['weight_pca_X'] = weight_pca[:,0]
test_df['length_pca_y'] = lenght_pca[:,1]
test_df['weight_pca_y'] = weight_pca[:,1]


Explained variance ratio of length:  [0.95786098 0.03872526]
Explained variance ratio of weight:  [0.96325727 0.02259852]


In [16]:
from sklearn.preprocessing import LabelEncoder

labelizer = LabelEncoder()
test_df['Sex'] = labelizer.fit_transform(test_df['Sex'])

# test_df['Sex'].value
test_df.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,length_pca_X,weight_pca_X,length_pca_y,weight_pca_y
0,90615,2,0.645,0.475,0.155,1.238,0.6185,0.3125,0.3005,1.354371,2.166588,-0.323466,-0.602731
1,90616,2,0.58,0.46,0.16,0.983,0.4785,0.2195,0.275,1.016834,0.983354,0.073127,-0.198119
2,90617,2,0.56,0.42,0.14,0.8395,0.3525,0.1845,0.2405,0.381807,0.21201,-0.129826,0.02983
3,90618,2,0.57,0.49,0.145,0.874,0.3525,0.1865,0.235,0.924357,0.239256,-0.334105,-0.0065
4,90619,1,0.415,0.325,0.11,0.358,0.1575,0.067,0.105,-1.349745,-1.897034,0.109914,-0.014429


In [17]:
test_pred = model.predict(test_df[['Sex','Length','Diameter','Height','Whole weight','Whole weight.1','Whole weight.2','Shell weight']])

In [18]:
test_pred

array([ 9,  9,  9, ..., 10, 11,  7], dtype=int64)

In [19]:
sub = pd.DataFrame({'id': test_df['id'], 'Rings': test_pred})
sub.head()

Unnamed: 0,id,Rings
0,90615,9
1,90616,9
2,90617,9
3,90618,10
4,90619,7


In [20]:
sub.to_csv('../submission/submission.csv', index=False)

In [21]:
!kaggle competitions submit -c playground-series-s4e4 -f ../submission/submission.csv -m "second submission"

Successfully submitted to Regression with an Abalone Dataset



  0%|          | 0.00/600k [00:00<?, ?B/s]
  1%|▏         | 8.00k/600k [00:00<00:52, 11.4kB/s]
 76%|███████▌  | 456k/600k [00:00<00:00, 757kB/s]  
100%|██████████| 600k/600k [00:03<00:00, 162kB/s]
