# XGboost classification model

In [12]:
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split
import pandas as pd 
import numpy as np 
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from evolution import rmsle

In [2]:
# preparing dataset 
df = pd.read_csv("../data/train.csv")

# standardize the features
scaler = StandardScaler()
df[['Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight']] = scaler.fit_transform(df[['Length', "Diameter", 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight']])

#labelling the categorical data
df['Sex'] = df['Sex'].map({'F': 1, 'I': 2, 'M': 0})

print("Total number of unique rings:",len(df.Rings.unique()))
df.head()


Total number of unique rings: 28


Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,0,1,0.278317,0.288912,0.382451,-0.038314,-0.060061,-0.227155,0.108309,11
1,1,1,0.955044,0.900996,0.250897,0.745005,0.573416,1.061143,0.722736,11
2,2,2,-3.020727,-2.975535,-2.906386,-1.678148,-1.640084,-1.649238,-1.69657,6
3,3,0,0.658976,0.747975,0.382451,0.27414,0.16985,0.357534,0.185113,10
4,4,2,0.320613,0.237905,-0.143763,-0.015371,0.140499,-0.09337,-0.218105,9


In [3]:
df.Rings.unique()

array([11,  6, 10,  9,  4,  8, 15,  7, 12, 20, 17, 13, 14,  5, 23,  3, 22,
       16, 18, 19, 21, 25,  1, 29,  2, 27, 24, 26], dtype=int64)

In [4]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['Rings'] = le.fit_transform(df['Rings'])
df.Rings.unique()

array([10,  5,  9,  8,  3,  7, 14,  6, 11, 19, 16, 12, 13,  4, 22,  2, 21,
       15, 17, 18, 20, 24,  0, 27,  1, 26, 23, 25], dtype=int64)

In [17]:
# making the xgboost model
X_train, X_test, y_train, y_test = train_test_split(df[['Sex','Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight']], df['Rings'], test_size=0.33, random_state=42)

xgb_model = XGBRegressor(
    max_depth=6,
    n_estimators=330,
    objective='reg:squaredlogerror',
    nthread=4,
    random_state=0,
)

xgb_model.fit(df[['Sex','Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight']], df['Rings'])
preds = xgb_model.predict(X_test)

print(preds)


[ 7.6398597  6.3472967  8.601193  ... 14.100699  11.066991   9.385225 ]


In [26]:
np.floor(preds).astype(int)


array([ 7,  6,  8, ..., 14, 11,  9])

In [27]:
# print("Accuracy score : ", accuracy_score(y_test, preds))
print("RMSLE score :", rmsle(y_test, np.floor(preds).astype(int)))
# print("Classification report:\n", classification_report(y_test, preds))

RMSLE score : 0.17277025604095445


In [28]:
import pickle
with open("../trained_models/xgboost.pkl", "wb") as f:
    pickle.dump(xgb_model, f)


In [29]:
# submission
test_df = pd.read_csv("../data/test.csv", index_col="id")
test_df['Sex'] = test_df['Sex'].map({'F': 1, 'I': 2, 'M': 0})

test_df[['Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight']] = scaler.fit_transform(test_df[['Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight']])

test_df['pred_Rings'] = xgb_model.predict(test_df[['Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight']])
test_df.to_csv('../data/modified_test.csv', index=False)

sub = pd.DataFrame({'id': test_df.index, 'Rings': np.floor(test_df['pred_Rings'].values).astype(int)})
sub.to_csv("../submission/xgboost.csv", index=False)

In [30]:
# submitting to kaggle
!kaggle competitions submit -c playground-series-s4e4 -f ../submission/xgboost.csv -m "xgboost model submission"

Successfully submitted to Regression with an Abalone Dataset



  0%|          | 0.00/595k [00:00<?, ?B/s]
  1%|▏         | 8.00k/595k [00:00<00:44, 13.6kB/s]
 23%|██▎       | 136k/595k [00:00<00:01, 256kB/s]  
 36%|███▋      | 216k/595k [00:00<00:01, 364kB/s]
 51%|█████     | 304k/595k [00:00<00:00, 481kB/s]
 65%|██████▍   | 384k/595k [00:01<00:00, 541kB/s]
 78%|███████▊  | 464k/595k [00:01<00:00, 572kB/s]
 94%|█████████▍| 560k/595k [00:01<00:00, 666kB/s]
100%|██████████| 595k/595k [00:02<00:00, 208kB/s]
