# Read Data

In [9]:
import pandas as  pd 

In [10]:
df_fish = pd.read_csv('fish_data.csv')
df_fish

Unnamed: 0,species,length,weight,w_l_ratio
0,Anabas testudineus,10.66,3.45,0.32
1,Anabas testudineus,6.91,3.27,0.47
2,Anabas testudineus,8.38,3.46,0.41
3,Anabas testudineus,7.57,3.36,0.44
4,Anabas testudineus,10.83,3.38,0.31
...,...,...,...,...
4075,Sillaginopsis panijus,30.56,6.12,0.20
4076,Sillaginopsis panijus,29.66,6.11,0.21
4077,Sillaginopsis panijus,32.81,6.25,0.19
4078,Sillaginopsis panijus,29.78,6.11,0.21


In [11]:
df_fish.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4080 entries, 0 to 4079
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   species    4080 non-null   object 
 1   length     4080 non-null   float64
 2   weight     4080 non-null   float64
 3   w_l_ratio  4080 non-null   float64
dtypes: float64(3), object(1)
memory usage: 127.6+ KB


In [12]:
df_fish.describe() 

Unnamed: 0,length,weight,w_l_ratio
count,4080.0,4080.0,4080.0
mean,17.353544,3.739875,0.252782
std,7.114684,1.040365,0.123046
min,6.36,2.05,0.08
25%,11.3275,3.07,0.17
50%,17.35,3.31,0.19
75%,22.585,4.1,0.34
max,33.86,6.29,0.64


In [13]:
pd.unique(df_fish['species'])

array(['Anabas testudineus', 'Coilia dussumieri',
       'Otolithoides biauritus', 'Otolithoides pama', 'Pethia conchonius',
       'Polynemus paradiseus', 'Puntius lateristriga', 'Setipinna taty',
       'Sillaginopsis panijus'], dtype=object)

In [14]:
df_fish['species'].value_counts()

species
Setipinna taty            480
Anabas testudineus        476
Pethia conchonius         475
Otolithoides biauritus    468
Polynemus paradiseus      458
Sillaginopsis panijus     455
Otolithoides pama         435
Puntius lateristriga      418
Coilia dussumieri         415
Name: count, dtype: int64

## Label Encoder

In [15]:
from sklearn.preprocessing import LabelEncoder as label_encoder
le = label_encoder()
df_fish['species'] = le.fit_transform(df_fish['species'])
df_fish['species'].value_counts()

species
7    480
0    476
4    475
2    468
5    458
8    455
3    435
6    418
1    415
Name: count, dtype: int64

In [35]:
print(df_fish['species'].unique())

[0 1 2 3 4 5 6 7 8]


## Split Atribute

In [16]:
x = df_fish.drop('species', axis=1)
y = df_fish['species'] 

## Scalling

In [17]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(x) 
x = scaler.transform(x) 


## Split Train Test

In [18]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=22)

## Training Model

In [23]:
# Inisialisasi model untuk multi-class classification
import xgboost as xgb
model = xgb.XGBClassifier(
    objective='multi:softmax',  # Untuk multi-kelas
    num_class=8,               # Jumlah kelas
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42
)


model.fit(x_train, y_train)


## Predict Test

In [24]:
# Prediksi data testing
from sklearn.metrics import accuracy_score
y_pred = model.predict(x_test)

# Evaluasi akurasi
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')


Accuracy: 93.95%


In [37]:
new_data = pd.DataFrame({
    'length': [1.7, 3.1],  
    'weight': [0.1, 350],    
    'w_l_ratio': [0.44, 0.75] 
}) 
new_predictions = model.predict(new_data)
new_species = le.inverse_transform(new_predictions)
for i, species in enumerate(new_species):
    print(f"Data baru {i+1}: Prediksi species = {species}")
new_predictions

Data baru 1: Prediksi species = Puntius lateristriga
Data baru 2: Prediksi species = Sillaginopsis panijus


array([6, 8], dtype=int32)

### Dump

In [38]:
import pickle

with open('model_xgb_fish.pkl', 'wb') as file:
    pickle.dump(model, file)

with open('label_encoder_xgbfish.pkl', 'wb') as file:
    pickle.dump(le, file)
