# Read Data

In [1]:
import pandas as  pd 

In [2]:
df_fruit = pd.read_excel('fruit.xlsx')
df_fruit

Unnamed: 0,diameter,weight,red,green,blue,name
0,2.96,86.76,172,85,2,orange
1,3.91,88.05,166,78,3,orange
2,4.42,95.17,156,81,2,orange
3,4.47,95.60,163,81,4,orange
4,4.48,95.76,161,72,9,orange
...,...,...,...,...,...,...
9995,15.35,253.89,149,77,20,grapefruit
9996,15.41,254.67,148,68,7,grapefruit
9997,15.59,256.50,168,82,20,grapefruit
9998,15.92,260.14,142,72,11,grapefruit


In [3]:
df_fruit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   diameter  10000 non-null  float64
 1   weight    10000 non-null  float64
 2   red       10000 non-null  int64  
 3   green     10000 non-null  int64  
 4   blue      10000 non-null  int64  
 5   name      10000 non-null  object 
dtypes: float64(2), int64(3), object(1)
memory usage: 468.9+ KB


In [4]:
df_fruit.describe() 

Unnamed: 0,diameter,weight,red,green,blue
count,10000.0,10000.0,10000.0,10000.0,10000.0
mean,9.975685,175.050792,153.8478,76.0106,11.3632
std,1.947844,29.212119,10.432954,11.708433,9.061275
min,2.96,86.76,115.0,31.0,2.0
25%,8.46,152.22,147.0,68.0,2.0
50%,9.98,174.985,154.0,76.0,10.0
75%,11.48,197.7225,161.0,84.0,17.0
max,16.45,261.51,192.0,116.0,56.0


In [6]:
pd.unique(df_fruit['name'])

array(['orange', 'grapefruit'], dtype=object)

In [7]:
df_fruit['name'].value_counts()

name
orange        5000
grapefruit    5000
Name: count, dtype: int64

## Label Encoder

In [8]:
from sklearn.preprocessing import LabelEncoder as label_encoder
le = label_encoder()
df_fruit['name'] = le.fit_transform(df_fruit['name'])
df_fruit['name'].value_counts()

name
1    5000
0    5000
Name: count, dtype: int64

## Split Atribute

In [9]:
x = df_fruit.drop('name', axis=1)
y = df_fruit['name'] 

## Scalling

In [10]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(x) 
x = scaler.transform(x) 


## Split Train Test

In [11]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=22)

## Training Model

In [13]:
# Inisialisasi model untuk multi-class classification
import xgboost as xgb
model = xgb.XGBClassifier(
    objective='multi:softmax',  # Untuk multi-kelas
    num_class=2,               # Jumlah kelas
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42
)


model.fit(x_train, y_train)


## Predict Test

In [19]:
# Prediksi data testing
from sklearn.metrics import accuracy_score
y_pred = model.predict(x_test)

# Evaluasi akurasi
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

from sklearn.metrics import classification_report 

report = classification_report(y_test, y_pred, target_names=le.classes_)
print("\nLaporan Klasifikasi:\n", report)


Accuracy: 96.67%

Laporan Klasifikasi:
               precision    recall  f1-score   support

  grapefruit       0.96      0.97      0.97       738
      orange       0.97      0.96      0.97       762

    accuracy                           0.97      1500
   macro avg       0.97      0.97      0.97      1500
weighted avg       0.97      0.97      0.97      1500



## New Data

In [20]:
new_data = pd.DataFrame({
    'diameter': [2.96, 4.42],  
    'weight': [86.76, 95.17],  
    'red': [172, 156],         
    'green': [85, 81],        
    'blue': [2, 2]            
})
new_predictions = model.predict(new_data)  
new_name = le.inverse_transform(new_predictions)  

for i, name in enumerate(new_name):
    print(f"Data baru {i+1}: Prediksi Nama = {name}")

Data baru 1: Prediksi Nama = grapefruit
Data baru 2: Prediksi Nama = grapefruit


### dump

In [22]:
import pickle

with open('model_xgb_fruit.pkl', 'wb') as file:
    pickle.dump(model, file)

with open('label_encoder_xgb_fruit.pkl', 'wb') as file:
    pickle.dump(le, file)
