We're going to do some dimensionality reduction, so let's install UMAP (published on PyPI as `umap-learn`).

In [1]:
!pip install --quiet umap-learn

In [2]:
import pandas as pd

# Path of the workbook inside the Kaggle input mount.
XLSX = '/kaggle/input/obesity-dataset/Obesity_Dataset.xlsx'

# Read the sheet and swap the numeric class codes (1-4) for letter labels (A-D).
# NOTE(review): the trailing space in the sheet name looks deliberate --
# presumably it matches the sheet's actual name in the workbook; confirm before "fixing" it.
df = (
    pd.read_excel(io=XLSX, sheet_name='Obesity_Dataset ')
    .assign(Class=lambda frame: frame['Class'].map({1: 'A', 2: 'B', 3: 'C', 4: 'D'}))
)
df.head()

Unnamed: 0,Sex,Age,Height,Overweight_Obese_Family,Consumption_of_Fast_Food,Frequency_of_Consuming_Vegetables,Number_of_Main_Meals_Daily,Food_Intake_Between_Meals,Smoking,Liquid_Intake_Daily,Calculation_of_Calorie_Intake,Physical_Excercise,Schedule_Dedicated_to_Technology,Type_of_Transportation_Used,Class
0,2,18,155,2,2,3,1,3,2,1,2,3,3,4,B
1,2,18,158,2,2,3,1,1,2,1,2,1,3,3,B
2,2,18,159,2,2,2,1,3,2,3,2,2,3,4,B
3,2,18,162,2,2,2,2,2,2,2,2,1,3,4,B
4,2,18,165,2,1,2,1,3,2,1,2,3,3,2,B


In [3]:
# Number of rows per class label, shown as a plain dict to make class imbalance visible.
class_counts = df['Class'].value_counts()
class_counts.to_dict()

{'B': 658, 'C': 592, 'D': 287, 'A': 73}

In [4]:
import arrow
from umap import UMAP

# Fit UMAP on the feature columns and store the resulting 2-D embedding on the
# frame as 'x' and 'y', timing the fit with arrow.
time_start = arrow.now()
umap = UMAP(random_state=2024, verbose=False, n_jobs=1, low_memory=False, n_epochs=201)
# Drop 'x' and 'y' together with 'Class' (ignoring them when absent): this cell
# writes those two columns onto df, so without this a re-run of the cell would
# feed the previous embedding back into UMAP as input features.
df[['x', 'y']] = umap.fit_transform(X=df.drop(columns=['Class', 'x', 'y'], errors='ignore'))
print('done with UMAP in {}'.format(arrow.now() - time_start))

done with UMAP in 0:00:13.290516


In [5]:
from plotly import express

# Scatter plot of the two embedding columns, one colour per class label.
embedding_fig = express.scatter(data_frame=df, x='x', y='y', color='Class', height=800)
embedding_fig

In [6]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Hold out 20% of the rows for testing, stratified on the class label so each
# split keeps the same class proportions. The embedding columns 'x' and 'y' are
# excluded, so the classifier is trained on the original features only.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['x', 'y', 'Class']),
    df['Class'],
    test_size=0.2,
    random_state=2024,
    stratify=df['Class'],
)

# Linear-kernel support vector classifier fitted on the training split.
svc = SVC(kernel='linear', C=0.025, random_state=2024).fit(X_train, y_train)

# Predict once and reuse the result for every metric instead of re-running
# inference on the same test split for each report line.
y_pred = svc.predict(X=X_test)
print('accuracy: {:5.4f}'.format(accuracy_score(y_true=y_test, y_pred=y_pred)))
# Weighted F1 averages the per-class F1 scores weighted by class support.
print('f1: {:5.4f}'.format(f1_score(average='weighted', y_true=y_test, y_pred=y_pred, zero_division=0)))
print(classification_report(y_true=y_test, y_pred=y_pred, zero_division=0))

accuracy: 0.7609
f1: 0.7417
              precision    recall  f1-score   support

           A       1.00      0.07      0.12        15
           B       0.77      0.91      0.83       132
           C       0.74      0.78      0.76       118
           D       0.80      0.56      0.66        57

    accuracy                           0.76       322
   macro avg       0.83      0.58      0.59       322
weighted avg       0.77      0.76      0.74       322

