## Importação dos dados

In [10]:
import pandas as pd

# Carregar os dados do arquivo CSV
file_path = './bodyPerformance.csv'
df = pd.read_csv(file_path)

# Exibir as primeiras linhas do DataFrame para inspecionar os dados
df.head()

Unnamed: 0,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,class
0,27.0,M,172.3,75.24,21.3,80.0,130.0,54.9,18.4,60.0,217.0,C
1,25.0,M,165.0,55.8,15.7,77.0,126.0,36.4,16.3,53.0,229.0,A
2,31.0,M,179.6,78.0,20.1,92.0,152.0,44.8,12.0,49.0,181.0,C
3,32.0,M,174.5,71.1,18.4,76.0,147.0,41.4,15.2,53.0,219.0,B
4,28.0,M,173.8,67.7,17.1,70.0,127.0,43.5,27.1,45.0,217.0,B


## Observando mais o dataset

In [11]:
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13298 entries, 0 to 13297
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      13298 non-null  float64
 1   gender                   13298 non-null  object 
 2   height_cm                13298 non-null  float64
 3   weight_kg                13298 non-null  float64
 4   body fat_%               13298 non-null  float64
 5   diastolic                13298 non-null  float64
 6   systolic                 13298 non-null  float64
 7   gripForce                13298 non-null  float64
 8   sit and bend forward_cm  13298 non-null  float64
 9   sit-ups counts           13298 non-null  float64
 10  broad jump_cm            13298 non-null  float64
 11  class                    13298 non-null  object 
dtypes: float64(10), object(2)
memory usage: 1.2+ MB


Unnamed: 0,age,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm
count,13298.0,13298.0,13298.0,13298.0,13298.0,13298.0,13298.0,13298.0,13298.0,13298.0
mean,36.781095,168.563025,67.457663,23.244159,78.800504,130.2386,36.970214,15.215844,39.763122,190.137773
std,13.629747,8.432889,11.955535,7.256725,10.751409,14.723199,10.623391,8.452996,14.280259,39.870227
min,21.0,125.0,26.3,3.0,0.0,0.0,0.0,-25.0,0.0,0.0
25%,25.0,162.4,58.2,18.0,71.0,120.0,27.5,10.9,30.0,162.0
50%,32.0,169.2,67.4,22.8,79.0,130.0,37.9,16.2,41.0,193.0
75%,48.0,174.8,75.39,28.0,86.0,141.0,45.2,20.8,50.0,221.0
max,64.0,193.8,138.1,78.4,156.2,201.0,70.5,213.0,80.0,303.0


## Preparação de colunas não númericas

In [12]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['gender'] = label_encoder.fit_transform(df['gender'])
df['class'] = label_encoder.fit_transform(df['class'])

## Separação de entrada e saída

In [13]:
X = df.drop('class', axis=1)
y = df['class']

Entrada

In [14]:
X.describe()


Unnamed: 0,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm
count,13298.0,13298.0,13298.0,13298.0,13298.0,13298.0,13298.0,13298.0,13298.0,13298.0,13298.0
mean,36.781095,0.632125,168.563025,67.457663,23.244159,78.800504,130.2386,36.970214,15.215844,39.763122,190.137773
std,13.629747,0.482245,8.432889,11.955535,7.256725,10.751409,14.723199,10.623391,8.452996,14.280259,39.870227
min,21.0,0.0,125.0,26.3,3.0,0.0,0.0,0.0,-25.0,0.0,0.0
25%,25.0,0.0,162.4,58.2,18.0,71.0,120.0,27.5,10.9,30.0,162.0
50%,32.0,1.0,169.2,67.4,22.8,79.0,130.0,37.9,16.2,41.0,193.0
75%,48.0,1.0,174.8,75.39,28.0,86.0,141.0,45.2,20.8,50.0,221.0
max,64.0,1.0,193.8,138.1,78.4,156.2,201.0,70.5,213.0,80.0,303.0


Saída

In [15]:
y.describe()

count    13298.000000
mean         1.500000
std          1.118009
min          0.000000
25%          1.000000
50%          1.000000
75%          2.750000
max          3.000000
Name: class, dtype: float64

## Sepração de teste e treinamento

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
seed = 42

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed, shuffle=True, stratify=y)
num_particoes = 5
kfold = StratifiedKFold(n_splits=num_particoes, shuffle=True, random_state=seed)

## Comparação de modelos

Comparação da acurácia e desvio padrão

In [17]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler


pipelines = []
results = []
names = []

knn = ('KNN', KNeighborsClassifier())
tree = ('TREE', DecisionTreeClassifier(random_state=seed))
naive_bayes = ('NB', GaussianNB())
svm = ('SVM', SVC(random_state=seed))

standard_scaler = ('StandardScaler', StandardScaler())
min_max_scaler = ('MinMaxScaler', MinMaxScaler())

# Original
pipelines.append(('KNN-orig', Pipeline([knn])))
pipelines.append(('TREE-orig', Pipeline([tree])))
pipelines.append(('NB-orig', Pipeline([naive_bayes])))
pipelines.append(('SVM-orig', Pipeline([svm])))

# Padronizado
pipelines.append(('KNN-padr', Pipeline([standard_scaler, knn])))
pipelines.append(('TREE-padr', Pipeline([standard_scaler, tree])))
pipelines.append(('NB-padr', Pipeline([standard_scaler, naive_bayes])))
pipelines.append(('SVM-padr', Pipeline([standard_scaler, svm])))

# Normalizado
pipelines.append(('KNN-norm', Pipeline([min_max_scaler, knn])))
pipelines.append(('TREE-norm', Pipeline([min_max_scaler, tree])))
pipelines.append(('NB-norm', Pipeline([min_max_scaler, naive_bayes])))
pipelines.append(('SVM-norm', Pipeline([min_max_scaler, svm])))

for name, model in pipelines:
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold)
    results.append(cv_results)
    names.append(name)
    msg = f"{name}: {cv_results.mean():.3f} (+/- {cv_results.std():.3f})" 
    print(msg)


KNN-orig: 0.550 (+/- 0.009)
TREE-orig: 0.631 (+/- 0.007)
NB-orig: 0.542 (+/- 0.007)
SVM-orig: 0.607 (+/- 0.008)
KNN-padr: 0.585 (+/- 0.007)
TREE-padr: 0.631 (+/- 0.005)
NB-padr: 0.542 (+/- 0.007)
SVM-padr: 0.687 (+/- 0.005)
KNN-norm: 0.551 (+/- 0.029)
TREE-norm: 0.631 (+/- 0.006)
NB-norm: 0.542 (+/- 0.007)
SVM-norm: 0.630 (+/- 0.021)


## Explorando mais alguns modelos

Foram selecionados os modelos:
- TREE - padronizado
- SMV - padronizado

In [18]:
from sklearn.model_selection import GridSearchCV

param_grid_tree = {
    'model__max_depth': [None, 10, 20, 30],
    'model__criterion': ['gini', 'entropy', 'log_loss'],
    'model__max_features': ['sqrt', 'log2'],
}


param_grid_svm = {
    'model__C': [100, 1000],
    'model__gamma': [0.1, 0.01],
    'model__decision_function_shape': ['ovo', 'ovr'],
}

# GridSearch para cada modelo
grid_tree = GridSearchCV(Pipeline([('scaler', StandardScaler()), ('model', DecisionTreeClassifier(random_state=seed))]), param_grid_tree, cv=kfold, verbose=1)
grid_svm = GridSearchCV(Pipeline([('scaler', StandardScaler()), ('model', SVC(random_state=seed))]), param_grid_svm, cv=kfold, verbose=1)
# Ajustar e encontrar os melhores parâmetros
print("Fitting models...")
print("Fitting TREE...")
grid_tree.fit(X_train, y_train)
print("Fitting SVM...")
grid_svm.fit(X_train, y_train)

# Exibir os melhores parâmetros
print("Best SVM Params:", grid_svm.best_params_)
print("Best TREE Params:", grid_tree.best_params_)


Fitting models...
Fitting TREE...
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting SVM...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best SVM Params: {'model__C': 1000, 'model__decision_function_shape': 'ovo', 'model__gamma': 0.01}
Best TREE Params: {'model__criterion': 'entropy', 'model__max_depth': 10, 'model__max_features': 'sqrt'}


### Comparação com model não otimizado

In [19]:
pipelines = []
results = []
names = []

tree = ('TREE', DecisionTreeClassifier(random_state=seed))
svm = ('SVM', SVC(random_state=seed))
tree_otm = ('TREE_OTM', DecisionTreeClassifier(random_state=seed, criterion='entropy', max_depth=10, max_features='sqrt'))
svm_otm = ('SVM_OTM', SVC(random_state=seed, C=1000, decision_function_shape='ovo', gamma=0.01))

standard_scaler = ('StandardScaler', StandardScaler())

pipelines.append(('TREE-padr', Pipeline([standard_scaler, tree])))
pipelines.append(('SVM-padr', Pipeline([standard_scaler, svm])))
pipelines.append(('TREE-padr-otm', Pipeline([standard_scaler, tree_otm])))
pipelines.append(('SVM-padr-otm', Pipeline([standard_scaler, svm_otm])))


for name, model in pipelines:
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold)
    results.append(cv_results)
    names.append(name)
    msg = f"{name}: {cv_results.mean():.3f} (+/- {cv_results.std():.3f})" 
    print(msg)

TREE-padr: 0.631 (+/- 0.005)
SVM-padr: 0.687 (+/- 0.005)
TREE-padr-otm: 0.607 (+/- 0.017)
SVM-padr-otm: 0.706 (+/- 0.007)


## Tratamento de outliers

In [20]:
from scipy import stats
import numpy as np

z_scores = np.abs(stats.zscore(df.select_dtypes(include=np.number)))
df_clean = df[(z_scores < 3).all(axis=1)]

print(f"Removidos {df.shape[0] - df_clean.shape[0]} outliers")
df = df_clean

Removidos 339 outliers


In [21]:
# Refazendo a separação das features e target e a divisão entre treino e teste
X = df.drop('class', axis=1)
y = df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed, shuffle=True, stratify=y)

In [22]:
# Nova avaliação do modelo SVM com os dados limpos
svm_otm = ('SVM_OTM', SVC(random_state=seed, C=1000, decision_function_shape='ovo', gamma=0.01))
pipeline = Pipeline([standard_scaler, svm_otm])
cv_results = cross_val_score(pipeline, X_train, y_train, cv=kfold)
msg = f"{name}: {cv_results.mean():.3f} (+/- {cv_results.std():.3f})"
print(msg)

SVM-padr-otm: 0.708 (+/- 0.002)


Sem ganho

## Verificação de drop de coluna

In [23]:
# sem sit and bend forward_cm
df_drop = df.drop(['sit and bend forward_cm'], axis=1)
print(df_drop.head())

X = df_drop.drop('class', axis=1)
y = df_drop['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed, shuffle=True, stratify=y)

svm_otm = ('SVM_OTM', SVC(random_state=seed, C=1000, decision_function_shape='ovo', gamma=0.01))
pipeline = Pipeline([standard_scaler, svm_otm])
cv_results = cross_val_score(pipeline, X_train, y_train, cv=kfold)
msg = f"{name}: {cv_results.mean():.3f} (+/- {cv_results.std():.3f})"
print(msg)

    age  gender  height_cm  weight_kg  body fat_%  diastolic  systolic  \
0  27.0       1      172.3      75.24        21.3       80.0     130.0   
1  25.0       1      165.0      55.80        15.7       77.0     126.0   
2  31.0       1      179.6      78.00        20.1       92.0     152.0   
3  32.0       1      174.5      71.10        18.4       76.0     147.0   
4  28.0       1      173.8      67.70        17.1       70.0     127.0   

   gripForce  sit-ups counts  broad jump_cm  class  
0       54.9            60.0          217.0      2  
1       36.4            53.0          229.0      0  
2       44.8            49.0          181.0      2  
3       41.4            53.0          219.0      1  
4       43.5            45.0          217.0      1  
SVM-padr-otm: 0.594 (+/- 0.016)


In [24]:
# sem broad jump_cm
df_drop = df.drop(['broad jump_cm'], axis=1)
print(df_drop.head())

X = df_drop.drop('class', axis=1)
y = df_drop['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed, shuffle=True, stratify=y)

svm_otm = ('SVM_OTM', SVC(random_state=seed, C=1000, decision_function_shape='ovo', gamma=0.01))
pipeline = Pipeline([standard_scaler, svm_otm])
cv_results = cross_val_score(pipeline, X_train, y_train, cv=kfold)
msg = f"{name}: {cv_results.mean():.3f} (+/- {cv_results.std():.3f})"
print(msg)

    age  gender  height_cm  weight_kg  body fat_%  diastolic  systolic  \
0  27.0       1      172.3      75.24        21.3       80.0     130.0   
1  25.0       1      165.0      55.80        15.7       77.0     126.0   
2  31.0       1      179.6      78.00        20.1       92.0     152.0   
3  32.0       1      174.5      71.10        18.4       76.0     147.0   
4  28.0       1      173.8      67.70        17.1       70.0     127.0   

   gripForce  sit and bend forward_cm  sit-ups counts  class  
0       54.9                     18.4            60.0      2  
1       36.4                     16.3            53.0      0  
2       44.8                     12.0            49.0      2  
3       41.4                     15.2            53.0      1  
4       43.5                     27.1            45.0      1  
SVM-padr-otm: 0.703 (+/- 0.004)


In [25]:
# sem gripForce
df_drop = df.drop(['gripForce'], axis=1)
print(df_drop.head())

X = df_drop.drop('class', axis=1)
y = df_drop['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed, shuffle=True, stratify=y)

svm_otm = ('SVM_OTM', SVC(random_state=seed, C=1000, decision_function_shape='ovo', gamma=0.01))
pipeline = Pipeline([standard_scaler, svm_otm])
cv_results = cross_val_score(pipeline, X_train, y_train, cv=kfold)
msg = f"{name}: {cv_results.mean():.3f} (+/- {cv_results.std():.3f})"
print(msg)

    age  gender  height_cm  weight_kg  body fat_%  diastolic  systolic  \
0  27.0       1      172.3      75.24        21.3       80.0     130.0   
1  25.0       1      165.0      55.80        15.7       77.0     126.0   
2  31.0       1      179.6      78.00        20.1       92.0     152.0   
3  32.0       1      174.5      71.10        18.4       76.0     147.0   
4  28.0       1      173.8      67.70        17.1       70.0     127.0   

   sit and bend forward_cm  sit-ups counts  broad jump_cm  class  
0                     18.4            60.0          217.0      2  
1                     16.3            53.0          229.0      0  
2                     12.0            49.0          181.0      2  
3                     15.2            53.0          219.0      1  
4                     27.1            45.0          217.0      1  
SVM-padr-otm: 0.693 (+/- 0.004)


In [26]:
# sem sit-ups counts
df_drop = df.drop(['sit-ups counts'], axis=1)
print(df_drop.head())

X = df_drop.drop('class', axis=1)
y = df_drop['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed, shuffle=True, stratify=y)

svm_otm = ('SVM_OTM', SVC(random_state=seed, C=1000, decision_function_shape='ovo', gamma=0.01))
pipeline = Pipeline([standard_scaler, svm_otm])
cv_results = cross_val_score(pipeline, X_train, y_train, cv=kfold)
msg = f"{name}: {cv_results.mean():.3f} (+/- {cv_results.std():.3f})"
print(msg)

    age  gender  height_cm  weight_kg  body fat_%  diastolic  systolic  \
0  27.0       1      172.3      75.24        21.3       80.0     130.0   
1  25.0       1      165.0      55.80        15.7       77.0     126.0   
2  31.0       1      179.6      78.00        20.1       92.0     152.0   
3  32.0       1      174.5      71.10        18.4       76.0     147.0   
4  28.0       1      173.8      67.70        17.1       70.0     127.0   

   gripForce  sit and bend forward_cm  broad jump_cm  class  
0       54.9                     18.4          217.0      2  
1       36.4                     16.3          229.0      0  
2       44.8                     12.0          181.0      2  
3       41.4                     15.2          219.0      1  
4       43.5                     27.1          217.0      1  
SVM-padr-otm: 0.627 (+/- 0.004)


In [27]:
# sem sit and bend forward_cm
df_drop = df.drop(['sit and bend forward_cm', 'broad jump_cm'], axis=1)
print(df_drop.head())

X = df_drop.drop('class', axis=1)
y = df_drop['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed, shuffle=True, stratify=y)

svm_otm = ('SVM_OTM', SVC(random_state=seed, C=1000, decision_function_shape='ovo', gamma=0.01))
pipeline = Pipeline([standard_scaler, svm_otm])
cv_results = cross_val_score(pipeline, X_train, y_train, cv=kfold)
msg = f"{name}: {cv_results.mean():.3f} (+/- {cv_results.std():.3f})"
print(msg)

    age  gender  height_cm  weight_kg  body fat_%  diastolic  systolic  \
0  27.0       1      172.3      75.24        21.3       80.0     130.0   
1  25.0       1      165.0      55.80        15.7       77.0     126.0   
2  31.0       1      179.6      78.00        20.1       92.0     152.0   
3  32.0       1      174.5      71.10        18.4       76.0     147.0   
4  28.0       1      173.8      67.70        17.1       70.0     127.0   

   gripForce  sit-ups counts  class  
0       54.9            60.0      2  
1       36.4            53.0      0  
2       44.8            49.0      2  
3       41.4            53.0      1  
4       43.5            45.0      1  
SVM-padr-otm: 0.577 (+/- 0.012)


In [28]:
# sem sit and bend forward_cm
df_drop = df.drop(['sit and bend forward_cm', 'broad jump_cm', 'gripForce'], axis=1)
print(df_drop.head())

X = df_drop.drop('class', axis=1)
y = df_drop['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed, shuffle=True, stratify=y)

svm_otm = ('SVM_OTM', SVC(random_state=seed, C=1000, decision_function_shape='ovo', gamma=0.01))
pipeline = Pipeline([standard_scaler, svm_otm])
cv_results = cross_val_score(pipeline, X_train, y_train, cv=kfold)
msg = f"{name}: {cv_results.mean():.3f} (+/- {cv_results.std():.3f})"
print(msg)

    age  gender  height_cm  weight_kg  body fat_%  diastolic  systolic  \
0  27.0       1      172.3      75.24        21.3       80.0     130.0   
1  25.0       1      165.0      55.80        15.7       77.0     126.0   
2  31.0       1      179.6      78.00        20.1       92.0     152.0   
3  32.0       1      174.5      71.10        18.4       76.0     147.0   
4  28.0       1      173.8      67.70        17.1       70.0     127.0   

   sit-ups counts  class  
0            60.0      2  
1            53.0      0  
2            49.0      2  
3            53.0      1  
4            45.0      1  
SVM-padr-otm: 0.548 (+/- 0.011)


Sem ganho

## Ajustando as labels

In [29]:
df = pd.read_csv(file_path)
df['gender'] = df['gender'].map({'M': 1, 'F': 2})
df['class'] = df['class'].map({'A': 1, 'B': 2, 'C': 3, 'D': 4})
print(df.head())

X = df.drop('class', axis=1)
y = df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed, shuffle=True, stratify=y)

svm_otm = ('SVM_OTM', SVC(random_state=seed, C=1000, decision_function_shape='ovo', gamma=0.01))
pipeline = Pipeline([standard_scaler, svm_otm])
cv_results = cross_val_score(pipeline, X_train, y_train, cv=kfold)
msg = f"{name}: {cv_results.mean():.3f} (+/- {cv_results.std():.3f})"
print(msg)

    age  gender  height_cm  weight_kg  body fat_%  diastolic  systolic  \
0  27.0       1      172.3      75.24        21.3       80.0     130.0   
1  25.0       1      165.0      55.80        15.7       77.0     126.0   
2  31.0       1      179.6      78.00        20.1       92.0     152.0   
3  32.0       1      174.5      71.10        18.4       76.0     147.0   
4  28.0       1      173.8      67.70        17.1       70.0     127.0   

   gripForce  sit and bend forward_cm  sit-ups counts  broad jump_cm  class  
0       54.9                     18.4            60.0          217.0      3  
1       36.4                     16.3            53.0          229.0      1  
2       44.8                     12.0            49.0          181.0      3  
3       41.4                     15.2            53.0          219.0      2  
4       43.5                     27.1            45.0          217.0      2  
SVM-padr-otm: 0.706 (+/- 0.007)


## Ajustando porcentagem de treino

In [30]:
df = pd.read_csv(file_path)
df['gender'] = df['gender'].map({'M': 1, 'F': 2})
df['class'] = df['class'].map({'A': 1, 'B': 2, 'C': 3, 'D': 4})
print(df.head())
X = df.drop('class', axis=1)
y = df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed, shuffle=True, stratify=y)


svm_otm = ('SVM_OTM', SVC(random_state=seed, C=1000, decision_function_shape='ovo', gamma=0.01))
pipeline = Pipeline([standard_scaler, svm_otm])
cv_results = cross_val_score(pipeline, X_train, y_train, cv=kfold)
msg = f"{name}: {cv_results.mean():.3f} (+/- {cv_results.std():.3f})"
print(msg)

    age  gender  height_cm  weight_kg  body fat_%  diastolic  systolic  \
0  27.0       1      172.3      75.24        21.3       80.0     130.0   
1  25.0       1      165.0      55.80        15.7       77.0     126.0   
2  31.0       1      179.6      78.00        20.1       92.0     152.0   
3  32.0       1      174.5      71.10        18.4       76.0     147.0   
4  28.0       1      173.8      67.70        17.1       70.0     127.0   

   gripForce  sit and bend forward_cm  sit-ups counts  broad jump_cm  class  
0       54.9                     18.4            60.0          217.0      3  
1       36.4                     16.3            53.0          229.0      1  
2       44.8                     12.0            49.0          181.0      3  
3       41.4                     15.2            53.0          219.0      2  
4       43.5                     27.1            45.0          217.0      2  
SVM-padr-otm: 0.706 (+/- 0.005)


# Finalização

In [31]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

df = pd.read_csv(file_path)
df['gender'] = df['gender'].map({'M': 1, 'F': 2})
df['class'] = df['class'].map({'A': 1, 'B': 2, 'C': 3, 'D': 4})

# Amostragem aleatória que não passará pelo modelo e será usado para avaliar a generalização do modelo
sampled_df = df.sample(n=394, random_state=42)

df_final = df.drop(sampled_df.index)

X = df_final.drop('class', axis=1)
y = df_final['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed, shuffle=True, stratify=y)

standard_scaler = StandardScaler().fit(X_train)
X_train_scaled = standard_scaler.transform(X_train)
model = SVC(random_state=seed, C=1000, decision_function_shape='ovo', gamma=0.01)
model.fit(X_train_scaled, y_train)

X_test_scaled = standard_scaler.transform(X_test)
y_pred = model.predict(X_test_scaled)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

rescaledX = standard_scaler.transform(X)
print(rescaledX)
model.fit(rescaledX, y)

y_pred_sample = model.predict(standard_scaler.transform(sampled_df.drop('class', axis=1)))
sampled_df['class_pred'] = y_pred_sample
print(sampled_df[['class', 'class_pred']])
print(accuracy_score(sampled_df['class'], sampled_df['class_pred']))

0.7154370737755734
              precision    recall  f1-score   support

           1       0.70      0.87      0.77       808
           2       0.58      0.57      0.57       803
           3       0.70      0.64      0.67       807
           4       0.92      0.78      0.84       808

    accuracy                           0.72      3226
   macro avg       0.72      0.72      0.72      3226
weighted avg       0.72      0.72      0.72      3226

[[-0.72060173 -0.76739928  0.44865209 ...  0.37332193  1.42786734
   0.68824661]
 [-0.86708476 -0.76739928 -0.41761104 ...  0.12818012  0.93679556
   0.99046547]
 [-0.42763568 -0.76739928  1.31491521 ... -0.3737769   0.65618312
  -0.21840996]
 ...
 [-0.79384324 -0.76739928  1.16064918 ...  0.16320038  1.28756112
   1.06602018]
 [ 1.98933423 -0.76739928  0.74531754 ... -0.54887818 -1.09764466
  -0.49544391]
 [ 0.89071154  1.3031026  -1.41440696 ...  1.2721752  -0.46626666
  -1.37691557]]
       class  class_pred
12586      3           3
1087

## Exportação do modelo

In [32]:
import joblib

best_model = model
joblib.dump(best_model, './best_model.pkl')
joblib.dump(standard_scaler, './scaler.pkl')

['./scaler.pkl']

## Explorando mais os dados

### Pós processamento

In [33]:
import pandas as pd

file_path = './bodyPerformance.csv'
df = pd.read_csv(file_path)
df['gender'] = df['gender'].map({'M': 1, 'F': 2})
df['class'] = df['class'].map({'A': 1, 'B': 2, 'C': 3, 'D': 4})

# Definir a faixa etária para categorização (exemplo)
def categorize_age(age):
    if age < 20:
        return 'Menor de 20'
    elif 20 <= age < 30:
        return '20-29'
    elif 30 <= age < 40:
        return '30-39'
    elif 40 <= age < 50:
        return '40-49'
    else:
        return '50 ou mais'

# Adicionar coluna de faixa etária
df['age_group'] = df['age'].apply(categorize_age)

# Função para calcular a média de classificação
def calculate_mean_classification(df, height_threshold, gender):
    # Filtrar os dados
    filtered_df = df[(df['height_cm'] >= height_threshold) & (df['gender'] == gender)]
    
    # Agrupar por faixa etária e calcular a média da classificação
    mean_classification = filtered_df.groupby('age_group')['class'].mean().reset_index()
    mean_classification['class'] = mean_classification['class'].round().astype(int)
    return mean_classification

# Parâmetros de entrada
height_threshold = 170  # Altura mínima para o filtro
gender = 2  # Gênero para o filtro ( 1->'M' ou 2->'F')

# Calcular a média de classificação
mean_classification = calculate_mean_classification(df, height_threshold, gender)

# Mostrar o resultado
print(mean_classification)


    age_group  class
0       20-29      2
1       30-39      2
2       40-49      3
3  50 ou mais      3
