# Classificando se uma pessoa é homem ou mulher apenas pelo peso e altura

Obs.: Nenhum desses experimentos servirá para definir nem julgar ninguém. É apenas uma forma de demonstrar uma informação generalista. Por favor, não se sinta ofendido(a).

#### Dicionário de dados:
Gender -> Gênero<br>
Height -> Altura (em polegadas)<br>
Weight -> Peso (em libras)

In [1]:
# Importando as bibliotecas necessárias

import pandas as pd
import seaborn as sns

In [2]:
# Load the weight/height dataset from its CSV file into a DataFrame and
# preview the first 10 rows.
# NOTE(review): the path is absolute and specific to one Windows machine.

DATASET_PATH = r"C:\Users\jroque\Desktop\Outros\datasets-estudo\weight-height.csv"
df = pd.read_csv(DATASET_PATH)

df.head(n=10)

Unnamed: 0,Gender,Height,Weight
0,Male,73.847017,241.893563
1,Male,68.781904,162.310473
2,Male,74.110105,212.740856
3,Male,71.730978,220.04247
4,Male,69.881796,206.349801
5,Male,67.253016,152.212156
6,Male,68.785081,183.927889
7,Male,68.348516,167.97111
8,Male,67.01895,175.92944
9,Male,63.456494,156.399676


In [3]:
# Summarize the DataFrame: column names, non-null counts and dtypes

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Gender  10000 non-null  object 
 1   Height  10000 non-null  float64
 2   Weight  10000 non-null  float64
dtypes: float64(2), object(1)
memory usage: 234.5+ KB


In [4]:
# Convert Height to metres and store it in a new 'altura' column.
# The factor 0.0254 is metres per inch, so Height is treated as inches (not feet).

df['altura'] = df['Height'] * 0.0254

In [5]:
# Preview the first 10 rows to check the new 'altura' column

df.head(10)

Unnamed: 0,Gender,Height,Weight,altura
0,Male,73.847017,241.893563,1.875714
1,Male,68.781904,162.310473,1.74706
2,Male,74.110105,212.740856,1.882397
3,Male,71.730978,220.04247,1.821967
4,Male,69.881796,206.349801,1.774998
5,Male,67.253016,152.212156,1.708227
6,Male,68.785081,183.927889,1.747141
7,Male,68.348516,167.97111,1.736052
8,Male,67.01895,175.92944,1.702281
9,Male,63.456494,156.399676,1.611795


In [6]:
# Convert Weight to kilograms and store it in a new 'peso' column.
# The factor 0.45359237 is kilograms per pound, so Weight is treated as pounds.

df['peso'] = df['Weight'] * 0.45359237

In [7]:
# Preview the first 10 rows to check the new 'peso' column
df.head(10)

Unnamed: 0,Gender,Height,Weight,altura,peso
0,Male,73.847017,241.893563,1.875714,109.721075
1,Male,68.781904,162.310473,1.74706,73.622792
2,Male,74.110105,212.740856,1.882397,96.497629
3,Male,71.730978,220.04247,1.821967,99.809586
4,Male,69.881796,206.349801,1.774998,93.598695
5,Male,67.253016,152.212156,1.708227,69.042272
6,Male,68.785081,183.927889,1.747141,83.428287
7,Male,68.348516,167.97111,1.736052,76.190414
8,Male,67.01895,175.92944,1.702281,79.800252
9,Male,63.456494,156.399676,1.611795,70.9417


In [8]:
# The Height and Weight columns are no longer needed, so remove them from the
# DataFrame in place.

df.drop(columns=['Height', 'Weight'], inplace=True)

In [9]:
# Translate the gender labels from English to Portuguese

gender_labels = {'Male': 'Masculino', 'Female': 'Feminino'}
df['Gender'] = df['Gender'].map(gender_labels)

In [10]:
# Translate the remaining English column name to Portuguese.
# Renaming by name (instead of assigning a positional list of names) keeps
# 'altura' and 'peso' correctly labelled even if the column order changes.

df.rename(columns={'Gender': 'genero'}, inplace=True)

In [11]:
# Round the height and weight values to 2 decimal places

metric_columns = ['altura', 'peso']
df[metric_columns] = df[metric_columns].round(decimals=2)

In [12]:
# Preview the first 10 rows of the dataset

df.head(10)

Unnamed: 0,genero,altura,peso
0,Masculino,1.88,109.72
1,Masculino,1.75,73.62
2,Masculino,1.88,96.5
3,Masculino,1.82,99.81
4,Masculino,1.77,93.6
5,Masculino,1.71,69.04
6,Masculino,1.75,83.43
7,Masculino,1.74,76.19
8,Masculino,1.7,79.8
9,Masculino,1.61,70.94


In [13]:
# Check whether there are men taller than 1.80 m (the threshold the filter
# actually applies) and, if so, show the 10 greatest heights.

is_tall = df.altura > 1.80
is_male = df.genero == 'Masculino'
df[is_tall & is_male].sort_values(by='altura', ascending=False).head(10)

Unnamed: 0,genero,altura,peso
2014,Masculino,2.01,122.47
3757,Masculino,2.0,111.46
3285,Masculino,1.99,115.16
1317,Masculino,1.99,103.12
994,Masculino,1.98,115.98
4569,Masculino,1.97,109.79
2070,Masculino,1.97,114.56
1922,Masculino,1.97,105.53
4297,Masculino,1.96,109.07
3669,Masculino,1.96,114.04


In [14]:
# Same check for the female rows: heights above 1.80 m, 10 greatest first

is_tall = df.altura > 1.80
is_female = df.genero == 'Feminino'
df[is_tall & is_female].sort_values(by='altura', ascending=False).head(10)

Unnamed: 0,genero,altura,peso
7311,Feminino,1.86,86.22
7162,Feminino,1.84,80.73
7856,Feminino,1.84,85.47
5103,Feminino,1.83,80.45
8635,Feminino,1.83,87.33
8135,Feminino,1.82,84.21
6116,Feminino,1.81,79.39
7924,Feminino,1.81,73.62
8464,Feminino,1.81,82.59
9424,Feminino,1.81,79.85


### Notoriamente, os homens têm, em média, uma estrutura corporal maior do que a das mulheres. Isso não é novidade: é um fato já comprovado pela ciência.

Vamos agora construir nossa máquina preditiva usando apenas essas duas características do nosso dataset: altura e peso

In [15]:
# Import the classification tooling.
# The PyCaret library is used to save time when building the model.
# NOTE(review): the star import brings every public name of
# pycaret.classification into the notebook namespace.

from pycaret.classification import *

In [16]:
# Initialize the PyCaret environment with 'genero' as the prediction target.
# session_id pins the random seed so the train/test split and the metrics are
# repeatable; 5086 is the session_id reported by the recorded run.

experimento01 = setup(data=df, target='genero', session_id=5086)

Unnamed: 0,Description,Value
0,session_id,5086
1,Target,genero
2,Target Type,Binary
3,Label Encoded,"Feminino: 0, Masculino: 1"
4,Original Data,"(10000, 3)"
5,Missing Values,False
6,Numeric Features,2
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [17]:
# Compare the candidate classifiers to see which one performs best;
# the value returned by compare_models() is kept in `best`

best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
qda,Quadratic Discriminant Analysis,0.9181,0.9731,0.9209,0.9167,0.9186,0.8363,0.8366,0.009
lda,Linear Discriminant Analysis,0.9177,0.9731,0.92,0.9167,0.9182,0.8354,0.8357,0.007
gbc,Gradient Boosting Classifier,0.9176,0.971,0.9152,0.9203,0.9176,0.8351,0.8354,0.101
ridge,Ridge Classifier,0.9174,0.0,0.9186,0.9173,0.9178,0.8348,0.8351,0.006
lightgbm,Light Gradient Boosting Machine,0.9164,0.9691,0.9206,0.914,0.9171,0.8328,0.8332,0.205
ada,Ada Boost Classifier,0.913,0.9703,0.9061,0.9197,0.9127,0.826,0.8264,0.057
lr,Logistic Regression,0.9113,0.9716,0.9144,0.9096,0.9118,0.8225,0.8228,0.817
knn,K Neighbors Classifier,0.9041,0.95,0.903,0.9061,0.9043,0.8083,0.8086,0.02
svm,SVM - Linear Kernel,0.8996,0.0,0.9109,0.8947,0.9008,0.7991,0.8029,0.014
nb,Naive Bayes,0.8853,0.9561,0.8848,0.8868,0.8856,0.7705,0.7709,0.007


In [19]:
# Quadratic Discriminant Analysis performed best, so it is used to build the model.
# NOTE(review): the model id 'qda' is hard-coded here rather than taken from `best`.

qda_model = create_model('qda')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9043,0.9656,0.9231,0.8901,0.9063,0.8085,0.8091
1,0.9171,0.9763,0.9345,0.9036,0.9188,0.8343,0.8348
2,0.9186,0.9708,0.9088,0.9273,0.918,0.8372,0.8373
3,0.9214,0.9752,0.9316,0.9134,0.9224,0.8428,0.843
4,0.9086,0.9661,0.9117,0.9065,0.9091,0.8171,0.8172
5,0.92,0.9758,0.9091,0.9302,0.9195,0.84,0.8402
6,0.9343,0.9815,0.9261,0.9422,0.9341,0.8686,0.8687
7,0.9114,0.9687,0.9034,0.9191,0.9112,0.8229,0.823
8,0.91,0.9687,0.9006,0.9188,0.9096,0.82,0.8202
9,0.9356,0.9827,0.9601,0.9158,0.9374,0.8712,0.8722


In [20]:
# Evaluate the performance of the model

evaluate_model(qda_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [21]:
# Score the model on the test data that PyCaret set aside earlier

predict_model(qda_model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Quadratic Discriminant Analysis,0.9214,0.9742,0.9226,0.9189,0.9208,0.8427,0.8427


Unnamed: 0,altura,peso,genero,Label,Score
0,1.82,87.709999,Masculino,Masculino,0.9794
1,1.76,84.510002,Masculino,Masculino,0.9703
2,1.70,82.370003,Feminino,Masculino,0.9747
3,1.74,82.019997,Masculino,Masculino,0.9414
4,1.84,85.809998,Masculino,Masculino,0.9404
...,...,...,...,...,...
2996,1.72,75.419998,Masculino,Masculino,0.5789
2997,1.65,64.949997,Feminino,Feminino,0.9467
2998,1.55,50.500000,Feminino,Feminino,0.9992
2999,1.63,60.040001,Feminino,Feminino,0.9902


In [22]:
# With the model built, save it so it can be used by the Streamlit data app

save_model(qda_model, model_name = 'qda_model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[], target='genero',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_strat...
                 ('cluster_all', 'passthrough'),
                 ('dummy', Dummify(target='genero')),
                 ('fix_perfect', Remove_100(target='genero')),
                 ('clean_names', Clean_Colum_Names()),
                 ('feature_select', 'passthrough'), (