In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix 

In [47]:
df= sns.load_dataset('penguins')
df=df.dropna()
df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male
...,...,...,...,...,...,...,...
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,Female
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


VARIÁVEL RESPOSTA

In [48]:

variavel_c= (df['species'],df['island'],df['sex'])
df=df.drop(columns=['species','island','sex'])

In [49]:
mean_values= df.mean()
std_values= df.std()

In [50]:
df['species'],df['island'],df['sex']= variavel_c

In [35]:
df

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,species,island,sex
0,39.1,18.7,181.0,3750.0,Adelie,Torgersen,Male
1,39.5,17.4,186.0,3800.0,Adelie,Torgersen,Female
2,40.3,18.0,195.0,3250.0,Adelie,Torgersen,Female
4,36.7,19.3,193.0,3450.0,Adelie,Torgersen,Female
5,39.3,20.6,190.0,3650.0,Adelie,Torgersen,Male
...,...,...,...,...,...,...,...
338,47.2,13.7,214.0,4925.0,Gentoo,Biscoe,Female
340,46.8,14.3,215.0,4850.0,Gentoo,Biscoe,Female
341,50.4,15.7,222.0,5750.0,Gentoo,Biscoe,Male
342,45.2,14.8,212.0,5200.0,Gentoo,Biscoe,Female


## VARIÁVEIS NUMÉRICAS

In [51]:
df[['bill_length_mm','bill_depth_mm','flipper_length_mm','body_mass_g']].head() #filtrando as culms numéricas

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
0,39.1,18.7,181.0,3750.0
1,39.5,17.4,186.0,3800.0
2,40.3,18.0,195.0,3250.0
4,36.7,19.3,193.0,3450.0
5,39.3,20.6,190.0,3650.0


In [52]:
# escala min / max
min1 = df['bill_depth_mm'].min()
max1 = df['bill_depth_mm'].max()
print(f'bill_depth_mm min: {min1}')
print(f'bill_depth_mm max: {max1}')

min2 = df['bill_length_mm'].min()
max2 = df['bill_length_mm'].max()
print(f'bill_length_mm min: {min2}')
print(f'bill_length_mm max: {max2}')

min3 = df['flipper_length_mm'].min()
max3 = df['flipper_length_mm'].max()
print(f'flipper_length_mm min: {min3}')
print(f'flipper_length_mm max: {max3}')

min4 = df['body_mass_g'].min()
max4 = df['body_mass_g'].max()
print(f'body_mass_g min: {min4}')
print(f'body_mass_g max: {max4}')

bill_depth_mm min: 13.1
bill_depth_mm max: 21.5
bill_length_mm min: 32.1
bill_length_mm max: 59.6
flipper_length_mm min: 172.0
flipper_length_mm max: 231.0
body_mass_g min: 2700.0
body_mass_g max: 6300.0


In [53]:
# criando uma nova coluna padronizando seus valores. A nova coluna tem o mesmo nome da coluna original acrescida de "_std" e fazendo o calculo de normalização para reduzir a escala
df['bill_depth_mm_std'] = df['bill_depth_mm'].apply(lambda x: (x-min1)/(max1-min1))
df['bill_length_mm_std'] = df['bill_length_mm'].apply(lambda x: (x-min2)/(max2-min2))
df['flipper_length_mm_std'] = df['flipper_length_mm'].apply(lambda x: (x-min3)/(max3-min3))
df['body_mass_g_std']=df['body_mass_g'].apply(lambda x: (x-min4)/(max4-min4))

# padronizando a média e o desvio padrão
# de uma forma menos trabalhosa 
variaveis_numericas = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g'] # criando uma var. 
for variavel in variaveis_numericas: #loop
    coluna_std = f'{variavel}_std' # criando as colunas com o _std no final 
    df[coluna_std] = (df[variavel] - df[variavel].mean()) / df[variavel].std() # realizando o processo de padronização dos valores da variável numérica.




In [39]:
df

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,species,island,sex,bill_depth_mm_std,bill_length_mm_std,flipper_length_mm_std,body_mass_g_std
0,39.1,18.7,181.0,3750.0,Adelie,Torgersen,Male,0.779559,-0.894695,-1.424608,-0.567621
1,39.5,17.4,186.0,3800.0,Adelie,Torgersen,Female,0.119404,-0.821552,-1.067867,-0.505525
2,40.3,18.0,195.0,3250.0,Adelie,Torgersen,Female,0.424091,-0.675264,-0.425733,-1.188572
4,36.7,19.3,193.0,3450.0,Adelie,Torgersen,Female,1.084246,-1.333559,-0.568429,-0.940192
5,39.3,20.6,190.0,3650.0,Adelie,Torgersen,Male,1.744400,-0.858123,-0.782474,-0.691811
...,...,...,...,...,...,...,...,...,...,...,...
338,47.2,13.7,214.0,4925.0,Gentoo,Biscoe,Female,-1.759497,0.586470,0.929884,0.891616
340,46.8,14.3,215.0,4850.0,Gentoo,Biscoe,Female,-1.454811,0.513326,1.001232,0.798473
341,50.4,15.7,222.0,5750.0,Gentoo,Biscoe,Male,-0.743875,1.171621,1.500670,1.916186
342,45.2,14.8,212.0,5200.0,Gentoo,Biscoe,Female,-1.200905,0.220750,0.787187,1.233139


VARIÁVEIS CATEGÓRICAS

In [54]:
valores_unicos_species = df['island'].unique()
valores_unicos_species

array(['Torgersen', 'Biscoe', 'Dream'], dtype=object)

In [55]:
df['island_T_nom'] = df['island'].apply(lambda species: 1 if species == 'Torgersen' else 0)
df['island_B_nom'] = df['island'].apply(lambda species: 1 if species == 'Biscoe' else 0)
df['island_D_nom'] = df['island'].apply(lambda species: 1 if species == 'Dream' else 0)

In [56]:
valores_unicos_species = df['sex'].unique()
valores_unicos_species

array(['Male', 'Female'], dtype=object)

In [57]:
df['sex_M_nom'] = df['sex'].apply(lambda sex: 1 if sex == 'Male' else 0)
df['sex_F_nom'] = df['sex'].apply(lambda sex: 1 if sex == 'Female' else 0)

In [44]:
df

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,species,island,sex,bill_depth_mm_std,bill_length_mm_std,flipper_length_mm_std,body_mass_g_std,island_T_nom,island_B_nom,island_D_nom,sex_M_nom,sex_F_nom
0,39.1,18.7,181.0,3750.0,Adelie,Torgersen,Male,0.779559,-0.894695,-1.424608,-0.567621,1,0,0,1,0
1,39.5,17.4,186.0,3800.0,Adelie,Torgersen,Female,0.119404,-0.821552,-1.067867,-0.505525,1,0,0,0,1
2,40.3,18.0,195.0,3250.0,Adelie,Torgersen,Female,0.424091,-0.675264,-0.425733,-1.188572,1,0,0,0,1
4,36.7,19.3,193.0,3450.0,Adelie,Torgersen,Female,1.084246,-1.333559,-0.568429,-0.940192,1,0,0,0,1
5,39.3,20.6,190.0,3650.0,Adelie,Torgersen,Male,1.744400,-0.858123,-0.782474,-0.691811,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
338,47.2,13.7,214.0,4925.0,Gentoo,Biscoe,Female,-1.759497,0.586470,0.929884,0.891616,0,1,0,0,1
340,46.8,14.3,215.0,4850.0,Gentoo,Biscoe,Female,-1.454811,0.513326,1.001232,0.798473,0,1,0,0,1
341,50.4,15.7,222.0,5750.0,Gentoo,Biscoe,Male,-0.743875,1.171621,1.500670,1.916186,0,1,0,1,0
342,45.2,14.8,212.0,5200.0,Gentoo,Biscoe,Female,-1.200905,0.220750,0.787187,1.233139,0,1,0,0,1


LIMPEZA

In [58]:
# descartando as colunas originais e renomeando a coluna species

df.rename(columns={'species': 'species_nom'}, inplace=True)

colunas_originais = [coluna for coluna in df.columns if not (coluna.endswith(('_nom','_std','_ord')))] # colunas_originais é usada para identificar todas as colunas que não possuem o sufixo "_nom". 

df = df.drop(columns=colunas_originais) # método drop() para excluir as colunas originais, deixando as colunas com o final _nom


df.head(2)

Unnamed: 0,species_nom,bill_depth_mm_std,bill_length_mm_std,flipper_length_mm_std,body_mass_g_std,island_T_nom,island_B_nom,island_D_nom,sex_M_nom,sex_F_nom
0,Adelie,0.779559,-0.894695,-1.424608,-0.567621,1,0,0,1,0
1,Adelie,0.119404,-0.821552,-1.067867,-0.505525,1,0,0,0,1


MÉDIA e DESVIO PADRÃO

In [164]:
mean_values= df.mean()
std_values= df.std()

TREINO

In [59]:
# Dividir os dados em recursos (X) e variável de destino (y)
X = df[['bill_depth_mm_std', 'bill_length_mm_std', 'flipper_length_mm_std', 'body_mass_g_std', 
        'island_T_nom', 'island_B_nom', 'island_D_nom', 'sex_M_nom', 'sex_F_nom']]
y = df['species_nom']

# TREINO E TESTE
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=42)

#iniciando o Modelo
modelo= RandomForestClassifier(random_state=42)

# TREINO
modelo.fit(X_train, y_train)
# Previsao no conjunto de teste
y_pred= modelo.predict(X_test)

#Avaliação
precisao= accuracy_score(y_test, y_pred)
print(f'Previsão do Modelo: {precisao}')


Previsão do Modelo: 1.0


AVALIAÇÃO

In [60]:
# Calcular a matriz de confusão
matriz_confusao = confusion_matrix(y_test, y_pred)

# Calcular as métricas de classificação (precisão, recall, F1-score)
relatorio_classificacao = classification_report(y_test, y_pred)

# Imprimir a matriz de confusão e o relatório de classificação
print("Matriz de Confusão:")
print(matriz_confusao)
print("\nRelatório de Classificação:")
print(relatorio_classificacao)

Matriz de Confusão:
[[31  0  0]
 [ 0 13  0]
 [ 0  0 23]]

Relatório de Classificação:
              precision    recall  f1-score   support

      Adelie       1.00      1.00      1.00        31
   Chinstrap       1.00      1.00      1.00        13
      Gentoo       1.00      1.00      1.00        23

    accuracy                           1.00        67
   macro avg       1.00      1.00      1.00        67
weighted avg       1.00      1.00      1.00        67



PREVISÕES

In [66]:
# dados do novo pinguin

novo_island = input("Digite a ilha do novo pinguim (Torgersen, Biscoe, Dream): ")
novo_depth = float(input("Digite a profundidade do bico do novo pinguim: "))
novo_length = float(input("Digite o comprimento do bico do novo pinguim: "))
novo_flipper = float(input("Digite o comprimento da nadadeira do novo pinguim: "))
novo_massa = float(input("Digite a massa do corpo do novo pinguim: "))
novo_sex = input("Digite o sexo do novo pinguim (Male, Female): ")

#normalizando os novos dados numéricos
#médias e desvios padrão do novo conjunto de treinamento
novo_depth_std = (novo_depth - mean_values['bill_depth_mm']) / std_values['bill_depth_mm']
novo_length_std = (novo_length - mean_values['bill_length_mm']) / std_values['bill_length_mm']
novo_flipper_std = (novo_flipper - mean_values['flipper_length_mm']) / std_values['flipper_length_mm']
novo_massa_std = (novo_massa - mean_values['body_mass_g']) / std_values['body_mass_g']

# Codificação one-hot para a ilha (island)
if novo_island == 'Torgersen':
    novo_island_T = 1
    novo_island_B = 0
    novo_island_D = 0
elif novo_island == 'Biscoe':
    novo_island_T = 0
    novo_island_B = 1
    novo_island_D = 0
elif novo_island == 'Dream':
    novo_island_T = 0
    novo_island_B = 0
    novo_island_D = 1

# Codificação one-hot para o sexo (sex)
if novo_sex == 'Male':
    novo_sex_M = 1
    novo_sex_F = 0
elif novo_sex == 'Female':
    novo_sex_M = 0
    novo_sex_F = 1

# Fazer previsões nos novos dados
nova_previsao = modelo.predict([[novo_depth_std, novo_length_std, novo_flipper_std, novo_massa_std,
                                 novo_island_T, novo_island_B, novo_island_D, novo_sex_M, novo_sex_F]])

print("Previsão para o Novo Pinguim:", nova_previsao[0])

ValueError: could not convert string to float: 'yt'