# New Model

**AIMS**

Se realiza:
- Limpieza de datos.
- Selección de variables.
- Ajuste y guardado de modelos reducidos.
- Creación de funciones para pruebas de modelos.

In [1]:
# Modulos a usar
import pandas as pd
import pickle
import plotly.express as px
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, f1_score

# encoder = OrdinalEncoder()

## 1. Limpieza de datos al archivo más grande

Como conclusión es una perdida de tiempo por que solo son necesarias 3 variables.

In [2]:
df=pd.read_csv('data/penguins_lter.csv')
df = df.rename(columns=lambda x: x.replace(' ', '_').lower())
df.rename(columns={'culmen_length_(mm)':'culmen_length', 
                   'culmen_depth_(mm)':'culmen_depth',
                   'flipper_length_(mm)':'flipper_length',
                   'body_mass_(g)':'body_mass',
                   'delta_15_n_(o/oo)':'delta_15',
                   'delta_13_c_(o/oo)':'delta_13'
                   }, inplace=True)


df = df.drop(['region', 'stage'], axis=1)

## Variable salida
df["species"] = LabelEncoder().fit_transform(df.species)

# Otras variables label
df["individual_id"] = LabelEncoder().fit_transform(df.individual_id)
df["clutch_completion"] = LabelEncoder().fit_transform(df.clutch_completion)
df["date_egg"] = LabelEncoder().fit_transform(df.date_egg)
df["sex"] = LabelEncoder().fit_transform(df.sex)
df["comments"] = LabelEncoder().fit_transform(df.comments)

# OHE
one_hot_encoded = pd.get_dummies(df['island'], prefix='island', drop_first=True)
df = pd.concat([df, one_hot_encoded], axis=1)
df["island_Dream"] = LabelEncoder().fit_transform(df.island_Dream)
df["island_Torgersen"] = LabelEncoder().fit_transform(df.island_Torgersen)

one_hot_encoded = pd.get_dummies(df['studyname'], prefix='studyname', drop_first=True)
df = pd.concat([df, one_hot_encoded], axis=1)
df["studyname_PAL0809"] = LabelEncoder().fit_transform(df.studyname_PAL0809)
df["studyname_PAL0910"] = LabelEncoder().fit_transform(df.studyname_PAL0910)

df = df.drop(['studyname','island'], axis=1)

### imputar datos
imputer = KNNImputer(n_neighbors = 3)
df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

df.head()

Unnamed: 0,sample_number,species,individual_id,clutch_completion,date_egg,culmen_length,culmen_depth,flipper_length,body_mass,sex,delta_15,delta_13,comments,island_Dream,island_Torgersen,studyname_PAL0809,studyname_PAL0910
0,1.0,0.0,22.0,1.0,3.0,39.1,18.7,181.0,3750.0,2.0,9.270653,-25.00928,5.0,0.0,1.0,0.0,0.0
1,2.0,0.0,23.0,1.0,3.0,39.5,17.4,186.0,3800.0,1.0,8.94956,-24.69454,7.0,0.0,1.0,0.0,0.0
2,3.0,0.0,44.0,1.0,15.0,40.3,18.0,195.0,3250.0,1.0,8.36821,-25.33302,7.0,0.0,1.0,0.0,0.0
3,4.0,0.0,45.0,1.0,15.0,44.8,15.333333,206.666667,4400.0,3.0,8.199337,-25.401957,0.0,0.0,1.0,0.0,0.0
4,5.0,0.0,66.0,1.0,15.0,36.7,19.3,193.0,3450.0,1.0,8.76651,-25.32426,7.0,0.0,1.0,0.0,0.0


In [3]:
print(df.shape)
df.info()

(344, 17)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sample_number      344 non-null    float64
 1   species            344 non-null    float64
 2   individual_id      344 non-null    float64
 3   clutch_completion  344 non-null    float64
 4   date_egg           344 non-null    float64
 5   culmen_length      344 non-null    float64
 6   culmen_depth       344 non-null    float64
 7   flipper_length     344 non-null    float64
 8   body_mass          344 non-null    float64
 9   sex                344 non-null    float64
 10  delta_15           344 non-null    float64
 11  delta_13           344 non-null    float64
 12  comments           344 non-null    float64
 13  island_Dream       344 non-null    float64
 14  island_Torgersen   344 non-null    float64
 15  studyname_PAL0809  344 non-null    float64
 16  studyname_PAL091

## 2. Selección automatica de variables

In [4]:
# Separar las características (X) y la variable objetivo (y)
X = df.drop(['species'], axis=1)
y = df['species']

In [5]:
# Escalar las características
scaler = MinMaxScaler()
scaler2 = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = scaler2.fit_transform(X_scaled)

# Dividir el conjunto de datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=420)

In [6]:
# Crear el modelo de árbol de clasificación (puedes cambiarlo por otro modelo si lo prefieres)
model = DecisionTreeClassifier(random_state=42)

# Seleccionar variables con RFE
num_features_to_select = 2  # Número de características a seleccionar
rfe = RFE(model, n_features_to_select=num_features_to_select)
X_train_rfe = rfe.fit_transform(X_train, y_train)

# Entrenar el modelo con las variables seleccionadas
model.fit(X_train_rfe, y_train)

# Realizar predicciones en el conjunto de prueba
X_test_rfe = rfe.transform(X_test)
y_pred = model.predict(X_test_rfe)

# Calcular las metricas del modelo
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
print(f'Precisión del modelo: {accuracy:.2f}, recall {recall:.2f}, y f1 {f1:.2f}')

Precisión del modelo: 0.91, recall 0.91, y f1 0.90


In [7]:
# Obtener las variables seleccionadas por RFE
selected_features = X.columns[rfe.support_]
print("\nVariables seleccionadas por RFE:")
print(selected_features)


Variables seleccionadas por RFE:
Index(['culmen_length', 'flipper_length'], dtype='object')


In [8]:
# pickle_out = open("cl_dt.pkl","wb")
# pickle.dump(model, pickle_out)
# pickle_out.close()

In [9]:
# Crear el modelo de árbol de clasificación (puedes cambiarlo por otro modelo si lo prefieres)
model = LogisticRegression(random_state=42)

# Seleccionar variables con RFE
num_features_to_select = 2  # Número de características a seleccionar
rfe = RFE(model, n_features_to_select=num_features_to_select)
X_train_rfe = rfe.fit_transform(X_train, y_train)

# Entrenar el modelo con las variables seleccionadas
model.fit(X_train_rfe, y_train)

# Realizar predicciones en el conjunto de prueba
X_test_rfe = rfe.transform(X_test)
y_pred = model.predict(X_test_rfe)

# Calcular las metricas del modelo
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
print(f'Precisión del modelo: {accuracy:.2f}, recall {recall:.2f}, y f1 {f1:.2f}')

Precisión del modelo: 0.96, recall 0.93, y f1 0.94


In [10]:
# Obtener las variables seleccionadas por RFE
selected_features = X.columns[rfe.support_]
print("\nVariables seleccionadas por RFE:")
print(selected_features)


Variables seleccionadas por RFE:
Index(['culmen_length', 'culmen_depth'], dtype='object')


In [11]:
# pickle_out = open("cl_lr.pkl","wb")
# pickle.dump(model, pickle_out)
# pickle_out.close()

In [12]:
fig = px.scatter(df, x="culmen_length", y="flipper_length", color='species')
fig.show()

In [13]:
fig = px.scatter(df, x="culmen_length", y="culmen_depth", color='species')
fig.show()

# 3. Re hacer los modelos 

El objetivo es saltar el paso de crear multiples estandarizaciones de datos.

In [14]:
# Separar las características (X) y la variable objetivo (y)
X = df[['culmen_length','flipper_length','culmen_depth']]
y = df['species']

# Escalar las características
scaler = MinMaxScaler()
scaler2 = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = scaler2.fit_transform(X_scaled)

scaler_min_max = {'min': scaler.data_min_, 'max': scaler.data_max_}
print(scaler_min_max)

scaler_std = {'mean': scaler2.mean_, 'std': scaler2.scale_}
print(scaler_std)

{'min': array([ 32.1, 172. ,  13.1]), 'max': array([ 59.6, 231. ,  21.5])}
{'mean': array([0.43028894, 0.49119695, 0.48097776]), 'std': array([0.19775223, 0.23784136, 0.23468846])}


In [15]:
# Dividir el conjunto de datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=420)

In [16]:
# Modelo Decesion tree
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred = dt.predict(X_test)

# Calcular las metricas del modelo
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
print(f'Precisión del modelo: {accuracy:.2f}, recall {recall:.2f}, y f1 {f1:.2f}')

Precisión del modelo: 0.94, recall 0.94, y f1 0.93


In [17]:
pickle_out = open("cl_dt.pkl","wb")
pickle.dump(dt, pickle_out)
pickle_out.close()

In [18]:
# Modelo regresion logistica
lr = LogisticRegression(random_state=42).fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Calcular las metricas del modelo
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
print(f'Precisión del modelo: {accuracy:.2f}, recall {recall:.2f}, y f1 {f1:.2f}')

Precisión del modelo: 0.99, recall 0.98, y f1 0.98


In [19]:
pickle_out = open("cl_lr.pkl","wb")
pickle.dump(lr, pickle_out)
pickle_out.close()

4. Crear funciones para hacer el predict

In [20]:
def func_transform(user_input):
    user_input_scaled = (user_input - scaler_min_max['min']) / (scaler_min_max['max'] - scaler_min_max['min'])
    user_input_scaled = (user_input_scaled - scaler_std['mean']) / scaler_std['std']
    return user_input_scaled

def salida_pred(predicted):
    if(predicted[0] == 0):
        prediction="Adelie Penguin (Pygoscelis adeliae)"
    elif(predicted[0] == 1):
        prediction="Chinstrap penguin (Pygoscelis antarctica)"
    else:
        prediction="Gentoo penguin (Pygoscelis papua)"
    return {
        'Prediccion de especie': prediction
    } 

In [21]:
user_input = [39.1, 181, 18.7]
user_input_scaled = func_transform(user_input)

# print(dt.predict([user_input_scaled]))
# print(lr.predict([user_input_scaled]))

print(salida_pred(dt.predict([user_input_scaled])))
print(salida_pred(lr.predict([user_input_scaled])))

{'Prediccion de especie': 'Adelie Penguin (Pygoscelis adeliae)'}
{'Prediccion de especie': 'Adelie Penguin (Pygoscelis adeliae)'}


In [22]:
user_input = [46.5, 192, 17.9]
user_input_scaled = func_transform(user_input)

# print(dt.predict([user_input_scaled]))
# print(lr.predict([user_input_scaled]))

print(salida_pred(dt.predict([user_input_scaled])))
print(salida_pred(lr.predict([user_input_scaled])))

{'Prediccion de especie': 'Chinstrap penguin (Pygoscelis antarctica)'}
{'Prediccion de especie': 'Chinstrap penguin (Pygoscelis antarctica)'}


In [23]:
user_input = [46.1, 211, 13.2]
user_input_scaled = func_transform(user_input)

# print(dt.predict([user_input_scaled]))
# print(lr.predict([user_input_scaled]))

print(salida_pred(dt.predict([user_input_scaled])))
print(salida_pred(lr.predict([user_input_scaled])))

{'Prediccion de especie': 'Gentoo penguin (Pygoscelis papua)'}
{'Prediccion de especie': 'Gentoo penguin (Pygoscelis papua)'}


In [24]:
print('ok_')

ok_
