In [4]:
import pandas as pd
from sklearn.base import clone
import warnings
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OrdinalEncoder

from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from itertools import product
from sklearn.neighbors import LocalOutlierFactor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
import seaborn as sns

# Labels assigned to the CSV columns, in file order. The file's first row
# (presumably its own header) is skipped and these names are used instead.
columns = [
    'Base year', 'Subscriber serial number', 'attempt code',
    'gender code', 'Age code (5 years increments)',
    'Height (in 5cm increments)', 'Weight (in 5 kg units)', 'Waist circumference',
    'Vision (left)', 'Vision (right)',
    'Hearing (left)', 'Hearing (right)',
    'systolic blood pressure', 'diastolic blood pressure',
    'Pre-meal blood sugar (fasting blood sugar)',
    'total cholesterol', 'triglycerides', 'Cholesterol (HDL)', 'Cholesterol (LDL)',
    'hemoglobin', 'urine protein', 'Serum creatinine',
    'Liver function test (AST)', 'Liver function test (ALT)', 'Gamma GT',
    'Smoking status', 'Drinking status',
    'Whether to undergo oral examination', 'Presence of dental caries',
    'Whether or not the defect is healed', 'Presence or absence of tooth wear',
    'Third molars (wisdom teeth) or more', 'tartar',
]

# Location of the raw dataset, relative to the notebook's working directory.
DATA_PATH = 'Smoking/data-kor.CSV'

# The file is decoded as cp949; `names` replaces whatever the skipped row held.
df_original = pd.read_csv(
    DATA_PATH,
    encoding='cp949',
    names=columns,
    skiprows=1,
)

# Age-bracket labels in code order: the label at 1-based position k is the
# bracket for age code k.
_etiquetas_edad = [
    '0-1', '1-4', '5-9', '10-14', '15-19', '20-24',
    '25-29', '30-34', '35-39', '40-44', '45-49', '50-54',
    '55-59', '60-64', '65-69', '70-74', '75-79', '80-mas',
]

# [code, label] pairs. Kept as a list of two-item lists because other cells
# read the label through index 1 of each pair.
edades = [[codigo, etiqueta] for codigo, etiqueta in enumerate(_etiquetas_edad, start=1)]

# Lookup table used to translate the numeric age code into its bracket label.
edad = pd.DataFrame(edades, columns=['Age code (5 years increments)', 'Age'])
# Attach the readable age bracket to every row, then discard the numeric code
# it came from. With a left join, codes absent from `edad` get a missing 'Age'.
df_original = (
    df_original
    .merge(edad, on='Age code (5 years increments)', how='left')
    .drop(columns='Age code (5 years increments)')
)

# 'Drinking status' is the target, so rows where it is missing are discarded.
df_original_Base = df_original.dropna(subset=['Drinking status'])

# Target vector, and feature matrix with a helper column that combines the
# target with 'gender code' so the split can be stratified on both at once.
Y_original = df_original_Base['Drinking status']
X_original = df_original_Base.drop(columns='Drinking status').assign(
    stratify_col=Y_original.astype(str) + '_' + df_original_Base['gender code'].astype(str)
)

# 80/20 split, stratified on the combined key, with a fixed seed.
X_train_orig, X_test_orig, Y_train_orig, Y_test_orig = train_test_split(
    X_original,
    Y_original,
    test_size=0.2,
    stratify=X_original['stratify_col'],
    random_state=42,
)

# The helper column only served the split; remove it from both partitions.
X_train_orig, X_test_orig = (
    particion.drop(columns='stratify_col') for particion in (X_train_orig, X_test_orig)
)

# Columns handled as categorical features (imputed with the most frequent
# value by the preprocessing transformer).
col_categoricas = [
    'gender code',
    'Hearing (left)', 'Hearing (right)',
    'urine protein',
    'Smoking status',
    'Whether to undergo oral examination',
]

# Columns handled as numeric features: these are the ones checked for
# outliers and passed to the numeric imputer.
col_numericas = [
    'Height (in 5cm increments)', 'Weight (in 5 kg units)', 'Waist circumference',
    'Vision (left)', 'Vision (right)',
    'systolic blood pressure', 'diastolic blood pressure',
    'Pre-meal blood sugar (fasting blood sugar)',
    'hemoglobin',
    'Serum creatinine',
    'Liver function test (AST)', 'Liver function test (ALT)',
    'Gamma GT',
]


# Three-sigma filter computed on the training split only: per-column mean and
# standard deviation of the numeric features.
numericas_train = X_train_orig[col_numericas]
mean = numericas_train.mean()
std = numericas_train.std()

# Accepted band for each numeric column.
lower_bound = mean - 3 * std
upper_bound = mean + 3 * std

# Cell-wise flags for values outside their column's band. Missing values
# compare False on both sides, so they are never flagged.
outliers = numericas_train.lt(lower_bound) | numericas_train.gt(upper_bound)

# Keep only the rows in which no numeric column was flagged.
mask_no_outliers = ~outliers.any(axis=1)
X_train_orig_sin_outliers_std = X_train_orig.loc[mask_no_outliers]
Y_train_orig_sin_outliers_std = Y_train_orig.loc[mask_no_outliers]

print('Tamaño del dataset actual de entrenamiento: ', X_train_orig_sin_outliers_std.shape)

Tamaño del dataset actual de entrenamiento:  (743024, 32)


In [5]:
# Bracket labels in the order given by `edades`, so the ordinal codes follow
# that same order.
orden_edades = [etiqueta for _codigo, etiqueta in edades]

# 'Age': ordinal-encode the bracket label, then fill gaps with the mode.
# NOTE(review): the encoder runs before the imputer, so a missing 'Age' reaches
# the encoder first — confirm that is handled as intended.
age_pipeline = Pipeline(steps=[
    ('ordinal', OrdinalEncoder(categories=[orden_edades])),
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

# Preprocessing: mode imputation for categorical columns, iterative imputation
# (constant initial fill) for numeric ones; every other column is dropped.
transformer = ColumnTransformer(
    transformers=[
        ('age', age_pipeline, ['Age']),
        ('Categoricas', SimpleImputer(strategy='most_frequent'), col_categoricas),
        ('numericas', IterativeImputer(initial_strategy='constant'), col_numericas),
    ],
    remainder='drop',
)

# 10-fold stratified cross-validation repeated 3 times (30 fits), fixed seed.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

pipeline = Pipeline(steps=[
    ('preprocessor', transformer),
    ('model', LogisticRegression(max_iter=1000)),
])
pipeline_copy = clone(pipeline)

scores = cross_val_score(
    pipeline_copy,
    X_train_orig_sin_outliers_std,
    Y_train_orig_sin_outliers_std,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)

# Report the estimator class name with mean accuracy and two standard deviations.
print(f'Resultados para datos con {type(pipeline.named_steps["model"]).__name__}:')
print(f'Accuracy: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})')

KeyboardInterrupt: 

In [6]:
# Fit a fresh, unfitted copy of the preprocessing transformer on the filtered
# training data so the imputed values can be inspected.
copy_transformed = clone(transformer)

# The transformer's output follows the order of its steps: the encoded age,
# then the categorical columns, then the numeric columns.
columnas = ['age', *col_categoricas, *col_numericas]
X_train_sin_out_imputed = pd.DataFrame(
    copy_transformed.fit_transform(X_train_orig_sin_outliers_std),
    columns=columnas,
)

# Show the first rows before and after preprocessing.
for marco in (X_train_orig_sin_outliers_std, X_train_sin_out_imputed):
    print(marco.head(5))

        Base year  Subscriber serial number  attempt code  gender code  \
9314         2021                   4693858            41            1   
648392       2021                   2904270            42            1   
119261       2021                   4944774            28            2   
175267       2021                   1751785            47            1   
976114       2021                   3356122            27            1   

        Height (in 5cm increments)  Weight (in 5 kg units)  \
9314                           165                      70   
648392                         165                      70   
119261                         150                      65   
175267                         175                      85   
976114                         155                      65   

        Waist circumference  Vision (left)  Vision (right)  Hearing (left)  \
9314                   87.0            0.9             1.2             1.0   
648392                 86.

In [10]:
# Scalers to compare
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

escaladores = [StandardScaler(), MinMaxScaler(), RobustScaler()]

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# `transformer` emits a plain array laid out as: encoded age (1 column), the
# categorical columns, then the numeric columns. The scaler therefore has to
# address the numeric block by position rather than by name.
primera_numerica = 1 + len(col_categoricas)
idx_numericas = list(range(primera_numerica, primera_numerica + len(col_numericas)))

for escalador in escaladores:

    # Scale only the numeric block; age and categorical codes pass through.
    # (The previous version passed the training DataFrame here instead of the
    # scaler, so no scaler was ever applied.)
    scaler = ColumnTransformer(
                transformers=[
                    ('scaler', clone(escalador), idx_numericas),
                ], remainder='passthrough')

    # Encode/impute first: the raw frame holds string age labels, missing
    # values and identifier columns that LogisticRegression cannot consume.
    pipeline = Pipeline(steps=[
        ('preprocessor', clone(transformer)),
        ('scaler_x', scaler),
        ('model', LogisticRegression(max_iter=1000)),
    ])
    scores = cross_val_score(pipeline, X_train_orig_sin_outliers_std, Y_train_orig_sin_outliers_std, scoring='accuracy', cv=cv, n_jobs=-1)

    # Label each result with the scaler under test (the model never changes).
    print(f'Resultados para datos con {escalador.__class__.__name__}:')
    print(f'Accuracy: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})')

KeyboardInterrupt: 

In [9]:


# Count the null values in each row and show the rows that have more than 5.
# NOTE(review): the earlier comment described the cutoff as "more than 50% of
# the values", but the code has always compared against 5 — confirm which is
# intended.
df_original_aaaa = df_original_Base.isnull().sum(axis=1)
# `df_original_aaaa` is already a per-row Series of counts, so it is compared
# directly; calling .isnull().sum(axis=1) on it again fails (no axis 1).
df_original_Base[df_original_aaaa > 5]


0         7
1         9
2         5
3         9
4         9
         ..
999995    5
999996    5
999997    9
999998    5
999999    9
Length: 999820, dtype: int64