SCIKIT-LEARN (sklearn)

## 1. Importaciones Basicas

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

## 2. Division de Datos - train_test_split

In [None]:
# Examples of splitting X / y into train, validation and test subsets.
# X and y are expected to be defined before this cell runs.

# Basic 80/20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Stratified split (keeps the class proportions of y in both subsets)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Three-way split: 70% train, then the remaining 30% is halved into
# 15% validation and 15% test
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Key parameters:
# test_size: fraction of the data assigned to the test set (0.2 = 20%)
# random_state: seed for reproducibility
# stratify: keeps the class distribution in train and test
# shuffle: shuffles the data before splitting (True by default)

## 3. Escalado y Normalizacion de Datos

In [None]:
# Feature scaling examples. Each scaler is fitted on the training split only
# and then applied to both splits, so no test-set statistics leak into training.
# X_train / X_test are expected to come from an earlier train_test_split call.

# StandardScaler: zero mean and unit standard deviation
# Formula: (x - mean) / standard_deviation
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # fit on train, then transform it
X_test_scaled = scaler.transform(X_test)        # transform only; never fit on test

# MinMaxScaler: scales to the [0, 1] range
# Formula: (x - min) / (max - min)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# MinMaxScaler with a custom target range
scaler = MinMaxScaler(feature_range=(0, 10))  # scales to [0, 10]
X_train_scaled = scaler.fit_transform(X_train)
# Transform test with the same scaler; otherwise X_test_scaled would still hold
# the [0, 1] values produced by the previous example.
X_test_scaled = scaler.transform(X_test)

# RobustScaler: robust to outliers, uses the median and interquartile range
# Formula: (x - median) / IQR
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# IMPORTANT: always fit on train only, then transform both train and test

## 4. Codificacion de Variables Categoricas

In [None]:
# Categorical encoding examples. X, y and df are expected to be defined
# elsewhere; the column names used below are placeholders.

# LabelEncoder: maps categories to integers (0, 1, 2, ...)
# Intended for the target variable (y). It assigns codes in sorted label
# order, so for ordinal *features* use OrdinalEncoder (shown further down).
le = LabelEncoder()
y_encoded = le.fit_transform(y)
y_original = le.inverse_transform(y_encoded)  # map codes back to the original categories
print(le.classes_)  # show the original classes

# OneHotEncoder: creates one binary column per category
# Use for nominal (unordered) feature columns
encoder = OneHotEncoder(sparse_output=False, drop='first')  # drop='first' avoids multicollinearity
X_encoded = encoder.fit_transform(X[['categoria1', 'categoria2']])
columnas_nuevas = encoder.get_feature_names_out()  # names of the generated columns

# OneHotEncoder while keeping a DataFrame
encoder = OneHotEncoder(sparse_output=False)
X_encoded = encoder.fit_transform(df[['columna_cat']])
# Reuse df's index so the encoded frame stays row-aligned with df when it is
# later joined/concatenated (the default RangeIndex would misalign rows
# whenever df has a non-default index).
df_encoded = pd.DataFrame(
    X_encoded,
    columns=encoder.get_feature_names_out(),
    index=df.index,
)

# OrdinalEncoder: maps categories to integers respecting an explicit order
# Use when the categories are ordered (bajo < medio < alto)
encoder = OrdinalEncoder(categories=[['bajo', 'medio', 'alto']])
X_encoded = encoder.fit_transform(X[['nivel']])

# pd.get_dummies: pandas alternative for one-hot encoding
df_encoded = pd.get_dummies(df, columns=['categoria'], drop_first=True)

## 5. Modelos de Regresion - Predecir valores continuos

In [None]:
# Linear regression family: plain, L2 (Ridge), L1 (Lasso) and mixed (ElasticNet).
# Each example refits `modelo` and overwrites `y_pred`.
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

# Plain linear regression
modelo = LinearRegression()
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)
print(f'Coeficientes: {modelo.coef_}')
print(f'Intercepto: {modelo.intercept_}')
print(f'R2 Score: {modelo.score(X_test, y_test)}')

# Ridge regression: L2 regularization, penalizes large coefficients
# Useful when features are multicollinear
modelo = Ridge(alpha=1.0)  # alpha controls the regularization strength
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)

# Lasso regression: L1 regularization, can drive coefficients to exactly 0
# Useful for feature selection
modelo = Lasso(alpha=1.0)
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)

# ElasticNet: combines L1 and L2
modelo = ElasticNet(alpha=1.0, l1_ratio=0.5)  # l1_ratio controls the L1/L2 mix
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)

## 6. Regresion Logistica - Clasificacion binaria y multiclase

In [None]:
# Logistic regression examples for binary and multiclass classification.
from sklearn.linear_model import LogisticRegression

# Binary classification
modelo = LogisticRegression(random_state=42)
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)
y_pred_proba = modelo.predict_proba(X_test)  # probabilities for each class

# Multiclass classification
# solver: optimization algorithm
# 'lbfgs': good for small datasets
# 'saga': good for large datasets
# 'newton-cg': accurate but slow
# NOTE: `multi_class='multinomial'` is intentionally not passed. The parameter
# is deprecated since scikit-learn 1.5, and with the 'lbfgs' solver the
# multinomial formulation is already what is used for multiclass targets.
modelo = LogisticRegression(solver='lbfgs', random_state=42)
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)

# With regularization
modelo = LogisticRegression(penalty='l2', C=1.0, random_state=42)  # C is the inverse of alpha
modelo.fit(X_train, y_train)

# Key parameters:
# C: inverse of the regularization strength (smaller C = stronger regularization)
# penalty: 'l1', 'l2', 'elasticnet' or None (the string 'none' is no longer accepted)
# max_iter: maximum number of iterations (increase it if the solver does not converge)

## 7. Arboles de Decision

In [None]:
# Decision tree examples: classifier, feature importances, regressor and plot.
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# Classification
modelo = DecisionTreeClassifier(
    max_depth=5,              # maximum depth of the tree
    min_samples_split=20,     # minimum samples required to split a node
    min_samples_leaf=10,      # minimum samples required in a leaf
    random_state=42
)
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)

# Feature importance: importance of each input variable, printed by position
importancias = modelo.feature_importances_
for i, imp in enumerate(importancias):
    print(f'Feature {i}: {imp}')

# Regression
modelo = DecisionTreeRegressor(max_depth=5, random_state=42)
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)

# Visualize the tree (this plots the regressor fitted just above)
# NOTE(review): ['feat1', 'feat2'] are placeholder names; replace them with the
# real feature names of X_train, one per column.
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
plt.figure(figsize=(20,10))
plot_tree(modelo, filled=True, feature_names=['feat1', 'feat2'])
plt.show()

## 8. Random Forest - Ensemble de arboles

In [None]:
# Random forest examples: classifier, feature importances, regressor, OOB score.
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# Classification
modelo = RandomForestClassifier(
    n_estimators=100,         # number of trees
    max_depth=10,             # maximum depth of each tree
    min_samples_split=20,
    min_samples_leaf=10,
    max_features='sqrt',      # features considered per split: 'sqrt', 'log2', int, float
    random_state=42,
    n_jobs=-1                 # use every CPU core
)
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)
y_pred_proba = modelo.predict_proba(X_test)

# Feature importance, sorted from most to least important
# (reads X_train.columns, so X_train must be a DataFrame here)
importancias = pd.DataFrame({
    'feature': X_train.columns,
    'importance': modelo.feature_importances_
}).sort_values('importance', ascending=False)

# Regression
modelo = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)

# Out-of-bag score: performance estimate without cross-validation
modelo = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=42)
modelo.fit(X_train, y_train)
print(f'OOB Score: {modelo.oob_score_}')

## 9. Gradient Boosting - Ensemble secuencial

In [None]:
# Gradient boosting examples (sequential tree ensemble): classifier and regressor.
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

# Classification
modelo = GradientBoostingClassifier(
    n_estimators=100,         # number of trees
    learning_rate=0.1,        # learning rate (smaller = more conservative)
    max_depth=3,              # maximum depth of each tree
    min_samples_split=20,
    min_samples_leaf=10,
    subsample=0.8,            # fraction of samples used for each tree
    random_state=42
)
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)
y_pred_proba = modelo.predict_proba(X_test)

# Regression
modelo = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)

# Feature importance (of the regressor fitted just above)
importancias = modelo.feature_importances_

## 10. Support Vector Machines (SVM)

In [None]:
# Support vector machine examples: several SVC kernels and an SVR regressor.
from sklearn.svm import SVC, SVR

# Classification
modelo = SVC(
    kernel='rbf',             # 'linear', 'poly', 'rbf', 'sigmoid'
    C=1.0,                    # regularization parameter
    gamma='scale',            # kernel coefficient: 'scale', 'auto', float
    random_state=42
)
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)

# SVC with probability estimates (probability=True is required for predict_proba)
modelo = SVC(kernel='rbf', probability=True, random_state=42)
modelo.fit(X_train, y_train)
y_pred_proba = modelo.predict_proba(X_test)

# Linear kernel (faster for linearly separable data)
modelo = SVC(kernel='linear', C=1.0, random_state=42)
modelo.fit(X_train, y_train)

# Polynomial kernel
modelo = SVC(kernel='poly', degree=3, C=1.0, random_state=42)
modelo.fit(X_train, y_train)

# Regression
modelo = SVR(kernel='rbf', C=1.0, epsilon=0.1)
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)

# IMPORTANT: SVMs need scaled data to work well

## 11. K-Nearest Neighbors (KNN)

In [None]:
# K-nearest-neighbors examples: classifier (uniform / distance weights) and regressor.
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

# Classification
modelo = KNeighborsClassifier(
    n_neighbors=5,            # number of neighbors
    weights='uniform',        # 'uniform' or 'distance' (weight votes by distance)
    metric='minkowski',       # distance metric: 'euclidean', 'manhattan', 'minkowski'
    p=2                       # p=1 Manhattan, p=2 Euclidean
)
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)
y_pred_proba = modelo.predict_proba(X_test)

# KNN with distance weighting
modelo = KNeighborsClassifier(n_neighbors=5, weights='distance')
modelo.fit(X_train, y_train)

# Regression
modelo = KNeighborsRegressor(n_neighbors=5, weights='uniform')
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)

# IMPORTANT: KNN needs scaled data to work well

## 12. Naive Bayes

In [None]:
# Naive Bayes examples: Gaussian, multinomial and Bernoulli variants.
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

# GaussianNB: for continuous features assumed to be normally distributed
modelo = GaussianNB()
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)
y_pred_proba = modelo.predict_proba(X_test)

# MultinomialNB: for count features (text, frequencies)
# Requires non-negative values
modelo = MultinomialNB(alpha=1.0)  # alpha: smoothing parameter
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)

# BernoulliNB: for binary (0/1) features
modelo = BernoulliNB(alpha=1.0)
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)

## 13. Clustering - Aprendizaje no supervisado

In [None]:
# Unsupervised clustering examples: KMeans (with elbow method), DBSCAN and
# agglomerative clustering. X is expected to be defined elsewhere.
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score

# KMeans: partitions the data into K clusters
modelo = KMeans(
    n_clusters=3,             # number of clusters
    init='k-means++',         # initialization method
    n_init=10,                # number of runs with different initial centroids
    max_iter=300,
    random_state=42
)
clusters = modelo.fit_predict(X)
centroides = modelo.cluster_centers_

# Evaluate clustering quality
inercia = modelo.inertia_  # sum of squared distances to the nearest centroid
silhouette = silhouette_score(X, clusters)  # between -1 and 1, higher is better

# Elbow method to look for a good K
# n_init is set explicitly (same value as the model above): its default changed
# between scikit-learn releases, so pinning it keeps the inertia curve
# comparable across versions and avoids the transitional FutureWarning.
inertias = []
for k in range(1, 11):
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km.fit(X)
    inertias.append(km.inertia_)

# DBSCAN: density-based clustering, does not need K to be specified
modelo = DBSCAN(
    eps=0.5,                  # maximum distance between two points of the same neighborhood
    min_samples=5             # minimum number of neighboring points for a core point
)
clusters = modelo.fit_predict(X)
# label -1 marks noise points (outliers)

# Hierarchical clustering
modelo = AgglomerativeClustering(
    n_clusters=3,             # number of clusters
    linkage='ward'            # 'ward', 'complete', 'average', 'single'
)
clusters = modelo.fit_predict(X)

## 14. Reduccion de Dimensionalidad

In [None]:
# Dimensionality reduction examples: PCA and t-SNE.
# X is expected to be defined elsewhere.
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# PCA: Principal Component Analysis
# Reduces dimensions while keeping as much variance as possible
pca = PCA(n_components=2)  # reduce to 2 components
X_pca = pca.fit_transform(X)

# Variance explained by each component
print(f'Varianza explicada: {pca.explained_variance_ratio_}')
print(f'Varianza acumulada: {pca.explained_variance_ratio_.cumsum()}')

# PCA keeping a given fraction of the variance
pca = PCA(n_components=0.95)  # keep 95% of the variance
X_pca = pca.fit_transform(X)
print(f'Numero de componentes: {pca.n_components_}')

# Principal axes of the fitted PCA (often referred to as loadings)
loadings = pca.components_

# t-SNE: non-linear embedding, meant for visualization
# Slower than PCA but usually better at showing cluster structure
# The iteration count is left at the library default (1000). The keyword that
# sets it was renamed from `n_iter` to `max_iter` in scikit-learn 1.5, so
# passing either name explicitly breaks on some versions.
tsne = TSNE(
    n_components=2,
    perplexity=30,            # balance between local and global structure
    random_state=42
)
X_tsne = tsne.fit_transform(X)

# IMPORTANT: apply PCA after scaling the data

## 15. Metricas de Evaluacion - Clasificacion

In [None]:
# Classification metrics examples. y_test, y_pred and y_pred_proba are expected
# to come from a classifier fitted in an earlier cell.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve

# Basic metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # average: 'binary', 'weighted', 'macro', 'micro'
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Layout for the binary case: [[TN, FP],
#                              [FN, TP]]

# Full classification report
print(classification_report(y_test, y_pred))

# ROC AUC score (needs probabilities)
# NOTE: the binary and multiclass calls below are alternatives; use the one
# that matches the problem.
# Binary classification: pass the probability of the positive class
auc = roc_auc_score(y_test, y_pred_proba[:, 1])

# Multiclass classification: pass the full probability matrix
auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='weighted')

# ROC curve (binary case)
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba[:, 1])

# Plot the confusion matrix
import seaborn as sns
import matplotlib.pyplot as plt
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

## 16. Metricas de Evaluacion - Regresion

In [None]:
# Regression metrics examples. y_test and y_pred are expected to come from a
# regressor fitted in an earlier cell; `plt` must already be imported.
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error

# Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # Root Mean Squared Error

# Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# R2 score (coefficient of determination)
# 1 = perfect, 0 = no better than always predicting the mean
r2 = r2_score(y_test, y_pred)

# Mean Absolute Percentage Error (MAPE)
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R2: {r2}')
print(f'MAPE: {mape}')

# Residuals
residuales = y_test - y_pred

# Residual plot: predictions on the x axis, residuals on the y axis,
# with a dashed reference line at zero
plt.scatter(y_pred, residuales)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted')
plt.ylabel('Residuals')
plt.show()

## 17. Validacion Cruzada

In [None]:
# Cross-validation examples. `modelo`, X and y are expected to be defined in
# earlier cells; scoring='accuracy' assumes `modelo` is a classifier.
from sklearn.model_selection import cross_val_score, cross_validate, KFold, StratifiedKFold

# Simple cross-validation with 5 folds
scores = cross_val_score(modelo, X, y, cv=5, scoring='accuracy')
print(f'Scores: {scores}')
print(f'Media: {scores.mean()}')
print(f'Desviacion: {scores.std()}')

# Cross-validation with an explicit, shuffled KFold
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(modelo, X, y, cv=kfold, scoring='accuracy')

# StratifiedKFold: keeps the class proportions in every fold
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(modelo, X, y, cv=skfold, scoring='accuracy')

# Cross-validation with several metrics at once
# (cross_validate returns a dict keyed by 'test_<metric>' / 'train_<metric>')
scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted']
scores = cross_validate(modelo, X, y, cv=5, scoring=scoring, return_train_score=True)
print(f'Test Accuracy: {scores["test_accuracy"].mean()}')
print(f'Test Precision: {scores["test_precision_weighted"].mean()}')

# Scoring options for classification:
# 'accuracy', 'precision', 'recall', 'f1', 'roc_auc'
# 'precision_weighted', 'recall_weighted', 'f1_weighted'

# Scoring options for regression:
# 'neg_mean_squared_error', 'neg_mean_absolute_error', 'r2'

## 18. Busqueda de Hiperparametros - Grid Search

In [None]:
# Exhaustive hyperparameter search over a random forest with GridSearchCV.
from sklearn.model_selection import GridSearchCV

# Parameter grid: 3 * 4 * 3 * 3 = 108 combinations, each evaluated with 5 folds
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Grid search
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,                     # number of folds
    scoring='accuracy',
    n_jobs=-1,                # use every core
    verbose=1,                # show progress
    return_train_score=True
)
grid_search.fit(X_train, y_train)

# Best parameters and score
print(f'Mejores parametros: {grid_search.best_params_}')
print(f'Mejor score: {grid_search.best_score_}')

# Best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Detailed results, best mean test score first
resultados = pd.DataFrame(grid_search.cv_results_)
print(resultados[['params', 'mean_test_score', 'std_test_score']].sort_values('mean_test_score', ascending=False))

## 19. Busqueda de Hiperparametros - Randomized Search

In [None]:
# Randomized hyperparameter search over a random forest with RandomizedSearchCV.
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Parameter distributions to sample from
# (scipy's randint(low, high) excludes `high`; uniform(loc, scale) covers
# [loc, loc + scale], i.e. [0.1, 1.0] for max_features)
param_dist = {
    'n_estimators': randint(50, 500),
    'max_depth': randint(5, 50),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    'max_features': uniform(0.1, 0.9)
}

# Randomized search (evaluates a fixed number of sampled candidates instead of
# every combination)
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=100,               # number of sampled combinations to try
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    random_state=42
)
random_search.fit(X_train, y_train)

# Best parameters and score
print(f'Mejores parametros: {random_search.best_params_}')
print(f'Mejor score: {random_search.best_score_}')

# Best model
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

## 20. Pipelines - Concatenar transformaciones y modelos

In [None]:
# Pipeline examples: chaining preprocessing and a model, per-column
# preprocessing with ColumnTransformer, and grid search over a pipeline.
# Relies on estimators imported in earlier cells (StandardScaler, PCA,
# LogisticRegression, RandomForestClassifier, OneHotEncoder, GridSearchCV).
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer

# Simple pipeline with explicitly named steps
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('modelo', LogisticRegression(random_state=42))
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Pipeline built with make_pipeline (step names are generated automatically)
pipeline = make_pipeline(
    StandardScaler(),
    PCA(n_components=10),
    RandomForestClassifier(random_state=42)
)
pipeline.fit(X_train, y_train)

# ColumnTransformer: apply different transformations to different columns
# (the column names below are placeholders for the real dataset's columns)
columnas_numericas = ['edad', 'salario', 'experiencia']
columnas_categoricas = ['ciudad', 'departamento']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), columnas_numericas),
        ('cat', OneHotEncoder(drop='first'), columnas_categoricas)
    ])

# Full pipeline using the ColumnTransformer
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Grid search over the pipeline
# Nested parameters are addressed as <step>__<sub-step>__<parameter>
param_grid = {
    'preprocessor__num__with_mean': [True, False],
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [5, 10, None]
}
grid_search = GridSearchCV(pipeline, param_grid, cv=5)
grid_search.fit(X_train, y_train)

## 21. Seleccion de Features

In [None]:
# Feature selection examples: univariate (SelectKBest), recursive elimination
# (RFE) and model-based (SelectFromModel).
# NOTE(review): k=10 / n_features_to_select=10 presumably require X_train to
# have at least 10 features - adjust to the dataset.
from sklearn.feature_selection import SelectKBest, f_classif, f_regression, chi2
from sklearn.feature_selection import RFE, SelectFromModel

# SelectKBest: keeps the K best-scoring features
# For classification
selector = SelectKBest(score_func=f_classif, k=10)  # keep the 10 best
X_selected = selector.fit_transform(X_train, y_train)
selected_features = selector.get_support(indices=True)  # integer indices of kept features

# For regression
selector = SelectKBest(score_func=f_regression, k=10)
X_selected = selector.fit_transform(X_train, y_train)

# Chi2: for non-negative features
selector = SelectKBest(score_func=chi2, k=10)
X_selected = selector.fit_transform(X_train, y_train)

# RFE: Recursive Feature Elimination
# Repeatedly drops the weakest features
selector = RFE(
    estimator=RandomForestClassifier(random_state=42),
    n_features_to_select=10,
    step=1                    # number of features removed at each step
)
selector.fit(X_train, y_train)
X_selected = selector.transform(X_train)
selected_features = selector.support_
ranking = selector.ranking_

# SelectFromModel: selects features based on the model's importances
selector = SelectFromModel(
    estimator=RandomForestClassifier(random_state=42),
    threshold='median'        # importance threshold: 'mean', 'median', float
)
selector.fit(X_train, y_train)
X_selected = selector.transform(X_train)
selected_features = selector.get_support()  # boolean mask of kept features

## 22. Manejo de Datos Desbalanceados

In [None]:
# Handling imbalanced classes: class weights and a custom decision threshold.
from sklearn.utils.class_weight import compute_class_weight

# Class weights: penalize mistakes on the minority class more heavily
# Option 1: compute them manually
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
# Key the dict by the actual class labels (np.unique returns them in the same
# sorted order the weights were computed for). Keying by position would only be
# correct when the labels happen to be exactly 0..n_classes-1.
class_weight_dict = dict(zip(np.unique(y_train), class_weights))

# Option 2: let the estimator apply 'balanced' weights itself
modelo = LogisticRegression(class_weight='balanced', random_state=42)
modelo.fit(X_train, y_train)

# For Random Forest
modelo = RandomForestClassifier(class_weight='balanced', random_state=42)
modelo.fit(X_train, y_train)

# For SVM
# probability=True is required because predict_proba is called on this model
# below; without it SVC does not expose probability estimates.
modelo = SVC(class_weight='balanced', probability=True, random_state=42)
modelo.fit(X_train, y_train)

# Adjust the decision threshold (binary case: column 1 is the positive class)
y_pred_proba = modelo.predict_proba(X_test)[:, 1]
threshold = 0.3  # lower the threshold to flag more positives
y_pred_adjusted = (y_pred_proba >= threshold).astype(int)

## 23. Guardar y Cargar Modelos

In [None]:
# Saving and loading fitted models with joblib and pickle.
# SECURITY NOTE: loading a pickle/joblib file can execute arbitrary code.
# Only load files that come from a trusted source.
import joblib
import pickle

# Save the model with joblib (recommended)
joblib.dump(modelo, 'modelo.pkl')

# Load the model with joblib
modelo_cargado = joblib.load('modelo.pkl')
y_pred = modelo_cargado.predict(X_test)

# Save with pickle (note: this overwrites the same 'modelo.pkl' file)
with open('modelo.pkl', 'wb') as file:
    pickle.dump(modelo, file)

# Load with pickle
with open('modelo.pkl', 'rb') as file:
    modelo_cargado = pickle.load(file)

# Save a complete pipeline (preprocessing + model in a single file)
pipeline = make_pipeline(StandardScaler(), RandomForestClassifier())
pipeline.fit(X_train, y_train)
joblib.dump(pipeline, 'pipeline_completo.pkl')

# Load the pipeline
pipeline_cargado = joblib.load('pipeline_completo.pkl')
y_pred = pipeline_cargado.predict(X_test)

## 24. Metodos Ensemble Adicionales

In [None]:
# Additional ensemble methods: voting, bagging and AdaBoost.
from sklearn.ensemble import VotingClassifier, VotingRegressor, BaggingClassifier, AdaBoostClassifier

# Voting classifier: combines several models
modelo1 = LogisticRegression(random_state=42)
modelo2 = RandomForestClassifier(random_state=42)
modelo3 = SVC(probability=True, random_state=42)  # probability=True so it can take part in soft voting

# Hard voting: majority vote
voting = VotingClassifier(
    estimators=[('lr', modelo1), ('rf', modelo2), ('svc', modelo3)],
    voting='hard'
)
voting.fit(X_train, y_train)
y_pred = voting.predict(X_test)

# Soft voting: (weighted) average of predicted probabilities
voting = VotingClassifier(
    estimators=[('lr', modelo1), ('rf', modelo2), ('svc', modelo3)],
    voting='soft',
    weights=[1, 2, 1]         # weight of each model
)
voting.fit(X_train, y_train)
y_pred = voting.predict(X_test)

# Bagging: many models of the same type trained on random samples
bagging = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=100,
    max_samples=0.8,          # fraction of samples drawn for each model
    max_features=0.8,         # fraction of features drawn for each model
    random_state=42
)
bagging.fit(X_train, y_train)
y_pred = bagging.predict(X_test)

# AdaBoost: sequential ensemble with adaptive sample weights
# (max_depth=1 trees, i.e. decision stumps, as base learners)
adaboost = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),
    n_estimators=100,
    learning_rate=1.0,
    random_state=42
)
adaboost.fit(X_train, y_train)
y_pred = adaboost.predict(X_test)

## 25. Feature Engineering con sklearn

In [None]:
# Feature engineering transformers: polynomial features, binarization and
# discretization. X is expected to be defined elsewhere.
from sklearn.preprocessing import PolynomialFeatures, FunctionTransformer
from sklearn.preprocessing import Binarizer, KBinsDiscretizer

# PolynomialFeatures: generate polynomial and interaction features
# Example with degree=2: [a, b] -> [a, b, a^2, ab, b^2]
# (include_bias=False below omits the constant 1 column)
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)
feature_names = poly.get_feature_names_out()

# Binarizer: convert values to 0/1 according to a threshold
# NOTE(review): transform is called without a prior fit; presumably fine
# because the threshold is fixed - confirm for the installed version.
binarizer = Binarizer(threshold=0.5)
X_binary = binarizer.transform(X)

# KBinsDiscretizer: discretize continuous variables into bins
discretizer = KBinsDiscretizer(
    n_bins=5,                 # number of bins
    encode='ordinal',         # 'ordinal', 'onehot', 'onehot-dense'
    strategy='quantile'       # 'uniform', 'quantile', 'kmeans'
)
X_discretized = discretizer.fit_transform(X)

# FunctionTransformer: aplicar funcion personalizada
def log_transform(X):
    """Return the element-wise natural logarithm of ``1 + X``.

    Thin wrapper around ``np.log1p`` so it can be handed to a
    ``FunctionTransformer``.

    Args:
        X: Scalar or array-like of numeric values.

    Returns:
        ``np.log1p(X)``: a NumPy scalar for scalar input, otherwise an array
        with the same shape as ``X``.
    """
    transformed = np.log1p(X)
    return transformed

# Wrap log_transform in a FunctionTransformer so it exposes the usual
# fit/transform interface, then apply it to X.
transformer = FunctionTransformer(log_transform)
X_log = transformer.fit_transform(X)

# Interacciones personalizadas en Pipeline
def crear_interacciones(X):
    """Append a product and a ratio feature built from the first two columns.

    Args:
        X: 2-D array-like or DataFrame with at least two numeric columns.

    Returns:
        A new ``pandas.DataFrame`` with the original columns followed by:

        * ``'interaccion_1_2'``: first column * second column
        * ``'ratio_1_2'``: first column / (second column + 1)

        All column labels are returned as strings.
    """
    # Work on a copy so the caller's data is never touched.
    df = pd.DataFrame(X).copy()

    # Positional access works for arrays (default 0..n-1 labels) and for
    # DataFrames with named columns, where df[0] / df[1] would raise KeyError.
    primera = df.iloc[:, 0]
    segunda = df.iloc[:, 1]

    df['interaccion_1_2'] = primera * segunda
    df['ratio_1_2'] = primera / (segunda + 1)

    # Array input yields integer labels next to the two string labels added
    # above; scikit-learn estimators reject DataFrames whose column names mix
    # types, so normalize every label to str.
    df.columns = df.columns.astype(str)
    return df

# Use the custom interaction function as the first pipeline step, followed by
# scaling and a logistic regression. The pipeline is only built here, not fitted.
transformer = FunctionTransformer(crear_interacciones)
pipeline = make_pipeline(transformer, StandardScaler(), LogisticRegression())

## 26. Manejo de Valores Faltantes

In [None]:
# Missing-value imputation examples. Each example refits `imputer` on X_train
# and overwrites X_imputed.
from sklearn.impute import SimpleImputer, KNNImputer

# SimpleImputer: simple imputation
# Strategies: 'mean', 'median', 'most_frequent', 'constant'
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_train)

# Impute with the median
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X_train)

# Impute with the mode (suitable for categorical columns)
imputer = SimpleImputer(strategy='most_frequent')
X_imputed = imputer.fit_transform(X_train)

# Impute with a constant
imputer = SimpleImputer(strategy='constant', fill_value=0)
X_imputed = imputer.fit_transform(X_train)

# KNNImputer: imputation based on the K nearest neighbors
imputer = KNNImputer(n_neighbors=5)
X_imputed = imputer.fit_transform(X_train)

# Usage inside a Pipeline (built here, not fitted)
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier())
])

## 27. Inspeccion y Analisis de Modelos

In [None]:
# Model inspection examples: permutation importance, partial dependence,
# learning curves and validation curves. `modelo` is expected to be a fitted
# estimator from an earlier cell, and `plt` to be imported already.
from sklearn.inspection import permutation_importance, partial_dependence, PartialDependenceDisplay

# Permutation importance: importance measured by shuffling each feature
# Often considered more reliable than the impurity-based feature_importances_
# of a Random Forest
resultado = permutation_importance(modelo, X_test, y_test, n_repeats=10, random_state=42)
importancias = resultado.importances_mean  # mean over the 10 repeats

for i, imp in enumerate(importancias):
    print(f'Feature {i}: {imp}')

# Partial dependence: effect of a feature on the prediction
# Shows how the prediction changes as one feature varies
from sklearn.inspection import PartialDependenceDisplay
features = [0, 1, (0, 1)]  # two single features and one pair of features
PartialDependenceDisplay.from_estimator(modelo, X, features)

# Learning curves: diagnose overfitting / underfitting
from sklearn.model_selection import learning_curve

train_sizes, train_scores, val_scores = learning_curve(
    modelo, X, y,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
    scoring='accuracy'
)

# Average the scores over the CV folds (axis 1) for each training size
train_mean = train_scores.mean(axis=1)
val_mean = val_scores.mean(axis=1)

plt.plot(train_sizes, train_mean, label='Training score')
plt.plot(train_sizes, val_mean, label='Validation score')
plt.xlabel('Training size')
plt.ylabel('Score')
plt.legend()
plt.show()

# Validation curves: evaluate the effect of a single hyperparameter
from sklearn.model_selection import validation_curve

param_range = [10, 50, 100, 200, 500]
train_scores, val_scores = validation_curve(
    RandomForestClassifier(random_state=42), X, y,
    param_name='n_estimators',
    param_range=param_range,
    cv=5,
    scoring='accuracy'
)

## 28. Calibracion de Probabilidades

In [None]:
# Probability calibration example (binary classification is assumed by the
# use of column 1 below). `plt` must already be imported.
from sklearn.calibration import CalibratedClassifierCV, calibration_curve

# Calibrate the probabilities of a model
# Useful when the predicted probabilities are not reliable
modelo_base = RandomForestClassifier(random_state=42)
modelo_calibrado = CalibratedClassifierCV(modelo_base, method='sigmoid', cv=5)
modelo_calibrado.fit(X_train, y_train)
y_pred_proba = modelo_calibrado.predict_proba(X_test)

# method='sigmoid': Platt scaling (works well for SVM-like models)
# method='isotonic': isotonic regression (more flexible, needs more data)

# Evaluate calibration: observed positive rate vs. mean predicted probability per bin
prob_true, prob_pred = calibration_curve(y_test, y_pred_proba[:, 1], n_bins=10)

# The dashed diagonal is the reference for a perfectly calibrated model
plt.plot([0, 1], [0, 1], linestyle='--', label='Perfectly calibrated')
plt.plot(prob_pred, prob_true, marker='o', label='Model')
plt.xlabel('Mean predicted probability')
plt.ylabel('Fraction of positives')
plt.legend()
plt.show()

## 29. Deteccion de Outliers

In [None]:
# Outlier detection examples. Each detector overwrites `outliers`, so the
# cleaning step at the end uses the result of the last one (EllipticEnvelope).
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope

# Isolation Forest
iso_forest = IsolationForest(
    contamination=0.1,        # expected proportion of outliers
    random_state=42
)
outliers = iso_forest.fit_predict(X)  # -1 for outliers, 1 for inliers

# Local Outlier Factor
lof = LocalOutlierFactor(
    n_neighbors=20,
    contamination=0.1
)
outliers = lof.fit_predict(X)

# Elliptic Envelope: assumes a Gaussian distribution
envelope = EllipticEnvelope(contamination=0.1, random_state=42)
outliers = envelope.fit_predict(X)

# Use the outlier labels to clean the data: keep only rows labelled 1 (inliers)
mask_inliers = outliers == 1
X_clean = X[mask_inliers]
y_clean = y[mask_inliers]

## 30. Tips y Mejores Practicas

In [None]:
# Tips and best practices. Relies on names defined in earlier cells
# (X, y, X_train, X_test, y_train, y_test, scaler, joblib, ...).

# 1. ALWAYS scale data for SVM, KNN and regularized regression
# It is NOT needed for decision trees and Random Forest

# 2. ALWAYS set random_state for reproducibility
modelo = RandomForestClassifier(random_state=42)

# 3. ALWAYS fit the scaler on train only, then transform train and test
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Use a Pipeline to avoid data leakage
pipeline = make_pipeline(StandardScaler(), LogisticRegression())
pipeline.fit(X_train, y_train)  # the scaler is fitted on the training data only

# 5. Use stratify in train_test_split for imbalanced data
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# 6. Cross-validation is better than a single train/test split
scores = cross_val_score(modelo, X, y, cv=5)

# 7. For large datasets, use n_jobs=-1 to parallelize
# (random_state kept, in line with tip 2)
modelo = RandomForestClassifier(n_jobs=-1, random_state=42)

# 8. GridSearch can be very slow; consider RandomizedSearch
# GridSearch: tries every combination
# RandomizedSearch: tries N random combinations

# 9. Check the class distribution in classification problems
print(pd.Series(y).value_counts())

# 10. For interpretable models: Logistic Regression, Decision Trees
# For best performance: Random Forest, Gradient Boosting, XGBoost

# 11. Always check the range of the features before modeling
print(X.describe())

# 12. Document the chosen hyperparameters and why
# Use descriptive names for pipelines and models

# 13. Save the final model together with its full pipeline
joblib.dump(pipeline, 'modelo_completo.pkl')

# 14. Track metrics on train and test to detect overfitting
# The estimator created in tip 7 has not been fitted yet (cross_val_score in
# tip 6 only fits internal clones), so fit it before calling score().
modelo.fit(X_train, y_train)
train_score = modelo.score(X_train, y_train)
test_score = modelo.score(X_test, y_test)
print(f'Train: {train_score}, Test: {test_score}')

# 15. For data with many features, consider PCA or feature selection