# Proyecto de Clasificación: Predicción de Pases (Deep Optimization)

Búsqueda exhaustiva de la mejor configuración posible (objetivo: accuracy de validación > 83 %).
1. **Análisis Exploratorio y Limpieza**.
2. **Feature Engineering Robusto**: `Has_Spent`, `Ticket_Type` (Unknown), `FoodShare`, `Under21`.
3. **Optimización Profunda**: GridSearch Masivo sobre Gradient Boosting.
4. **Validación Completa**: Curva ROC, Matriz de Confusión, Errores.
5. **Submission**.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Modelos y Métricas
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, roc_auc_score

# Show every column when a DataFrame is displayed instead of eliding wide frames.
pd.set_option('display.max_columns', None)
# Global plotting look: matplotlib's 'ggplot' style with seaborn's 'viridis' palette.
plt.style.use('ggplot')
sns.set_palette("viridis")

## 1. Carga y EDA

In [2]:
# Load the training and test tables; both paths are relative to the current
# working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Discard 'AvgTime' from each frame independently. errors='ignore' turns the
# drop into a no-op where the column is absent, so a test file without
# 'AvgTime' no longer raises KeyError just because the training file has it
# (and a test file that has it is cleaned even when the training file does not).
train_df = train_df.drop(columns=['AvgTime'], errors='ignore')
test_df = test_df.drop(columns=['AvgTime'], errors='ignore')

def clean_data(df):
    """Return a cleaned copy of *df*; the input frame is left untouched.

    Cleaning steps:
      * 'VIP': missing values become False, then the column is cast to int.
      * 'Age': zeros are treated as missing, and every missing age is filled
        with the median of the remaining ages in this same frame.
      * 'Food' / 'Drinks': missing values become 0.
    """
    cleaned = df.copy()

    cleaned['VIP'] = cleaned['VIP'].fillna(False).astype(int)

    # Blank out zero ages first so they do not pull the median down.
    age = cleaned['Age'].replace(0, np.nan)
    cleaned['Age'] = age.fillna(age.median())

    spend_cols = ['Food', 'Drinks']
    cleaned[spend_cols] = cleaned[spend_cols].fillna(0)

    return cleaned

# Run the same cleaning routine over both splits (train first, then test).
train_df, test_df = (clean_data(frame) for frame in (train_df, test_df))

FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

## 2. Feature Engineering (Todas las armas)

In [None]:
def feature_engineering(df):
    """Return a copy of *df* with derived features added and raw columns removed.

    The input frame is not modified. It must contain the columns
    'TicketInfo', 'Food', 'Drinks', 'Age' and 'Opinion'.

    Added columns:
      * Ticket_Zone: first '/'-separated segment of 'TicketInfo'.
      * Ticket_Type: third '/'-separated segment of 'TicketInfo', or
        'Unknown' for every row that has fewer than three segments.
      * TotalSpend:  Food + Drinks.
      * Has_Spent:   1 when TotalSpend > 0, else 0.
      * FoodShare:   Food / (TotalSpend + 0.01).
      * Is_Under21:  1 when Age < 21, else 0 (a missing age yields 0).
      * Opinion_Len: character count of 'Opinion' (0 when missing).

    Removed columns (when present): 'Id', 'TicketInfo', 'Opinion', 'Concert'.
    """
    df_eng = df.copy()

    # Ticket parsing: split the text form of 'TicketInfo' on '/'.
    ticket_parts = df_eng['TicketInfo'].astype(str).str.split('/')
    df_eng['Ticket_Zone'] = ticket_parts.str.get(0)
    # Resolve the type row by row. Deciding on the width of the whole split
    # table would leave short rows as missing whenever any other row has
    # three segments, and could yield different columns for train and test.
    # The object cast keeps the string fill valid when every row is short.
    df_eng['Ticket_Type'] = ticket_parts.str.get(2).astype(object).fillna('Unknown')

    # Spending features.
    df_eng['TotalSpend'] = df_eng['Food'] + df_eng['Drinks']
    df_eng['Has_Spent'] = (df_eng['TotalSpend'] > 0).astype(int)

    # Share of spend that went to food; the 0.01 offset avoids dividing by
    # zero for rows with no spend at all.
    df_eng['FoodShare'] = df_eng['Food'] / (df_eng['TotalSpend'] + 0.01)

    # Under-21 flag.
    df_eng['Is_Under21'] = (df_eng['Age'] < 21).astype(int)

    # Text length of the free-form opinion.
    df_eng['Opinion_Len'] = df_eng['Opinion'].fillna('').astype(str).str.len()

    # Drop identifiers and raw text columns that were only needed as inputs.
    drop_cols = ['Id', 'TicketInfo', 'Opinion', 'Concert']
    return df_eng.drop(columns=drop_cols, errors='ignore')

# Engineer the training table, set the target aside, and one-hot encode
# whatever categorical columns remain.
train_features = feature_engineering(train_df)
y = train_features['FreePass']
X = pd.get_dummies(train_features.drop(columns='FreePass'))

# Same treatment for the test table.
X_test_final = pd.get_dummies(feature_engineering(test_df))

# Force the test matrix onto the training columns: dummy columns seen only in
# test are discarded, and those missing from test are added filled with 0.
X, X_test_final = X.align(X_test_final, join='left', axis=1, fill_value=0)

## 3. Optimización Profunda con GridSearch
Buscamos la mejor combinación de parámetros para Gradient Boosting.

In [None]:
# Hold out a stratified 20% of the labelled rows for the final validation.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Median imputation -> standardisation -> gradient boosting, tuned as a
# single estimator so the preprocessing is refit inside every CV fold.
pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('model', GradientBoostingClassifier(random_state=42)),
    ]
)

# Search space: 3 * 4 * 4 * 3 * 2 = 288 candidates, each evaluated on 5 folds.
# The 'model__' prefix routes every entry to the pipeline's 'model' step.
param_grid = {
    'model__n_estimators': [100, 200, 400],
    'model__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'model__max_depth': [3, 4, 5, 6],
    'model__subsample': [0.8, 0.9, 1.0],
    'model__min_samples_split': [2, 5],
}

print("Iniciando Búsqueda Profunda (esto tomará unos minutos)...")
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Keep the winning pipeline and report how it was configured.
best_model = grid_search.best_estimator_
print("\n--- MEJOR CONFIGURACIÓN ENCONTRADA ---")
print(grid_search.best_params_)
print(f"Best CV Score: {grid_search.best_score_:.4f}")

## 4. Validación Final

In [None]:
# Score the held-out validation split with the tuned pipeline.
y_pred = best_model.predict(X_val)
# Column 1 of predict_proba: presumably the positive FreePass class - confirm
# against best_model.classes_.
y_probs = best_model.predict_proba(X_val)[:, 1]

print(f"ACCURACY EN VALIDACIÓN (OPTIMIZADO): {accuracy_score(y_val, y_pred):.4f}")

# Per-class precision / recall / F1.
print("\nReporte de Clasificación:")
print(classification_report(y_val, y_pred))

# Curve data is computed up front so the plotting section only draws.
auc = roc_auc_score(y_val, y_probs)
fpr, tpr, _ = roc_curve(y_val, y_probs)

# One figure, two panels side by side: ROC on the left, confusion matrix on the right.
fig, (roc_ax, cm_ax) = plt.subplots(1, 2, figsize=(12, 5))

roc_ax.plot(fpr, tpr, label=f'Best GBM (AUC={auc:.4f})', color='darkorange', lw=3)
# Diagonal reference line from (0, 0) to (1, 1).
roc_ax.plot([0, 1], [0, 1], 'k--')
roc_ax.set_title('Curva ROC')
roc_ax.legend()

sns.heatmap(
    confusion_matrix(y_val, y_pred),
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    ax=cm_ax,
)
cm_ax.set_title('Matriz de Confusión')

fig.tight_layout()
plt.show()

## 5. Submission

In [None]:
# Refit the tuned pipeline on every labelled row (train + validation), then
# predict the test matrix. fit() returns the fitted estimator, so the two
# calls can be chained.
final_preds = best_model.fit(X, y).predict(X_test_final)

# Pair each prediction with its test Id and store the label as a boolean.
sub = pd.DataFrame({'Id': test_df['Id'], 'FreePass': final_preds}).astype({'FreePass': bool})
sub.to_csv('submission_optimized_final.csv', index=False)
print("Archivo 'submission_optimized_final.csv' generado correctamente.")