In [6]:
import pandas as pd
import numpy as np
import textblob as tb
randstate=np.random.randint(0,1000)


def get_text_features(df):
    """
    Extracts deterministic NLP features from the 'Opinion' column.
    """
    # 1. Clean Text
    # fillna("") ensures we can run string methods even on missing data
    clean_op = (
        df["Opinion"]
        .fillna("")
        .str.lower()
        .str.replace(r"[^a-z\s!?.]", "", regex=True)
        .str.strip()
    )
    
    # 2. Structural Features
    df["op_word_count"] = clean_op.str.split().apply(len)
    
    # 3. Intensity Features
    df["exclam_count"] = clean_op.str.count("!")
    df["qmark_count"] = clean_op.str.count(r"\?")
    
    # 4. Sentiment Features (TextBlob)
    # We use a simple apply; for huge datasets, consider swifter or pandarallel
    df["tb_polarity"] = clean_op.apply(
        lambda x: tb.TextBlob(x).sentiment.polarity if x else 0
    )
    
    # 5. Key Counts (Positive/Negative/Specific words)
    negations = {"not", "never", "no", "nothing", "none"}
    df["negation_count"] = clean_op.apply(lambda x: sum(w in negations for w in x.split()))

    negative_words = {"worst", "disaster", "refund", "ruined", "terrible"}
    df["neg_word_count"] = clean_op.apply(lambda x: sum(w in negative_words for w in x.split()))

    # Specific Keyword Flags (High correlation anchors)
    keywords = ["refund", "worst", "magic", "loved"]
    for kw in keywords:
        df[f"kw_{kw}"] = clean_op.str.contains(rf"\b{kw}\b", regex=True).astype(int)
        
    return df

def split_ticket(ticket):
    try:
        parts = str(ticket).split('/')
        return parts[0], parts[-1] # Returns (TicketType, TicketClass)
    except:
        return 'Unknown', 'Unknown'

def preprocess_csv_improved(path, save=False, filename=""):
    data = pd.read_csv(path)
    
    # --- 1. NLP & Extraction (Keep your good logic) ---
    # (Assuming get_text_features and split_ticket are defined as before)
    data = get_text_features(data)
    
    data[['Ticket_Type', 'Ticket_Class']] = data['TicketInfo'].apply(lambda x: pd.Series(split_ticket(x)))

    # --- 2. Handling Missing Values Preserving Information ---
    # CATEGORICAL: Fill with "Missing" so the model sees it as a specific category
    cat_cols = ['Concert', 'Ticket_Type', 'Ticket_Class', 'PreferedAlbum', 'Vinyl', 'VIP']
    for col in cat_cols:
        if col in data.columns:
            data[col] = data[col].fillna("Missing")

    # NUMERICAL: Do NOT impute Median. Leave as NaN.
    # HistGradientBoosting handles NaNs automatically.
    
    # --- 3. Encoding ---
    # We drop raw text columns
    cols_to_drop = [ 'Opinion', 'TicketInfo']
    data = data.drop(columns=cols_to_drop, errors='ignore')
    
    # One-Hot Encoding for categories
    df_encoded = pd.get_dummies(data, columns=cat_cols, drop_first=True)
    
    if save:
        df_encoded.to_csv(filename, index=False)

    return df_encoded

In [23]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
import os
path=os.getcwd()
# 1. Load Data
# Note: We do NOT need to pass 'stats' anymore because we aren't calculating Medians
train_df = preprocess_csv_improved(path+'/train.csv')
test_df = preprocess_csv_improved(path+'/test.csv')

# 2. Align Columns (Critical Fix)
test_df = test_df.reindex(columns=train_df.columns, fill_value=0)

# 3. Prepare X/Y
# Drop artifacts and targets
drop_cols = ["Id", "FreePass", "AvgTime", "Opinion_raw", "Opinion_clean"] 
X = train_df.drop(columns=drop_cols, errors='ignore')
Y = train_df["FreePass"]

X_test_submission = test_df.drop(columns=drop_cols, errors='ignore')

# 4. Model: HistGradientBoosting
# Native support for missing values (nan_mode="fraction" or "most_frequent")
hgb = HistGradientBoostingClassifier(
    learning_rate=0.05,
    max_iter=300,        # Equivalent to n_estimators
    max_depth=5,         # Constrain depth
    l2_regularization=1.0, # Helps preventing overfitting
    random_state=randstate,
    early_stopping=True  # Automatically stops if validation score stops improving
)

# 5. Robust Evaluation (Cross-Validation)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=randstate)
scores = cross_val_score(hgb, X, Y, cv=cv, scoring='roc_auc')

print(f"CV AUC: {scores.mean():.4f} +/- {scores.std():.4f}")

# 6. Final Training & Prediction
hgb.fit(X, Y)

# Predict Probabilities
final_probs = hgb.predict_proba(X_test_submission)[:, 1]

# Optimize Threshold (Quick check on Training data - bias warning, but useful for hard cuts)
# Ideally, you do this inside a CV loop, but this is a good approximation
from sklearn.metrics import accuracy_score
train_probs = hgb.predict_proba(X)[:, 1]
best_tr = 0.5
best_acc = 0
for tr in np.arange(0.3, 0.7, 0.01):
    acc = accuracy_score(Y, train_probs >= tr)
    if acc > best_acc:
        best_acc = acc
        best_tr = tr

print(f"Best Threshold on Training: {best_tr}")

# Export
final_preds = (final_probs >= best_tr)
output = pd.DataFrame({'Id': test_df['Id'], 'FreePass': final_preds})
output.to_csv(path + "/results/submission_hist_gradient_boost_JAN12.csv", index=False)

CV AUC: 0.8988 +/- 0.0085
Best Threshold on Training: 0.46000000000000013


In [17]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

# 1. Load Data
# Note: We do NOT need to pass 'stats' anymore because we aren't calculating Medians
train_df = preprocess_csv_improved(path+'/train.csv')
test_df = preprocess_csv_improved(path+'/test.csv')

# 2. Align Columns (Critical Fix)
test_df = test_df.reindex(columns=train_df.columns, fill_value=0)

# 3. Prepare X/Y
# Drop artifacts and targets
drop_cols = ["Id", "FreePass", "AvgTime", "Opinion_raw", "Opinion_clean"] 
X = train_df.drop(columns=drop_cols, errors='ignore')
Y = train_df["FreePass"]

X_test_submission = test_df.drop(columns=drop_cols, errors='ignore')

# 4. Model: HistGradientBoosting
# Native support for missing values (nan_mode="fraction" or "most_frequent")
hgb = HistGradientBoostingClassifier(
    learning_rate=0.05,
    max_iter=100,        # Equivalent to n_estimators
    max_depth=5,         # Constrain depth
    l2_regularization=1.0, # Helps preventing overfitting
    random_state=randstate,
    early_stopping=True  # Automatically stops if validation score stops improving
)

# 5. Robust Evaluation (Cross-Validation)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=randstate)
scores = cross_val_score(hgb, X, Y, cv=cv, scoring='roc_auc')

print(f"CV AUC: {scores.mean():.4f} +/- {scores.std():.4f}")

# 6. Final Training & Prediction
hgb.fit(X, Y)

# Predict Probabilities
final_probs = hgb.predict_proba(X_test_submission)[:, 1]

# Optimize Threshold (Quick check on Training data - bias warning, but useful for hard cuts)
# Ideally, you do this inside a CV loop, but this is a good approximation
from sklearn.metrics import accuracy_score
train_probs = hgb.predict_proba(X)[:, 1]
best_tr = 0.5
best_acc = 0
for tr in np.arange(0.3, 0.7, 0.01):
    acc = accuracy_score(Y, train_probs >= tr)
    if acc > best_acc:
        best_acc = acc
        best_tr = tr

print(f"Best Threshold on Training: {best_tr}")

# Export
final_preds = (final_probs >= best_tr)
output = pd.DataFrame({'Id': test_df['Id'], 'FreePass': final_preds})
output.to_csv(path + "/results/submission_hist_gradient_boost_JAN12.csv", index=False)

CV AUC: 0.8974 +/- 0.0082
Best Threshold on Training: 0.48000000000000015


In [19]:
from sklearn.ensemble import VotingClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer  # <--- NEW IMPORT

# 1. Define Model 1: Linear Model (Now with Imputation)
clf1 = make_pipeline(
    SimpleImputer(strategy='median'),  # <--- CRITICAL FIX: Fills NaNs for this model only
    StandardScaler(), 
    LogisticRegression(
        penalty='l1', 
        solver='liblinear', 
        C=0.1,
        class_weight='balanced',
        random_state=randstate
    )
)

# 2. Define Model 2: Tree Model (Handles NaNs natively, no changes needed)
clf2 = HistGradientBoostingClassifier(
    learning_rate=0.05,
    max_iter=400,
    max_depth=3,
    min_samples_leaf=20,
    l2_regularization=5.0,
    random_state=randstate
)

# 3. The Ensemble
ensemble = VotingClassifier(
    estimators=[('lr', clf1), ('gb', clf2)],
    voting='soft', 
    weights=[1, 2] 
)

# 4. Train & Evaluate
cv_scores = cross_val_score(ensemble, X, Y, cv=5, scoring='roc_auc')
print(f"Ensemble CV AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

# 5. Final Fit & Optimization
ensemble.fit(X, Y)

# Optimize Threshold
train_probs = ensemble.predict_proba(X)[:, 1]
best_tr = 0.5
best_acc = 0
for tr in np.arange(0.3, 0.7, 0.01):
    acc = accuracy_score(Y, train_probs >= tr)
    if acc > best_acc:
        best_acc = acc
        best_tr = tr

print(f"Best Threshold: {best_tr}")

# 6. Predict & Export
final_probs = ensemble.predict_proba(X_test_submission)[:, 1]
final_preds = (final_probs >= best_tr)

output = pd.DataFrame({'Id': test_df['Id'], 'FreePass': final_preds})
output.to_csv(path + "/results/submission_ensemble_fixed_JAN12.csv", index=False)



Ensemble CV AUC: 1.0000 (+/- 0.0000)




Best Threshold: 0.5800000000000003


In [29]:
def preprocess_csv_improved(path, save=False, filename=""):
    data = pd.read_csv(path)
    
    # --- 1. NLP & Extraction ---
    data = get_text_features(data)
    
    data['Concert_StartTime'] = data['Concert'].astype(str).str.extract(r'([A-Za-z]+)(\d+)-', expand=False)[1].astype(float) 
    data['Concert_EndTime'] = data['Concert'].astype(str).str.extract(r'-(\d+)pm').astype(float)
    data['Concert_Duration'] = 12 + data['Concert_EndTime'] - data['Concert_StartTime']

    data[['Ticket_Type', 'Ticket_Class']] = data['TicketInfo'].apply(lambda x: pd.Series(split_ticket(x)))

    # --- 2. Handling Missing Values ---
    cat_cols = ['Ticket_Type', 'Ticket_Class', 'PreferedAlbum', 'Vinyl', 'VIP']
    for col in cat_cols:
        if col in data.columns:
            data[col] = data[col].fillna("Missing")

    # --- 3. Encoding ---
    # CRITICAL CHANGE: We do NOT drop 'Concert' here anymore. 
    # We only drop Opinion and TicketInfo.
    cols_to_drop = ['Opinion', 'TicketInfo'] 
    data = data.drop(columns=cols_to_drop, errors='ignore')
    
    # One-Hot Encoding (Concert is NOT in cat_cols, so it stays as a string column)
    df_encoded = pd.get_dummies(data, columns=cat_cols, drop_first=True)
    
    return df_encoded

In [30]:
# --- PROCESS DATA ---
train_df = preprocess_csv_improved(path+'/train.csv')
test_df = preprocess_csv_improved(path+'/test.csv')

# --- APPLY FEATURE ENGINEERING ---
# Now this works because 'Concert' is still in the dataframe
train_df = get_advanced_features(train_df)
test_df = get_advanced_features(test_df)

# Align columns (Test might miss some dummy columns)
test_df = test_df.reindex(columns=train_df.columns, fill_value=0)

# --- SETUP GROUPS & X/Y ---

# 1. Define Groups using the 'Concert' column
groups = train_df['Concert'] 

# 2. Define Drop List
# We NOW drop 'Concert' here, right before training
# We also drop 'AvgTime' to test the Realistic Score (No Leak)
drop_cols = ["Id", "FreePass", "Opinion_raw", "Opinion_clean", "Opinion", "Concert", "AvgTime"] 

X = train_df.drop(columns=drop_cols, errors='ignore')
Y = train_df["FreePass"]

# --- ROBUST VALIDATION ---
gkf = GroupKFold(n_splits=5)

# Re-initialize your ensemble (make sure it's defined)
ensemble = VotingClassifier(
    estimators=[('lr', clf1), ('gb', clf2)],
    voting='soft', 
    weights=[1, 2] 
)

# Run Group Cross-Validation
cv_scores = cross_val_score(ensemble, X, Y, groups=groups, cv=gkf, scoring='roc_auc')

print(f"Realistic Group-CV AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

ValueError: Input contains NaN