In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import seaborn as sns


In [22]:
import pandas as pd

# Cargar datasets directamente
datasets = {
    'K2': pd.read_csv('k2pandc_2025.10.01_17.20.56.csv', comment='#', low_memory=False),
    'Kepler': pd.read_csv('cumulative_2025.10.01_17.21.27.csv', comment='#', low_memory=False),
    'TESS': pd.read_csv('TOI_2025.10.01_17.21.12.csv', comment='#', low_memory=False)
}

# Información básica de cada dataset
for mission, df in datasets.items():
    print(f"\n{'='*80}")
    print(f"{mission}: {df.shape[0]} filas x {df.shape[1]} columnas")
    print(f"{'='*80}")

    # Identificar columna de clasificación
    if mission == 'K2':
        target_col = 'disposition'
    elif mission == 'Kepler':
        target_col = 'koi_disposition'
    else:  # TESS
        target_col = 'tfopwg_disp'

    # Distribución de clases
    print(f"\nColumna objetivo: {target_col}")
    print(f"\nDistribución de clases:")
    print(df[target_col].value_counts())
    print(f"\nValores nulos: {df[target_col].isna().sum()}")

    # Primeras filas
    print(f"\nPrimeras 10 filas (columnas clave):")
    key_cols = [col for col in df.columns[:10]]
    print(df[key_cols].head(10))

print(f"\n{'='*80}")
print("Carga completada")
print(f"{'='*80}")


K2: 4004 filas x 295 columnas

Columna objetivo: disposition

Distribución de clases:
disposition
CONFIRMED         2315
CANDIDATE         1374
FALSE POSITIVE     293
REFUTED             22
Name: count, dtype: int64

Valores nulos: 0

Primeras 10 filas (columnas clave):
   rowid            pl_name        hostname pl_letter  k2_name  \
0      1        BD+20 594 b       BD+20 594         b  K2-56 b   
1      2        BD+20 594 b       BD+20 594         b  K2-56 b   
2      3        BD+20 594 b       BD+20 594         b  K2-56 b   
3      4  EPIC 201111557.01  EPIC 201111557       NaN      NaN   
4      5  EPIC 201111557.01  EPIC 201111557       NaN      NaN   
5      6  EPIC 201126503.01  EPIC 201126503       NaN      NaN   
6      7  EPIC 201127519.01  EPIC 201127519       NaN      NaN   
7      8  EPIC 201127519.01  EPIC 201127519       NaN      NaN   
8      9  EPIC 201147085.01  EPIC 201147085       NaN      NaN   
9     10  EPIC 201152065.01  EPIC 201152065       NaN      NaN   

 

In [23]:
import pandas as pd

df = pd.read_csv('cumulative_2025.10.01_17.21.27.csv', comment='#', low_memory=False)
df.head(10)

Unnamed: 0,rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_vet_stat,koi_vet_date,koi_pdisposition,koi_score,koi_fpflag_nt,...,koi_dicco_mdec,koi_dicco_mdec_err,koi_dicco_msky,koi_dicco_msky_err,koi_dikco_mra,koi_dikco_mra_err,koi_dikco_mdec,koi_dikco_mdec_err,koi_dikco_msky,koi_dikco_msky_err
0,1,10797460,K00752.01,Kepler-227 b,CONFIRMED,Done,2018-08-16,CANDIDATE,1.0,0,...,0.2,0.16,0.2,0.17,0.08,0.13,0.31,0.17,0.32,0.16
1,2,10797460,K00752.02,Kepler-227 c,CONFIRMED,Done,2018-08-16,CANDIDATE,0.969,0,...,0.0,0.48,0.39,0.36,0.49,0.34,0.12,0.73,0.5,0.45
2,3,10811496,K00753.01,,CANDIDATE,Done,2018-08-16,CANDIDATE,0.0,0,...,-0.034,0.07,0.042,0.072,0.002,0.071,-0.027,0.074,0.027,0.074
3,4,10848459,K00754.01,,FALSE POSITIVE,Done,2018-08-16,FALSE POSITIVE,0.0,0,...,0.147,0.078,0.289,0.079,-0.257,0.072,0.099,0.077,0.276,0.076
4,5,10854555,K00755.01,Kepler-664 b,CONFIRMED,Done,2018-08-16,CANDIDATE,1.0,0,...,-0.09,0.18,0.1,0.14,0.07,0.18,0.02,0.16,0.07,0.2
5,6,10872983,K00756.01,Kepler-228 d,CONFIRMED,Done,2018-08-16,CANDIDATE,1.0,0,...,-0.07,0.11,0.08,0.13,-0.02,0.13,-0.08,0.1,0.08,0.1
6,7,10872983,K00756.02,Kepler-228 c,CONFIRMED,Done,2018-08-16,CANDIDATE,1.0,0,...,0.09,0.16,0.26,0.16,0.18,0.15,0.06,0.15,0.19,0.17
7,8,10872983,K00756.03,Kepler-228 b,CONFIRMED,Done,2018-08-16,CANDIDATE,0.992,0,...,-0.3,0.21,0.45,0.28,-0.41,0.33,-0.29,0.21,0.5,0.3
8,9,6721123,K00114.01,,FALSE POSITIVE,Done,2018-08-16,FALSE POSITIVE,0.0,0,...,7.71,0.072,8.93,0.074,-4.537,0.071,7.713,0.074,8.948,0.077
9,10,10910878,K00757.01,Kepler-229 c,CONFIRMED,Done,2018-08-16,CANDIDATE,1.0,0,...,-0.015,0.073,0.044,0.075,0.005,0.075,0.03,0.082,0.031,0.081


# Analisis NULL values

In [24]:
null_analysis = pd.DataFrame({
    'Colum': df.columns,
    'NULL_values': df.isnull().sum(),
    'NULL_percentage': (df.isnull().sum() / len(df) * 100).round(2)
})

null_analysis = null_analysis[null_analysis['NULL_values'] > 0].sort_values('NULL_percentage', ascending=False)

print(f"Total columns: {len(df.columns)}")
print(f"Columns with null: {len(null_analysis)}")
print(f"Columns without null: {len(df.columns) - len(null_analysis)}\n")

print(null_analysis.to_string(index=False))

Total columns: 141
Columns with null: 120
Columns without null: 21

             Colum  NULL_values  NULL_percentage
  koi_ingress_err1         9564           100.00
  koi_ingress_err2         9564           100.00
       koi_ingress         9564           100.00
    koi_longp_err2         9564           100.00
         koi_longp         9564           100.00
    koi_longp_err1         9564           100.00
    koi_eccen_err1         9564           100.00
    koi_eccen_err2         9564           100.00
     koi_incl_err2         9564           100.00
      koi_sma_err2         9564           100.00
      koi_teq_err2         9564           100.00
      koi_teq_err1         9564           100.00
      koi_sma_err1         9564           100.00
     koi_incl_err1         9564           100.00
     koi_model_dof         9564           100.00
   koi_model_chisq         9564           100.00
     koi_sage_err1         9564           100.00
     koi_sage_err2         9564           100.00
 

# Elimination of the colums with 100% NULL values

In [25]:
null_cols = [col for col in df.columns if df[col].isna().sum() == len(df)]
df_clean = df.drop(columns=null_cols)
target = 'koi_disposition'

# koi_disposition column distribution

In [26]:

target_counts = df_clean[target].value_counts()
colors = ['#E63946', '#457B9D', '#2A9D8F']

fig = go.Figure(data=[go.Pie(
    labels=target_counts.index,
    values=target_counts.values,
    hole=0.5,
    marker=dict(colors=colors, line=dict(color='#FFFFFF', width=2)),
    textinfo='label+percent',
    textfont=dict(size=14, color='white'),
    hovertemplate='<b>%{label}</b><br>Count: %{value}<br>Percentage: %{percent}<extra></extra>'
)])

fig.update_layout(
    title=dict(text='Class Distribution: koi_disposition', font=dict(size=18), x=0.5),
    showlegend=False,
    height=500,
    paper_bgcolor='white'
)

fig.show()




FALSE POSITIVE: 50.6% (4,839)

CONFIRMED: 28.7% (2,746)

CANDIDATE: 20.7% (1,979)

In [27]:
target = 'koi_disposition'

# Correlations between features

In [29]:
import numpy as np
import pandas as pd

# ============================================================================
# COMPREHENSIVE LEAKAGE DETECTION AND FEATURE CORRELATION ANALYSIS
# ============================================================================

print("="*80)
print("STEP 1: CORRELATION ANALYSIS BEFORE LEAKAGE REMOVAL")
print("="*80)

# Encode target (ordinal encoding for correlation analysis)
# Note: This assumes ordered relationship, but provides interpretable correlation values
target_encoded = df_clean[target].map({'FALSE POSITIVE': 0, 'CANDIDATE': 1, 'CONFIRMED': 2})

# Select numeric features
numeric_features = df_clean.select_dtypes(include=['float64', 'int64']).columns.tolist()
exclude_cols = ['rowid', 'kepid']
numeric_features = [col for col in numeric_features if col not in exclude_cols]

# Calculate correlations BEFORE leakage removal
correlations_before = []
for col in numeric_features:
    mask = df_clean[col].notna() & target_encoded.notna()
    if mask.sum() > 0:
        corr = np.corrcoef(df_clean.loc[mask, col], target_encoded[mask])[0, 1]
        correlations_before.append((col, corr, abs(corr) if not np.isnan(corr) else 0))

correlations_before_df = pd.DataFrame(
    correlations_before,
    columns=['feature', 'correlation', 'abs_correlation']
).sort_values('abs_correlation', ascending=False)

print("\nTop 15 features correlated with target (BEFORE leakage removal):\n")
print(correlations_before_df.head(15)[['feature', 'correlation', 'abs_correlation']].to_string(index=False))

# ============================================================================
# STEP 2: DETAILED LEAKAGE VERIFICATION
# ============================================================================

print("\n" + "="*80)
print("STEP 2: LEAKAGE VERIFICATION FOR SUSPICIOUS FEATURES")
print("="*80)

# Identify potential leakage columns
potential_leakage = ['koi_score', 'koi_fpflag_nt', 'koi_fpflag_ss',
                     'koi_fpflag_co', 'koi_fpflag_ec']
potential_leakage = [col for col in potential_leakage if col in df_clean.columns]

for col in potential_leakage:
    print(f"\n{'─'*80}")
    print(f"Feature: {col}")
    print(f"{'─'*80}")

    if df_clean[col].dtype in ['float64', 'int64']:
        # Numeric feature: show statistics by class
        stats = df_clean.groupby(target)[col].describe()[['count', 'mean', 'std', 'min', 'max']]
        print(stats.round(3))

        # Calculate separation metric (coefficient of variation between means)
        means = df_clean.groupby(target)[col].mean()
        print(f"\nMean by class:\n{means.round(3)}")
        print(f"Range of means: {means.min():.3f} to {means.max():.3f}")
        print(f"Separation ratio: {(means.max() - means.min()) / (means.std() + 1e-10):.2f}")
    else:
        # Categorical feature: show distribution
        print("\nValue distribution by class:")
        crosstab = pd.crosstab(df_clean[target], df_clean[col], margins=True)
        print(crosstab)

# ============================================================================
# STEP 3: REMOVE LEAKAGE COLUMNS
# ============================================================================

print("\n" + "="*80)
print("STEP 3: REMOVING LEAKAGE COLUMNS")
print("="*80)

leakage_cols = ['koi_score', 'koi_fpflag_nt', 'koi_fpflag_ss',
                'koi_fpflag_co', 'koi_fpflag_ec']
leakage_cols_present = [col for col in leakage_cols if col in df_clean.columns]

print(f"\nColumns identified as leakage: {leakage_cols_present}")
print(f"Columns to remove: {len(leakage_cols_present)}")

df_clean = df_clean.drop(columns=leakage_cols_present)
print(f"Remaining columns: {len(df_clean.columns)}")

# ============================================================================
# STEP 4: RECALCULATE CORRELATIONS AFTER LEAKAGE REMOVAL
# ============================================================================

print("\n" + "="*80)
print("STEP 4: CORRELATION ANALYSIS AFTER LEAKAGE REMOVAL")
print("="*80)

# Update numeric features list
numeric_features = df_clean.select_dtypes(include=['float64', 'int64']).columns.tolist()
numeric_features = [col for col in numeric_features if col not in exclude_cols]

# Recalculate correlations AFTER leakage removal
correlations_after = []
for col in numeric_features:
    mask = df_clean[col].notna() & target_encoded.notna()
    if mask.sum() > 0:
        corr = np.corrcoef(df_clean.loc[mask, col], target_encoded[mask])[0, 1]
        correlations_after.append((col, corr, abs(corr) if not np.isnan(corr) else 0))

correlations_after_df = pd.DataFrame(
    correlations_after,
    columns=['feature', 'correlation', 'abs_correlation']
).sort_values('abs_correlation', ascending=False)

print("\nTop 15 features correlated with target (AFTER leakage removal):\n")
print(correlations_after_df.head(15)[['feature', 'correlation', 'abs_correlation']].to_string(index=False))

# ============================================================================
# STEP 5: COMPARISON OF CORRELATIONS (BEFORE vs AFTER)
# ============================================================================

print("\n" + "="*80)
print("STEP 5: IMPACT OF LEAKAGE REMOVAL ON CORRELATIONS")
print("="*80)

# Merge before and after correlations
comparison = correlations_before_df[['feature', 'abs_correlation']].merge(
    correlations_after_df[['feature', 'abs_correlation']],
    on='feature',
    how='inner',
    suffixes=('_before', '_after')
)

comparison['change'] = comparison['abs_correlation_after'] - comparison['abs_correlation_before']
comparison['pct_change'] = (comparison['change'] / (comparison['abs_correlation_before'] + 1e-10) * 100).round(1)

# Show top features that increased in importance
print("\nTop 10 features with INCREASED importance after leakage removal:")
print(comparison.sort_values('change', ascending=False).head(10).to_string(index=False))

print("\n" + "="*80)
print("SUMMARY STATISTICS")
print("="*80)
print(f"\nFeatures analyzed: {len(comparison)}")
print(f"Mean absolute correlation before: {comparison['abs_correlation_before'].mean():.4f}")
print(f"Mean absolute correlation after: {comparison['abs_correlation_after'].mean():.4f}")
print(f"Features removed due to leakage: {len(leakage_cols_present)}")

# Identify features that were among top correlators before but are now gone
removed_top_features = set(correlations_before_df.head(10)['feature']) - set(correlations_after_df['feature'])
if removed_top_features:
    print(f"\nHigh-correlation features removed (were in top 10): {removed_top_features}")

print("\n" + "="*80)
print("LEAKAGE REMOVAL COMPLETE")
print("="*80)

STEP 1: CORRELATION ANALYSIS BEFORE LEAKAGE REMOVAL

Top 15 features correlated with target (BEFORE leakage removal):

         feature  correlation  abs_correlation
       koi_score     0.886311         0.886311
   koi_fpflag_ss    -0.489804         0.489804
   koi_fpflag_co    -0.448935         0.448935
       koi_count     0.436286         0.436286
   koi_smet_err2     0.414951         0.414951
  koi_steff_err1    -0.391724         0.391724
  koi_dicco_msky    -0.377220         0.377220
   koi_smet_err1    -0.373254         0.373254
  koi_dikco_msky    -0.369599         0.369599
  koi_steff_err2     0.354306         0.354306
   koi_fpflag_ec    -0.334214         0.334214
        koi_incl     0.318589         0.318589
  koi_smass_err1    -0.312688         0.312688
koi_num_transits    -0.299913         0.299913
        koi_smet     0.294549         0.294549

STEP 2: LEAKAGE VERIFICATION FOR SUSPICIOUS FEATURES

──────────────────────────────────────────────────────────────────────────


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide




Top 15 features correlated with target (AFTER leakage removal):

         feature  correlation  abs_correlation
       koi_count     0.436286         0.436286
   koi_smet_err2     0.414951         0.414951
  koi_steff_err1    -0.391724         0.391724
  koi_dicco_msky    -0.377220         0.377220
   koi_smet_err1    -0.373254         0.373254
  koi_dikco_msky    -0.369599         0.369599
  koi_steff_err2     0.354306         0.354306
        koi_incl     0.318589         0.318589
  koi_smass_err1    -0.312688         0.312688
koi_num_transits    -0.299913         0.299913
        koi_smet     0.294549         0.294549
         koi_teq    -0.277026         0.277026
       koi_depth    -0.253526         0.253526
  koi_slogg_err2     0.249548         0.249548
koi_fwm_stat_sig     0.246029         0.246029

STEP 5: IMPACT OF LEAKAGE REMOVAL ON CORRELATIONS

Top 10 features with INCREASED importance after leakage removal:
         feature  abs_correlation_before  abs_correlation_after  

In [11]:
# Eliminar columnas con leakage
leakage_cols = ['koi_score', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec']
leakage_cols = [col for col in leakage_cols if col in df_clean.columns]

df_clean = df_clean.drop(columns=leakage_cols)

# Recalcular correlaciones sin leakage
target_encoded = df_clean[target].map({'FALSE POSITIVE': 0, 'CANDIDATE': 1, 'CONFIRMED': 2})
numeric_features = df_clean.select_dtypes(include=['float64', 'int64']).columns.tolist()
numeric_features = [col for col in numeric_features if col not in ['rowid', 'kepid']]

correlations = []
for col in numeric_features:
    mask = df_clean[col].notna() & target_encoded.notna()
    if mask.sum() > 0:
        corr = np.corrcoef(df_clean.loc[mask, col], target_encoded[mask])[0, 1]
        correlations.append((col, abs(corr) if not np.isnan(corr) else 0))

correlations_df = pd.DataFrame(correlations, columns=['feature', 'correlation']).sort_values('correlation', ascending=False)

print("Top 15 features sin leakage:\n")
print(correlations_df.head(15).to_string(index=False))


invalid value encountered in divide


invalid value encountered in divide



Top 15 features sin leakage:

         feature  correlation
       koi_count     0.436286
   koi_smet_err2     0.414951
  koi_steff_err1     0.391724
  koi_dicco_msky     0.377220
   koi_smet_err1     0.373254
  koi_dikco_msky     0.369599
  koi_steff_err2     0.354306
        koi_incl     0.318589
  koi_smass_err1     0.312688
koi_num_transits     0.299913
        koi_smet     0.294549
         koi_teq     0.277026
       koi_depth     0.253526
  koi_slogg_err2     0.249548
koi_fwm_stat_sig     0.246029


In [30]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')

# ============================================================================
# COMPREHENSIVE FEATURE ANALYSIS AFTER LEAKAGE REMOVAL
# ============================================================================

print("="*80)
print("FEATURE RELEVANCE ANALYSIS - MULTIPLE METRICS")
print("="*80)

# Remove leakage columns
leakage_cols = ['koi_score', 'koi_fpflag_nt', 'koi_fpflag_ss',
                'koi_fpflag_co', 'koi_fpflag_ec']
leakage_cols_present = [col for col in leakage_cols if col in df_clean.columns]
df_clean = df_clean.drop(columns=leakage_cols_present)

print(f"\nLeakage columns removed: {leakage_cols_present}")
print(f"Remaining columns: {len(df_clean.columns)}")

# Encode target
target_encoded = df_clean[target].map({'FALSE POSITIVE': 0, 'CANDIDATE': 1, 'CONFIRMED': 2})

# Select numeric features
numeric_features = df_clean.select_dtypes(include=['float64', 'int64']).columns.tolist()
exclude_cols = ['rowid', 'kepid']
numeric_features = [col for col in numeric_features if col not in exclude_cols]

print(f"Numeric features to analyze: {len(numeric_features)}")

# ============================================================================
# METRIC 1: PEARSON CORRELATION (LINEAR)
# ============================================================================

print("\n" + "-"*80)
print("METRIC 1: Pearson Correlation (linear relationship)")
print("-"*80)

correlations = []
for col in numeric_features:
    mask = df_clean[col].notna() & target_encoded.notna()

    if mask.sum() > 10:
        values = df_clean.loc[mask, col]
        if values.std() > 1e-10:
            corr = np.corrcoef(values, target_encoded[mask])[0, 1]
            if not np.isnan(corr):
                correlations.append((col, corr, abs(corr)))

correlation_df = pd.DataFrame(
    correlations,
    columns=['feature', 'correlation', 'abs_correlation']
).sort_values('abs_correlation', ascending=False)

print(f"\nValid features for correlation: {len(correlation_df)}")
print("\nTop 15 features by Pearson correlation:")
print(correlation_df.head(15)[['feature', 'correlation', 'abs_correlation']].to_string(index=False))

# ============================================================================
# METRIC 2: MUTUAL INFORMATION (NON-LINEAR)
# ============================================================================

print("\n" + "-"*80)
print("METRIC 2: Mutual Information (non-linear relationships)")
print("-"*80)

features_complete = []
for col in numeric_features:
    if df_clean[col].notna().sum() > 100:
        features_complete.append(col)

df_mi = df_clean[features_complete + [target]].dropna()

if len(df_mi) > 100:
    X_mi = df_mi[features_complete]
    y_mi = df_mi[target].map({'FALSE POSITIVE': 0, 'CANDIDATE': 1, 'CONFIRMED': 2})

    mi_scores = mutual_info_classif(X_mi, y_mi, random_state=42, n_neighbors=5)

    mi_df = pd.DataFrame({
        'feature': features_complete,
        'mutual_information': mi_scores
    }).sort_values('mutual_information', ascending=False)

    print(f"\nFeatures analyzed with MI: {len(mi_df)}")
    print("\nTop 15 features by Mutual Information:")
    print(mi_df.head(15).to_string(index=False))
else:
    print("Insufficient complete data for mutual information analysis")
    mi_df = pd.DataFrame(columns=['feature', 'mutual_information'])

# ============================================================================
# METRIC 3: RANDOM FOREST IMPORTANCE (NON-LINEAR + INTERACTIONS)
# ============================================================================

print("\n" + "-"*80)
print("METRIC 3: Random Forest Importance (non-linear + interactions)")
print("-"*80)

if len(df_mi) > 100:
    rf_quick = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        random_state=42,
        n_jobs=-1
    )
    rf_quick.fit(X_mi, y_mi)

    rf_importance_df = pd.DataFrame({
        'feature': features_complete,
        'rf_importance': rf_quick.feature_importances_
    }).sort_values('rf_importance', ascending=False)

    print(f"\nFeatures analyzed with RF: {len(rf_importance_df)}")
    print("\nTop 15 features by Random Forest Importance:")
    print(rf_importance_df.head(15).to_string(index=False))
else:
    print("Insufficient complete data for Random Forest analysis")
    rf_importance_df = pd.DataFrame(columns=['feature', 'rf_importance'])

# ============================================================================
# COMBINED RANKING
# ============================================================================

print("\n" + "="*80)
print("COMBINED FEATURE RANKING")
print("="*80)

combined = correlation_df[['feature', 'abs_correlation']].copy()

if not mi_df.empty:
    combined = combined.merge(mi_df[['feature', 'mutual_information']], on='feature', how='outer')

if not rf_importance_df.empty:
    combined = combined.merge(rf_importance_df[['feature', 'rf_importance']], on='feature', how='outer')

# Normalize each metric to 0-1 range
for col in ['abs_correlation', 'mutual_information', 'rf_importance']:
    if col in combined.columns:
        combined[f'{col}_norm'] = (
            (combined[col] - combined[col].min()) /
            (combined[col].max() - combined[col].min() + 1e-10)
        )

norm_cols = [c for c in combined.columns if c.endswith('_norm')]
combined['avg_score'] = combined[norm_cols].mean(axis=1)
combined = combined.sort_values('avg_score', ascending=False)

print("\nTop 20 features by combined score:")
print(combined.head(20)[['feature', 'abs_correlation', 'mutual_information',
                         'rf_importance', 'avg_score']].to_string(index=False))

# ============================================================================
# PHYSICAL FEATURE VALIDATION
# ============================================================================

print("\n" + "="*80)
print("PHYSICAL FEATURE VALIDATION")
print("="*80)

critical_features = ['koi_period', 'koi_duration', 'koi_depth', 'koi_prad',
                     'koi_impact', 'koi_teq', 'koi_insol']
critical_present = [f for f in critical_features if f in combined['feature'].values]

print(f"\nCritical physical features: {len(critical_present)}/{len(critical_features)}")
print("\nRanking of critical physical features:")

for feat in critical_present:
    row = combined[combined['feature'] == feat].iloc[0]
    rank = combined[combined['feature'] == feat].index[0] + 1
    print(f"\n{feat}: Rank {rank}/{len(combined)}")
    if 'abs_correlation' in combined.columns:
        print(f"  Correlation: {row['abs_correlation']:.4f}")
    if 'mutual_information' in combined.columns:
        print(f"  Mutual Info: {row['mutual_information']:.4f}")
    if 'rf_importance' in combined.columns:
        print(f"  RF Importance: {row['rf_importance']:.4f}")
    print(f"  Combined Score: {row['avg_score']:.4f}")

# ============================================================================
# RECOMMENDATIONS
# ============================================================================

print("\n" + "="*80)
print("FEATURE SELECTION RECOMMENDATIONS")
print("="*80)

top_combined = combined.head(30)['feature'].tolist()
recommended_features = list(set(top_combined + critical_present))

print(f"\nRecommended features: {len(recommended_features)}")
print(f"\nFeature list:")
for i, feat in enumerate(sorted(recommended_features), 1):
    print(f"  {i:2d}. {feat}")

print("\n" + "="*80)
print("ANALYSIS COMPLETE")
print("="*80)

FEATURE RELEVANCE ANALYSIS - MULTIPLE METRICS

Leakage columns removed: []
Remaining columns: 117
Numeric features to analyze: 98

--------------------------------------------------------------------------------
METRIC 1: Pearson Correlation (linear relationship)
--------------------------------------------------------------------------------

Valid features for correlation: 95

Top 15 features by Pearson correlation:
         feature  correlation  abs_correlation
       koi_count     0.436286         0.436286
   koi_smet_err2     0.414951         0.414951
  koi_steff_err1    -0.391724         0.391724
  koi_dicco_msky    -0.377220         0.377220
   koi_smet_err1    -0.373254         0.373254
  koi_dikco_msky    -0.369599         0.369599
  koi_steff_err2     0.354306         0.354306
        koi_incl     0.318589         0.318589
  koi_smass_err1    -0.312688         0.312688
koi_num_transits    -0.299913         0.299913
        koi_smet     0.294549         0.294549
         koi_t

In [12]:
# Estadísticas descriptivas de top 6 features por clase
top_features = correlations_df.head(6)['feature'].tolist()

for feature in top_features:
    print(f"\n{'='*70}")
    print(f"Feature: {feature}")
    print(f"{'='*70}")
    print(df_clean.groupby(target)[feature].describe().round(3))

    # Información adicional
    print(f"\nNulos por clase:")
    null_by_class = df_clean.groupby(target)[feature].apply(lambda x: x.isna().sum())
    print(null_by_class)


Feature: koi_count
                  count   mean    std  min  25%  50%  75%  max
koi_disposition                                               
CANDIDATE        1979.0  1.358  0.762  1.0  1.0  1.0  1.0  7.0
CONFIRMED        2746.0  1.996  1.206  1.0  1.0  2.0  3.0  7.0
FALSE POSITIVE   4839.0  1.091  0.384  1.0  1.0  1.0  1.0  7.0

Nulos por clase:
koi_disposition
CANDIDATE         0
CONFIRMED         0
FALSE POSITIVE    0
Name: koi_count, dtype: int64

Feature: koi_smet_err2
                  count   mean    std   min  25%   50%   75%  max
koi_disposition                                                  
CANDIDATE        1872.0 -0.254  0.077 -0.60 -0.3 -0.30 -0.15  0.0
CONFIRMED        2744.0 -0.200  0.076 -0.70 -0.3 -0.15 -0.15  0.0
FALSE POSITIVE   4561.0 -0.283  0.079 -0.75 -0.3 -0.30 -0.28  0.0

Nulos por clase:
koi_disposition
CANDIDATE         107
CONFIRMED           2
FALSE POSITIVE    278
Name: koi_smet_err2, dtype: int64

Feature: koi_steff_err1
                  count     

In [13]:
# Verificar si features físicas principales (period, duration, depth) son informativas
physical_features = ['koi_period', 'koi_duration', 'koi_depth', 'koi_prad', 'koi_impact']
physical_features = [f for f in physical_features if f in df_clean.columns]

for feature in physical_features:
    print(f"\n{'='*70}")
    print(f"Feature: {feature}")
    print(f"{'='*70}")
    print(df_clean.groupby(target)[feature].describe().round(3))


Feature: koi_period
                  count     mean       std    min    25%     50%      75%  \
koi_disposition                                                             
CANDIDATE        1979.0  167.697  2924.815  0.260  6.299  20.020  118.391   
CONFIRMED        2746.0   27.910    56.539  0.342  5.115  11.350   26.354   
FALSE POSITIVE   4839.0   65.139   131.497  0.242  1.387   5.244   36.293   

                        max  
koi_disposition              
CANDIDATE        129995.778  
CONFIRMED          1071.233  
FALSE POSITIVE     1064.268  

Feature: koi_duration
                  count   mean    std    min    25%    50%    75%      max
koi_disposition                                                           
CANDIDATE        1979.0  5.271  4.865  0.052  2.226  3.608  6.306   44.350
CONFIRMED        2746.0  4.246  2.742  0.428  2.415  3.490  5.270   24.708
FALSE POSITIVE   4839.0  6.546  8.177  0.105  2.518  4.057  7.160  138.540

Feature: koi_depth
                  count  

In [14]:
# Verificar si features físicas principales (period, duration, depth) son informativas
physical_features = ['koi_period', 'koi_duration', 'koi_depth', 'koi_prad', 'koi_impact']
physical_features = [f for f in physical_features if f in df_clean.columns]

for feature in physical_features:
    print(f"\n{'='*70}")
    print(f"Feature: {feature}")
    print(f"{'='*70}")
    print(df_clean.groupby(target)[feature].describe().round(3))


Feature: koi_period
                  count     mean       std    min    25%     50%      75%  \
koi_disposition                                                             
CANDIDATE        1979.0  167.697  2924.815  0.260  6.299  20.020  118.391   
CONFIRMED        2746.0   27.910    56.539  0.342  5.115  11.350   26.354   
FALSE POSITIVE   4839.0   65.139   131.497  0.242  1.387   5.244   36.293   

                        max  
koi_disposition              
CANDIDATE        129995.778  
CONFIRMED          1071.233  
FALSE POSITIVE     1064.268  

Feature: koi_duration
                  count   mean    std    min    25%    50%    75%      max
koi_disposition                                                           
CANDIDATE        1979.0  5.271  4.865  0.052  2.226  3.608  6.306   44.350
CONFIRMED        2746.0  4.246  2.742  0.428  2.415  3.490  5.270   24.708
FALSE POSITIVE   4839.0  6.546  8.177  0.105  2.518  4.057  7.160  138.540

Feature: koi_depth
                  count  

In [15]:
# Patrón de nulos por clase
print("Análisis de nulos por clase:\n")
null_by_class = df_clean.groupby(target).apply(lambda x: x.isnull().sum().sum())
total_by_class = df_clean.groupby(target).size() * len(df_clean.columns)
null_pct = (null_by_class / total_by_class * 100).round(2)

print(pd.DataFrame({'total_nulls': null_by_class, 'pct_nulls': null_pct}))

Análisis de nulos por clase:

                 total_nulls  pct_nulls
koi_disposition                        
CANDIDATE              13412       5.79
CONFIRMED               1135       0.35
FALSE POSITIVE         39339       6.95






In [16]:
# Verificar impacto del filtrado primero
df_model = df_clean[
    (df_clean['koi_depth'] < 100000) &
    (df_clean['koi_prad'] < 100) &
    (df_clean['koi_impact'] <= 1.5) &
    (df_clean['koi_period'] < 1000)
].copy()

print(f"Filas originales: {len(df_clean)}")
print(f"Filas después de filtrar: {len(df_model)}")
print(f"Pérdida: {(1 - len(df_model)/len(df_clean))*100:.2f}%\n")
print("Balance post-filtrado:")
print(df_model[target].value_counts(normalize=True).mul(100).round(2))

Filas originales: 9564
Filas después de filtrar: 8308
Pérdida: 13.13%

Balance post-filtrado:
koi_disposition
FALSE POSITIVE    44.84
CONFIRMED         33.02
CANDIDATE         22.15
Name: proportion, dtype: float64


In [17]:
# Seleccionar features con <10% nulos en df_model
null_pct_model = (df_model.isnull().sum() / len(df_model) * 100)
usable_features = null_pct_model[null_pct_model < 10].index.tolist()

# Remover target y metadata
exclude = ['koi_disposition', 'rowid', 'kepid', 'kepoi_name', 'kepler_name',
           'koi_vet_stat', 'koi_vet_date', 'koi_disp_prov', 'koi_comment', 'koi_pdisposition']
usable_features = [f for f in usable_features if f not in exclude]

print(f"Features utilizables: {len(usable_features)}\n")

# Seleccionar top 20 por correlación de los usables
top_20 = correlations_df[correlations_df['feature'].isin(usable_features)].head(20)['feature'].tolist()

print("Top 20 features seleccionadas:")
for i, feat in enumerate(top_20, 1):
    nulls = df_model[feat].isna().sum()
    print(f"{i:2d}. {feat:20s} - {nulls} nulos ({nulls/len(df_model)*100:.1f}%)")

Features utilizables: 102

Top 20 features seleccionadas:
 1. koi_count            - 0 nulos (0.0%)
 2. koi_smet_err2        - 21 nulos (0.3%)
 3. koi_steff_err1       - 67 nulos (0.8%)
 4. koi_dicco_msky       - 464 nulos (5.6%)
 5. koi_smet_err1        - 21 nulos (0.3%)
 6. koi_dikco_msky       - 438 nulos (5.3%)
 7. koi_steff_err2       - 82 nulos (1.0%)
 8. koi_incl             - 1 nulos (0.0%)
 9. koi_smass_err1       - 67 nulos (0.8%)
10. koi_smet             - 21 nulos (0.3%)
11. koi_teq              - 0 nulos (0.0%)
12. koi_depth            - 0 nulos (0.0%)
13. koi_slogg_err2       - 67 nulos (0.8%)
14. koi_fwm_stat_sig     - 688 nulos (8.3%)
15. koi_model_snr        - 0 nulos (0.0%)
16. koi_tce_plnt_num     - 236 nulos (2.8%)
17. koi_smass_err2       - 67 nulos (0.8%)
18. koi_steff            - 0 nulos (0.0%)
19. koi_ldm_coeff1       - 0 nulos (0.0%)
20. koi_smass            - 0 nulos (0.0%)


In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (classification_report, confusion_matrix,
                            f1_score, accuracy_score, cohen_kappa_score,
                            balanced_accuracy_score, make_scorer)
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# ============================================================================
# 1. CARGA Y PREPARACIÓN DE DATOS
# ============================================================================

df = pd.read_csv('cumulative_2025.10.01_17.21.27.csv', comment='#', low_memory=False)

print(f"Dataset original: {df.shape}")
print(f"\nDistribución de clases:\n{df['koi_disposition'].value_counts()}")

# Features (ajusta a las tuyas)
features = [
    'koi_period', 'koi_duration', 'koi_depth', 'koi_prad', 'koi_teq',
    'koi_insol', 'koi_model_snr', 'koi_steff', 'koi_srad', 'koi_slogg',
    'koi_count', 'koi_num_transits', 'ra', 'dec', 'koi_kepmag',
    'koi_incl', 'koi_impact', 'koi_duration_err1', 'koi_period_err1',
    'koi_time0bk_err1'
]

available_features = [f for f in features if f in df.columns]
print(f"\nFeatures disponibles: {len(available_features)}")

X = df[available_features].fillna(df[available_features].median())
y = df['koi_disposition']

# Mapear a números
label_map = {'FALSE POSITIVE': 0, 'CANDIDATE': 1, 'CONFIRMED': 2}
y_num = y.map(label_map)

# ============================================================================
# 2. SPLIT ESTRATIFICADO (80/20 para train+val / test)
# ============================================================================

X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y_num, test_size=0.20, random_state=42, stratify=y_num
)

print(f"\nTrain+Val: {len(X_train_val)}, Test: {len(X_test)}")

# Escalar
scaler = StandardScaler()
X_train_val_scaled = scaler.fit_transform(X_train_val)
X_test_scaled = scaler.transform(X_test)

# ============================================================================
# 3. CROSS-VALIDATION EVALUATION (5-FOLD)
# ============================================================================

print("\n" + "="*80)
print("EVALUACIÓN CON CROSS-VALIDATION (5-fold)")
print("="*80)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Función para evaluar con diferentes configuraciones
def evaluate_with_cv(X, y, model, cv, balance_method=None):
    """
    Evalúa modelo con cross-validation, opcionalmente con balanceo.
    """
    scores = {
        'accuracy': [],
        'f1_macro': [],
        'f1_weighted': [],
        'balanced_accuracy': []
    }

    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
        X_fold_train = X[train_idx]
        y_fold_train = y.iloc[train_idx]
        X_fold_val = X[val_idx]
        y_fold_val = y.iloc[val_idx]

        # Aplicar balanceo si se especifica
        if balance_method == 'smote':
            smote = SMOTE(random_state=42, k_neighbors=5)
            X_fold_train, y_fold_train = smote.fit_resample(X_fold_train, y_fold_train)
        elif balance_method == 'borderline-smote':
            bsmote = BorderlineSMOTE(random_state=42, k_neighbors=5)
            X_fold_train, y_fold_train = bsmote.fit_resample(X_fold_train, y_fold_train)

        # Entrenar y predecir
        model.fit(X_fold_train, y_fold_train)
        y_pred = model.predict(X_fold_val)

        # Calcular métricas
        scores['accuracy'].append(accuracy_score(y_fold_val, y_pred))
        scores['f1_macro'].append(f1_score(y_fold_val, y_pred, average='macro'))
        scores['f1_weighted'].append(f1_score(y_fold_val, y_pred, average='weighted'))
        scores['balanced_accuracy'].append(balanced_accuracy_score(y_fold_val, y_pred))

        print(f"  Fold {fold}: Acc={scores['accuracy'][-1]:.4f}, "
              f"F1_macro={scores['f1_macro'][-1]:.4f}")

    # Promedios
    results = {
        metric: {
            'mean': np.mean(values),
            'std': np.std(values)
        }
        for metric, values in scores.items()
    }

    return results

# Modelo base
base_model = xgb.XGBClassifier(
    objective='multi:softprob',
    num_class=3,
    max_depth=6,
    learning_rate=0.1,
    n_estimators=300,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='mlogloss'
)

# Evaluar sin balanceo
print("\nSIN BALANCEO:")
results_no_balance = evaluate_with_cv(X_train_val_scaled, y_train_val,
                                      base_model, skf, balance_method=None)

print(f"\nResultados promedio (5-fold):")
for metric, stats in results_no_balance.items():
    print(f"  {metric}: {stats['mean']:.4f} (+/- {stats['std']:.4f})")

# Evaluar con SMOTE
print("\n" + "-"*80)
print("CON SMOTE:")
results_smote = evaluate_with_cv(X_train_val_scaled, y_train_val,
                                 base_model, skf, balance_method='smote')

print(f"\nResultados promedio (5-fold):")
for metric, stats in results_smote.items():
    print(f"  {metric}: {stats['mean']:.4f} (+/- {stats['std']:.4f})")

# Evaluar con Borderline-SMOTE
print("\n" + "-"*80)
print("CON BORDERLINE-SMOTE:")
results_bsmote = evaluate_with_cv(X_train_val_scaled, y_train_val,
                                  base_model, skf, balance_method='borderline-smote')

print(f"\nResultados promedio (5-fold):")
for metric, stats in results_bsmote.items():
    print(f"  {metric}: {stats['mean']:.4f} (+/- {stats['std']:.4f})")

# ============================================================================
# 4. SELECCIONAR MEJOR ENFOQUE
# ============================================================================

approaches = {
    'Sin balanceo': results_no_balance['f1_macro']['mean'],
    'SMOTE': results_smote['f1_macro']['mean'],
    'Borderline-SMOTE': results_bsmote['f1_macro']['mean']
}

best_approach = max(approaches.items(), key=lambda x: x[1])
print(f"\n{'='*80}")
print(f"Mejor enfoque: {best_approach[0]} (F1 macro: {best_approach[1]:.4f})")
print(f"{'='*80}")

# Preparar datos según mejor enfoque
if best_approach[0] == 'SMOTE':
    smote = SMOTE(random_state=42, k_neighbors=5)
    X_train_final, y_train_final = smote.fit_resample(X_train_val_scaled, y_train_val)
elif best_approach[0] == 'Borderline-SMOTE':
    bsmote = BorderlineSMOTE(random_state=42, k_neighbors=5)
    X_train_final, y_train_final = bsmote.fit_resample(X_train_val_scaled, y_train_val)
else:
    X_train_final = X_train_val_scaled
    y_train_final = y_train_val

# ============================================================================
# 5. HYPERPARAMETER TUNING CON GRIDSEARCHCV
# ============================================================================

print("\n" + "="*80)
print("HYPERPARAMETER TUNING")
print("="*80)

param_grid = {
    'max_depth': [4, 6, 8],
    'learning_rate': [0.05, 0.1, 0.2],
    'n_estimators': [200, 300, 500],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9]
}

xgb_tuned = xgb.XGBClassifier(
    objective='multi:softprob',
    num_class=3,
    random_state=42,
    eval_metric='mlogloss'
)

# GridSearch con 5-fold CV
grid_search = GridSearchCV(
    xgb_tuned,
    param_grid,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train_final, y_train_final)

print(f"\nMejores parámetros: {grid_search.best_params_}")
print(f"Mejor CV F1 macro: {grid_search.best_score_:.4f}")

best_model = grid_search.best_estimator_

# ============================================================================
# 6. EVALUACIÓN FINAL EN TEST SET
# ============================================================================

print("\n" + "="*80)
print("EVALUACIÓN FINAL EN TEST SET")
print("="*80)

y_test_pred = best_model.predict(X_test_scaled)
y_test_proba = best_model.predict_proba(X_test_scaled)

class_names = ['FALSE POSITIVE', 'CANDIDATE', 'CONFIRMED']

print(f"\nAccuracy: {accuracy_score(y_test, y_test_pred):.4f}")
print(f"Macro F1: {f1_score(y_test, y_test_pred, average='macro'):.4f}")
print(f"Weighted F1: {f1_score(y_test, y_test_pred, average='weighted'):.4f}")
print(f"Cohen's Kappa: {cohen_kappa_score(y_test, y_test_pred):.4f}")
print(f"Balanced Accuracy: {balanced_accuracy_score(y_test, y_test_pred):.4f}")

# F1 por clase
f1_per_class = f1_score(y_test, y_test_pred, average=None)
print(f"\nF1 Score por clase:")
for i, name in enumerate(class_names):
    print(f"  {name}: {f1_per_class[i]:.4f}")

# Classification report
print(f"\nClassification Report:")
print(classification_report(y_test, y_test_pred, target_names=class_names))

# Confusion Matrix
cm = confusion_matrix(y_test, y_test_pred)
print(f"\nConfusion Matrix:")
print(cm)

print(f"\nAccuracy por clase:")
for i, name in enumerate(class_names):
    class_acc = cm[i, i] / cm[i, :].sum()
    print(f"  {name}: {class_acc:.4f}")

# ============================================================================
# 7. VISUALIZACIONES
# ============================================================================

# Confusion Matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion Matrix - Test Set')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.savefig('confusion_matrix.png', dpi=300)
plt.close()

# Feature Importance
feature_importance = pd.DataFrame({
    'feature': available_features,
    'importance': best_model.feature_importances_
}).sort_values('importance', ascending=False)

plt.figure(figsize=(12, 8))
plt.barh(feature_importance['feature'].head(15),
         feature_importance['importance'].head(15))
plt.xlabel('Importance')
plt.title('Top 15 Feature Importances')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.savefig('feature_importance.png', dpi=300)
plt.close()

print(f"\nTop 10 Features:")
print(feature_importance.head(10).to_string(index=False))

# ============================================================================
# 8. ANÁLISIS DE CONFIANZA
# ============================================================================

print(f"\n{'='*80}")
print("ANÁLISIS DE CONFIANZA")
print(f"{'='*80}")

max_proba = y_test_proba.max(axis=1)
confidence_bins = pd.cut(max_proba,
                         bins=[0, 0.5, 0.7, 0.85, 1.0],
                         labels=['Baja', 'Media', 'Alta', 'Muy Alta'])

for conf_level in ['Baja', 'Media', 'Alta', 'Muy Alta']:
    mask = confidence_bins == conf_level
    if mask.sum() > 0:
        acc = (y_test[mask].values == y_test_pred[mask]).mean()
        count = mask.sum()
        print(f"{conf_level}: {count} muestras ({count/len(y_test)*100:.1f}%), "
              f"Accuracy: {acc:.4f}")

# ============================================================================
# 9. GUARDAR MODELO Y COMPONENTES
# ============================================================================

joblib.dump(best_model, 'xgboost_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(label_map, 'label_map.pkl')
joblib.dump(available_features, 'features.pkl')

# Guardar resultados
results_summary = {
    'best_approach': best_approach[0],
    'cv_results': {
        'no_balance': results_no_balance,
        'smote': results_smote,
        'borderline_smote': results_bsmote
    },
    'best_params': grid_search.best_params_,
    'test_metrics': {
        'accuracy': accuracy_score(y_test, y_test_pred),
        'f1_macro': f1_score(y_test, y_test_pred, average='macro'),
        'f1_weighted': f1_score(y_test, y_test_pred, average='weighted'),
        'f1_per_class': f1_per_class.tolist(),
        'confusion_matrix': cm.tolist()
    }
}

joblib.dump(results_summary, 'results_summary.pkl')

print(f"\n{'='*80}")
print("ARCHIVOS GUARDADOS")
print(f"{'='*80}")
print("- xgboost_model.pkl")
print("- scaler.pkl")
print("- label_map.pkl")
print("- features.pkl")
print("- results_summary.pkl")
print("- confusion_matrix.png")
print("- feature_importance.png")

print(f"\n{'='*80}")
print("PIPELINE COMPLETO FINALIZADO")
print(f"{'='*80}")

Dataset original: (9564, 141)

Distribución de clases:
koi_disposition
FALSE POSITIVE    4839
CONFIRMED         2746
CANDIDATE         1979
Name: count, dtype: int64

Features disponibles: 20

Train+Val: 7651, Test: 1913

EVALUACIÓN CON CROSS-VALIDATION (5-fold)

SIN BALANCEO:
  Fold 1: Acc=0.8047, F1_macro=0.7621
  Fold 2: Acc=0.7941, F1_macro=0.7439
  Fold 3: Acc=0.8052, F1_macro=0.7639
  Fold 4: Acc=0.7935, F1_macro=0.7565
  Fold 5: Acc=0.8131, F1_macro=0.7777

Resultados promedio (5-fold):
  accuracy: 0.8021 (+/- 0.0074)
  f1_macro: 0.7608 (+/- 0.0110)
  f1_weighted: 0.7977 (+/- 0.0084)
  balanced_accuracy: 0.7587 (+/- 0.0117)

--------------------------------------------------------------------------------
CON SMOTE:
  Fold 1: Acc=0.7936, F1_macro=0.7574
  Fold 2: Acc=0.7941, F1_macro=0.7613
  Fold 3: Acc=0.7954, F1_macro=0.7623
  Fold 4: Acc=0.7771, F1_macro=0.7467
  Fold 5: Acc=0.7980, F1_macro=0.7734

Resultados promedio (5-fold):
  accuracy: 0.7917 (+/- 0.0074)
  f1_macro: 0.7