In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from xgboost import XGBClassifier

In [None]:
# Load data
df = pd.read_csv('train.csv')
df['labels'] = [0 if x == 'nonevent' else 1 for x in df['class4']]

# Features and labels
X = df.drop(columns=['id', 'class4', 'labels', 'date'])
y = df['labels']

# Split data into training and testing sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# Logistic with $L_1$ Regularization

In [None]:
# Create a pipeline with scaling and logistic regression
pipe_l1 = Pipeline([('scaler', StandardScaler()),  
                    ('clf', LogisticRegression(penalty='l1', solver='saga', C=1.0, max_iter=5000, random_state=42))
])

In [114]:
# Hyperparameter tuning using GridSearchCV
param_grid = {'clf__C': [0.01, 0.05, 0.1, 0.5, 1.0, 1.5, 2.0]}

grid = GridSearchCV(pipe_l1, param_grid, cv=5, scoring='roc_auc')
grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)



{'clf__C': 0.5}
0.9494320987654321


In [115]:
# Train the model with the best parameters
pipe_l1 = Pipeline([('scaler', StandardScaler()),  
                    ('clf', LogisticRegression(penalty='l1', solver='saga', C=0.5, max_iter=5000, random_state=42))
]) 
pipe_l1.fit(X_train, y_train)

# Evaluate the model
y_pred = pipe_l1.predict(X_valid)
y_prob = pipe_l1.predict_proba(X_valid)[:, 1]
print("Accuracy:", accuracy_score(y_valid, y_pred))
print("ROC-AUC:", roc_auc_score(y_valid, y_prob))

# Feature selection based on L1 regularization
coef = pipe_l1.named_steps['clf'].coef_[0]
selected_features = X.columns[coef != 0]
print("\nFeatures selected by L1 regularization:", len(selected_features))
print("Selected features by L1:", selected_features)

Accuracy: 0.9
ROC-AUC: 0.9446913580246913

Features selected by L1 regularization: 29
Selected features by L1: Index(['CO242.mean', 'Glob.mean', 'H2O336.mean', 'H2O504.mean', 'H2O672.mean',
       'NO42.std', 'NOx42.std', 'NOx504.std', 'O342.mean', 'O342.std',
       'O3672.mean', 'O384.mean', 'O384.std', 'Pamb0.mean', 'Pamb0.std',
       'PTG.mean', 'PTG.std', 'RGlob.mean', 'RHIRGA42.std', 'RHIRGA504.mean',
       'RHIRGA672.mean', 'SO2168.mean', 'SO2168.std', 'SWS.mean', 'T42.mean',
       'T504.mean', 'T672.mean', 'CS.mean', 'CS.std'],
      dtype='object')


In [116]:
# Use selected features to train a new model
X_selected = df[selected_features]
pipe_l1_selected = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(penalty='l1', solver='saga', C=1.0, max_iter=5000, random_state=42))
])
pipe_l1_selected.fit(X_selected, y)

# Evaluate the model
y_pred = pipe_l1_selected.predict(X_valid[selected_features])
y_prob = pipe_l1_selected.predict_proba(X_valid[selected_features])[:, 1]
print("Accuracy:", accuracy_score(y_valid, y_pred))
print("ROC-AUC:", roc_auc_score(y_valid, y_prob))

Accuracy: 0.9
ROC-AUC: 0.9525925925925927


In [117]:
# Predict on test set
df_test = pd.read_csv('test.csv')
X_test = df_test[selected_features]
y_test_prob = pipe_l1_selected.predict_proba(X_test)[:, 1]

# Binary class hard prediction
y_test_pred_class2 = (y_test_prob >= 0.5).astype(int)

# Map to class4 (simple strategy: all events are considered Ia)
class4_pred = np.where(y_test_pred_class2 == 1, 'Ia', 'nonevent')

# Submission file
submission = pd.DataFrame({
    'id': df_test['id'],
    'class4': class4_pred,
    'p': y_test_prob
})
submission.to_csv('submission_logistic.csv', index=False)

# Random Forest

In [None]:
# Random Forest with Hyperparameter Tuning
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

# Hayperparameter grid
param_grid = {
    'n_estimators': [200, 500],
    'max_depth': [None, 10, 20],
    'max_features': ['sqrt', 0.5, 1.0],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': [None, 'balanced']
}

grid = GridSearchCV(rf, param_grid, cv=5, scoring='roc_auc', n_jobs=-1, verbose=2)

grid.fit(X, y)
print(grid.best_params_)
print(grid.best_score_)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END class_weight=None, max_depth=None, max_features=sqrt, min_samples_leaf=1, n_estimators=200; total time=   0.4s
[CV] END class_weight=None, max_depth=None, max_features=sqrt, min_samples_leaf=1, n_estimators=200; total time=   0.5s
[CV] END class_weight=None, max_depth=None, max_features=sqrt, min_samples_leaf=1, n_estimators=200; total time=   0.5s
[CV] END class_weight=None, max_depth=None, max_features=sqrt, min_samples_leaf=1, n_estimators=200; total time=   0.5s
[CV] END class_weight=None, max_depth=None, max_features=sqrt, min_samples_leaf=1, n_estimators=200; total time=   0.6s
[CV] END class_weight=None, max_depth=None, max_features=sqrt, min_samples_leaf=2, n_estimators=200; total time=   0.4s
[CV] END class_weight=None, max_depth=None, max_features=sqrt, min_samples_leaf=2, n_estimators=200; total time=   0.4s
[CV] END class_weight=None, max_depth=None, max_features=sqrt, min_samples_leaf=1, n_estimators=5

In [None]:
# Train the Random Forest with the best parameters
rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=10,          
    min_samples_split=2,
    min_samples_leaf=2,
    max_features='sqrt',     
    bootstrap=True,
    random_state=42,
    n_jobs=-1                
)

rf.fit(X_train, y_train)

0,1,2
,n_estimators,500
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
# Evaluate the Random Forest model
y_valid_pred = rf.predict(X_valid)
y_valid_prob = rf.predict_proba(X_valid)[:, 1]

print("Random Forest Accuracy:", accuracy_score(y_valid, y_valid_pred))
print("Random Forest AUC:", roc_auc_score(y_valid, y_valid_prob))

Random Forest Accuracy: 0.8666666666666667
Random Forest AUC: 0.9323456790123457


In [122]:
# Predict on test set
df_test = pd.read_csv('test.csv')
X_test = df_test[selected_features]
y_test_prob_rf = pipe_l1_selected.predict_proba(X_test)[:, 1]

# Binary class hard prediction
y_test_pred_rf = (y_test_prob_rf >= 0.5).astype(int)

# Map to class4 (simple strategy: all events are considered Ia)
class4_pred = np.where(y_test_pred_rf == 1, 'Ia', 'nonevent')

submission_rf = pd.DataFrame({
    'id': df_test['id'],
    'class4': class4_pred,
    'p': y_test_prob_rf
})

submission_rf.to_csv('submission_rf.csv', index=False)

In [151]:
# Obtain the important features from Random Forest
feature_importances = rf.feature_importances_
importance_df = pd.DataFrame({
    'feature': X.columns,
    'importance': feature_importances
}).sort_values('importance', ascending=False)

print("Top 20 Most Important Features:")
print(importance_df.head(20))

Top 20 Most Important Features:
           feature  importance
67  RHIRGA168.mean    0.058360
73  RHIRGA504.mean    0.056455
69  RHIRGA336.mean    0.052306
71   RHIRGA42.mean    0.050900
77   RHIRGA84.mean    0.049793
75  RHIRGA672.mean    0.043025
78    RHIRGA84.std    0.024097
72    RHIRGA42.std    0.023264
9        Glob.mean    0.022246
99         CS.mean    0.021248
10        Glob.std    0.019970
13     H2O336.mean    0.019965
61        PAR.mean    0.019323
68   RHIRGA168.std    0.018911
11     H2O168.mean    0.017897
17     H2O504.mean    0.017275
24         NET.std    0.017139
15      H2O42.mean    0.016313
19     H2O672.mean    0.015461
23        NET.mean    0.014546


# XGBoost

In [120]:
# XGBoost Classifier
xgb = XGBClassifier(
    n_estimators=600,        
    max_depth=4,          
    learning_rate=0.03,     
    subsample=0.8,          
    colsample_bytree=0.8,  
    objective='binary:logistic',
    eval_metric='logloss',  
    random_state=42,
    n_jobs=-1
)

xgb.fit(X_train, y_train)

# Evaluate the XGBoost model
y_valid_prob = xgb.predict_proba(X_valid)[:, 1]
y_valid_pred = (y_valid_prob >= 0.5).astype(int)
print("Accuracy:", accuracy_score(y_valid, y_valid_pred))
print("AUC:", roc_auc_score(y_valid, y_valid_prob))
print("Log-loss:", log_loss(y_valid, y_valid_prob))

Accuracy: 0.8666666666666667
AUC: 0.9264197530864198
Log-loss: 0.40732917191029167


In [124]:
# Predict on test set
df_test = pd.read_csv('test.csv')
X_test = df_test[selected_features]
y_test_prob_xgb = pipe_l1_selected.predict_proba(X_test)[:, 1]

# Binary class hard prediction
y_test_pred_xgb = (y_test_prob_xgb >= 0.5).astype(int)

# Map to class4 (simple strategy: all events are considered Ia)
class4_pred = np.where(y_test_pred_xgb == 1, 'Ia', 'nonevent')

submission_xgb = pd.DataFrame({
    'id': df_test['id'],
    'class4': class4_pred,
    'p': y_test_prob_xgb
})

submission_xgb.to_csv('submission_xgb.csv', index=False)

# SVM

In [None]:
# Hyperparameter tuning for SVM with L1 regularization
param_grid = {'clf__C': [0.01, 0.05, 0.1, 0.5, 1.0]}

grid_svm = GridSearchCV(estimator=svm_l1, param_grid=param_grid, cv=5, scoring='roc_auc', n_jobs=-1, verbose=2)

grid_svm.fit(X, y)

print("Best params:", grid_svm.best_params_)
print("Best CV score:", grid_svm.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END ............clf__C=0.01, clf__class_weight=balanced; total time=   0.0s
[CV] END ................clf__C=0.01, clf__class_weight=None; total time=   0.0s
[CV] END ................clf__C=0.01, clf__class_weight=None; total time=   0.0s
[CV] END ................clf__C=0.01, clf__class_weight=None; total time=   0.0s
[CV] END ................clf__C=0.01, clf__class_weight=None; total time=   0.0s
[CV] END ............clf__C=0.01, clf__class_weight=balanced; total time=   0.0s
[CV] END ................clf__C=0.01, clf__class_weight=None; total time=   0.0s
[CV] END ............clf__C=0.01, clf__class_weight=balanced; total time=   0.0s
[CV] END ................clf__C=0.05, clf__class_weight=None; total time=   0.0s
[CV] END ............clf__C=0.01, clf__class_weight=balanced; total time=   0.0s
[CV] END ............clf__C=0.01, clf__class_weight=balanced; total time=   0.0s
[CV] END ................clf__C=0.05, clf__class



In [129]:
# Train the model with the best parameters
svm_l1 = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LinearSVC(penalty='l1', loss='squared_hinge', dual=False, C=0.1, max_iter=5000, random_state=42))
])

svm_l1.fit(X_train, y_train)

# Evaluate the model
y_valid_pred = svm_l1.predict(X_valid)
print("Accuracy:", accuracy_score(y_valid, y_valid_pred))
print("AUC (using decision_function):", 
      roc_auc_score(y_valid, svm_l1.decision_function(X_valid)))

# Selected features based on SVM with L1 regularization
coef = svm_l1.named_steps['clf'].coef_[0]
selected_features_svm = X.columns[coef != 0]
print("Number of selected features:", len(selected_features_svm))
print(selected_features_svm)

Accuracy: 0.9
AUC (using decision_function): 0.9432098765432099
Number of selected features: 20
Index(['CO242.mean', 'Glob.mean', 'H2O672.mean', 'NO42.std', 'NOx42.std',
       'NOx504.std', 'O342.mean', 'O342.std', 'Pamb0.mean', 'Pamb0.std',
       'PTG.mean', 'PTG.std', 'RGlob.std', 'RHIRGA42.std', 'RHIRGA672.mean',
       'SO2168.mean', 'SO2168.std', 'SWS.mean', 'CS.mean', 'CS.std'],
      dtype='object')


In [None]:
# Retrain model with selected features
pipe_l1_selected = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', SVC(kernel='linear', C=1.0, probability=True, random_state=42))
])
pipe_l1_selected.fit(X[selected_features_svm], y)

# Evaluate the model
y_valid_pred = pipe_l1_selected.predict(X_valid[selected_features_svm])
print("Accuracy:", accuracy_score(y_valid, y_valid_pred))
print("AUC (using decision_function):", 
      roc_auc_score(y_valid, pipe_l1_selected.decision_function(X_valid[selected_features_svm])))

Accuracy: 0.9
AUC (using decision_function): 0.9466666666666665


In [149]:
# Predict on test set
df_test = pd.read_csv('test.csv')
X_test = df_test[selected_features_svm]

# Calibrated classifier for probability estimates
svm_l1_calibrated = CalibratedClassifierCV(estimator=pipe_l1_selected,method='sigmoid',cv=5)
svm_l1_calibrated.fit(X[selected_features_svm], y)
y_test_prob_svm = svm_l1_calibrated.predict_proba(X_test[selected_features_svm])[:, 1]

# Binary class hard prediction
y_test_pred_class2 = (y_test_prob_svm >= 0.5).astype(int)
class4_pred = np.where(y_test_pred_class2 == 1, 'Ia', 'nonevent')

submission_xgb = pd.DataFrame({
    'id': df_test['id'],
    'class4': class4_pred,
    'p': y_test_prob_svm
})

submission_xgb.to_csv('submission_svm.csv', index=False)