In [2]:
import numpy as np
import pandas as pd

In [None]:
pip install catboost

In [41]:
from sklearn.model_selection import StratifiedKFold     #k-fold cross-validation
from sklearn.feature_extraction.text import TfidfVectorizer #feature extraction module; converts raw documents (text) into a matrix of TF-IDF features
from sklearn.decomposition import TruncatedSVD              #decomposition module; used for dimensionality reduction of sparse data
from catboost import CatBoostClassifier, Pool               #gradient boosting library by Yandex that handles categorical features naturally. Pool is used to hold data and can be passed as an argument for training a CatBoost model
from catboost.utils import eval_metric                  #evaluation metrics for CatBoost models, such as accuracy, AUC (Area Under the Curve)
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings("ignore")

In [66]:
# Experiment settings shared by the cells below.
RAND_VAL = 32    # random_state of the shuffled StratifiedKFold split
num_folds = 20   # number of cross-validation folds
n_est = 500      # boosting iterations (estimators) per CatBoost model

In [None]:
# Read the training table from the working directory and preview its first rows.
df_train = pd.read_csv(filepath_or_buffer='train.csv')
df_train.head()

In [9]:
# Read the test table from the working directory and preview its first rows.
df_test = pd.read_csv(filepath_or_buffer='test.csv')
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [19]:
# Replace missing values with empty strings, column by column, in both tables
# (training table first, then the test table).
for frame in (df_train, df_test):
    for feat in frame.columns:
        frame[feat] = frame[feat].fillna('')

# Every column except the label is used as a model input.
feat_cols = df_train.columns.drop(['target'])
X = df_train[feat_cols]
y = df_train['target']

# Positional indices of all columns whose dtype is not float64; these indices
# are later handed to CatBoost as the categorical features.
# NOTE(review): per the recorded output this selects all four columns, so the
# integer `id` and the free-text `text` columns are treated as categories too
# - confirm that this is intended.
is_not_float = (X.dtypes != np.float64).to_numpy()
cat_features = np.flatnonzero(is_not_float)
print(cat_features) #everything is categorical

[0 1 2 3]


In [67]:
# Stratified K-fold cross-validation of a CatBoost classifier.
#
# For every fold a fresh model is trained on the training part, evaluated on
# the held-out part, and the probability cut-off that maximises F1 on that
# held-out part is recorded. Results are collected in `best_f1_scores` and
# `optimal_thresholds` (one entry per fold). After the loop, `clf` refers to
# the model of the LAST fold only.
folds = StratifiedKFold(n_splits=num_folds, random_state=RAND_VAL, shuffle=True)

# Candidate decision thresholds scanned on every validation fold
# (loop-invariant, so built once).
thresholds = np.linspace(0, 1, 101)

# Reset on every run of this cell so re-running does not accumulate entries.
best_f1_scores = []
optimal_thresholds = []

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, y)):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[valid_idx], y.iloc[valid_idx]

    train_pool = Pool(X_train, y_train, cat_features=cat_features)
    val_pool = Pool(X_val, y_val, cat_features=cat_features)

    # NOTE(review): with an eval_set, the recorded logs show the model being
    # shrunk to the best BalancedAccuracy iteration, which is sometimes
    # iteration 0 - consider a smoother metric such as 'Logloss' for model
    # selection.
    clf = CatBoostClassifier(
        eval_metric='BalancedAccuracy',  # or 'Logloss', 'Accuracy'
        custom_metric='AUC',  # was 'AOC', which is not a CatBoost metric name
        task_type='GPU',
        learning_rate=0.02,
        iterations=n_est
    )

    clf.fit(train_pool, eval_set=val_pool, verbose=300)

    # Predicted probability of the positive class on the held-out fold.
    y_pred_val_prob = clf.predict_proba(X_val)[:, 1]

    # Optimize the threshold for the best F1 score. This search must run:
    # the print below and the summary cell rely on its results.
    f1_scores = [
        f1_score(y_val, (y_pred_val_prob > thresh).astype(int))
        for thresh in thresholds
    ]
    best_idx = int(np.argmax(f1_scores))  # first maximum wins on ties
    best_thresh = thresholds[best_idx]
    best_f1 = f1_scores[best_idx]
    best_f1_scores.append(best_f1)
    optimal_thresholds.append(best_thresh)

    print(f"Best F1 Score for fold {n_fold}: {best_f1} at threshold {best_thresh}")

    print("----------------")


Default metric period is 5 because BalancedAccuracy is/are not implemented for GPU
Metric BalancedAccuracy is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.6990337	test: 0.7400809	best: 0.7400809 (0)	total: 78.2ms	remaining: 39s
300:	learn: 0.7020851	test: 0.7446190	best: 0.7469231 (240)	total: 9.61s	remaining: 6.35s
499:	learn: 0.7036588	test: 0.7461785	best: 0.7469231 (240)	total: 15s	remaining: 0us
bestTest = 0.7469231202
bestIteration = 240
Shrink model to first 241 iterations.
Best F1 Score for fold 0: 0.7549295774647887 at threshold 0.37
----------------


Default metric period is 5 because BalancedAccuracy is/are not implemented for GPU
Metric BalancedAccuracy is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.6964862	test: 0.7630522	best: 0.7630522 (0)	total: 34.4ms	remaining: 17.2s
300:	learn: 0.6999033	test: 0.7661009	best: 0.7661009 (5)	total: 10.6s	remaining: 7.03s
499:	learn: 0.7019953	test: 0.7745026	best: 0.7745026 (480)	total: 16.5s	remaining: 0us
bestTest = 0.7745026413
bestIteration = 480
Shrink model to first 481 iterations.
Best F1 Score for fold 1: 0.7572254335260116 at threshold 0.42
----------------


Default metric period is 5 because BalancedAccuracy is/are not implemented for GPU
Metric BalancedAccuracy is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.6996691	test: 0.6806648	best: 0.6806648 (0)	total: 114ms	remaining: 57.1s
300:	learn: 0.7078367	test: 0.6814095	best: 0.6814095 (285)	total: 8.43s	remaining: 5.57s
499:	learn: 0.7110197	test: 0.6828987	best: 0.6852029 (410)	total: 15s	remaining: 0us
bestTest = 0.6852028774
bestIteration = 410
Shrink model to first 411 iterations.
Best F1 Score for fold 2: 0.6857142857142858 at threshold 0.31
----------------


Default metric period is 5 because BalancedAccuracy is/are not implemented for GPU
Metric BalancedAccuracy is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.6955372	test: 0.6942790	best: 0.6942790 (0)	total: 48.9ms	remaining: 24.4s
300:	learn: 0.6987222	test: 0.7072890	best: 0.7080336 (210)	total: 7.45s	remaining: 4.93s
499:	learn: 0.7008622	test: 0.7095931	best: 0.7118973 (415)	total: 14.1s	remaining: 0us
bestTest = 0.7118972687
bestIteration = 415
Shrink model to first 416 iterations.
Best F1 Score for fold 3: 0.6997245179063362 at threshold 0.38
----------------


Default metric period is 5 because BalancedAccuracy is/are not implemented for GPU
Metric BalancedAccuracy is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.6989162	test: 0.6942087	best: 0.6942087 (0)	total: 31.9ms	remaining: 15.9s
300:	learn: 0.7059072	test: 0.6980021	best: 0.6980021 (265)	total: 9.81s	remaining: 6.48s
499:	learn: 0.7094038	test: 0.6964426	best: 0.6980021 (265)	total: 14.9s	remaining: 0us
bestTest = 0.6980021356
bestIteration = 265
Shrink model to first 266 iterations.
Best F1 Score for fold 4: 0.6984126984126984 at threshold 0.32
----------------


Default metric period is 5 because BalancedAccuracy is/are not implemented for GPU
Metric BalancedAccuracy is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.6965424	test: 0.7393363	best: 0.7393363 (0)	total: 34ms	remaining: 17s
300:	learn: 0.7016859	test: 0.7446892	best: 0.7446892 (175)	total: 10.2s	remaining: 6.74s
499:	learn: 0.7043422	test: 0.7400809	best: 0.7446892 (175)	total: 15.4s	remaining: 0us
bestTest = 0.7446892211
bestIteration = 175
Shrink model to first 176 iterations.
Best F1 Score for fold 5: 0.7044025157232705 at threshold 0.49
----------------


Default metric period is 5 because BalancedAccuracy is/are not implemented for GPU
Metric BalancedAccuracy is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.7015771	test: 0.7209734	best: 0.7209734 (0)	total: 31.9ms	remaining: 15.9s
300:	learn: 0.7082462	test: 0.7224626	best: 0.7352619 (5)	total: 9.06s	remaining: 5.99s
499:	learn: 0.7110635	test: 0.7224626	best: 0.7352619 (5)	total: 16.3s	remaining: 0us
bestTest = 0.735261886
bestIteration = 5
Shrink model to first 6 iterations.
Best F1 Score for fold 6: 0.7055393586005831 at threshold 0.5
----------------


Default metric period is 5 because BalancedAccuracy is/are not implemented for GPU
Metric BalancedAccuracy is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.6949853	test: 0.7484826	best: 0.7484826 (0)	total: 40.9ms	remaining: 20.4s
300:	learn: 0.7010943	test: 0.7469934	best: 0.7538356 (255)	total: 7.71s	remaining: 5.1s
499:	learn: 0.7020181	test: 0.7432000	best: 0.7538356 (255)	total: 14.9s	remaining: 0us
bestTest = 0.7538355625
bestIteration = 255
Shrink model to first 256 iterations.
Best F1 Score for fold 7: 0.7368421052631579 at threshold 0.32
----------------


Default metric period is 5 because BalancedAccuracy is/are not implemented for GPU
Metric BalancedAccuracy is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.6979970	test: 0.7438743	best: 0.7438743 (0)	total: 33.5ms	remaining: 16.7s
300:	learn: 0.7044030	test: 0.7461082	best: 0.7499719 (175)	total: 9.73s	remaining: 6.43s
499:	learn: 0.7058534	test: 0.7461082	best: 0.7499719 (175)	total: 17s	remaining: 0us
bestTest = 0.7499719006
bestIteration = 175
Shrink model to first 176 iterations.
Best F1 Score for fold 8: 0.7175572519083968 at threshold 0.3
----------------


Default metric period is 5 because BalancedAccuracy is/are not implemented for GPU
Metric BalancedAccuracy is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.6948161	test: 0.7240221	best: 0.7240221 (0)	total: 33.7ms	remaining: 16.8s
300:	learn: 0.7011613	test: 0.7209734	best: 0.7240221 (0)	total: 10.1s	remaining: 6.65s
499:	learn: 0.7022857	test: 0.7217180	best: 0.7240221 (0)	total: 16.9s	remaining: 0us
bestTest = 0.7240221423
bestIteration = 0
Shrink model to first 1 iterations.
Best F1 Score for fold 9: 0.68125 at threshold 0.5
----------------


Default metric period is 5 because BalancedAccuracy is/are not implemented for GPU
Metric BalancedAccuracy is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.6922470	test: 0.6909070	best: 0.6909070 (0)	total: 36.3ms	remaining: 18.1s
300:	learn: 0.7039327	test: 0.7045212	best: 0.7045212 (255)	total: 7.74s	remaining: 5.11s
499:	learn: 0.7044175	test: 0.6953748	best: 0.7075700 (330)	total: 14.9s	remaining: 0us
bestTest = 0.7075699674
bestIteration = 330
Shrink model to first 331 iterations.
Best F1 Score for fold 10: 0.6896551724137931 at threshold 0.25
----------------


Default metric period is 5 because BalancedAccuracy is/are not implemented for GPU
Metric BalancedAccuracy is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.7011200	test: 0.6874824	best: 0.6874824 (0)	total: 31.7ms	remaining: 15.8s
300:	learn: 0.7034562	test: 0.6897760	best: 0.6897760 (145)	total: 9.83s	remaining: 6.5s
499:	learn: 0.7073219	test: 0.6890021	best: 0.6897760 (145)	total: 15.1s	remaining: 0us
bestTest = 0.6897759892
bestIteration = 145
Shrink model to first 146 iterations.
Best F1 Score for fold 11: 0.6746411483253588 at threshold 0.25
----------------


Default metric period is 5 because BalancedAccuracy is/are not implemented for GPU
Metric BalancedAccuracy is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.6961538	test: 0.7203946	best: 0.7203946 (0)	total: 36.1ms	remaining: 18s
300:	learn: 0.7008984	test: 0.7280492	best: 0.7280492 (300)	total: 9.85s	remaining: 6.51s
499:	learn: 0.7036357	test: 0.7303428	best: 0.7303428 (360)	total: 14.8s	remaining: 0us
bestTest = 0.7303427703
bestIteration = 360
Shrink model to first 361 iterations.
Best F1 Score for fold 12: 0.7071240105540898 at threshold 0.3
----------------


Default metric period is 5 because BalancedAccuracy is/are not implemented for GPU
Metric BalancedAccuracy is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.6997786	test: 0.7069068	best: 0.7069068 (0)	total: 34ms	remaining: 17s
300:	learn: 0.7047569	test: 0.7046026	best: 0.7069068 (0)	total: 10.3s	remaining: 6.81s
499:	learn: 0.7082676	test: 0.7038393	best: 0.7069068 (0)	total: 17.4s	remaining: 0us
bestTest = 0.706906788
bestIteration = 0
Shrink model to first 1 iterations.
Best F1 Score for fold 13: 0.6513157894736842 at threshold 0.5
----------------


Default metric period is 5 because BalancedAccuracy is/are not implemented for GPU
Metric BalancedAccuracy is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.6900408	test: 0.7077125	best: 0.7077125 (0)	total: 31.8ms	remaining: 15.9s
300:	learn: 0.7029631	test: 0.7353199	best: 0.7353199 (230)	total: 8.27s	remaining: 5.47s
499:	learn: 0.7032408	test: 0.7360832	best: 0.7383874 (365)	total: 15.6s	remaining: 0us
bestTest = 0.7383873795
bestIteration = 365
Shrink model to first 366 iterations.
Best F1 Score for fold 14: 0.7028753993610223 at threshold 0.44
----------------


Default metric period is 5 because BalancedAccuracy is/are not implemented for GPU
Metric BalancedAccuracy is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.6979271	test: 0.7483249	best: 0.7483249 (0)	total: 35.6ms	remaining: 17.8s
300:	learn: 0.7031522	test: 0.7437166	best: 0.7483249 (0)	total: 10.2s	remaining: 6.78s
499:	learn: 0.7046839	test: 0.7437166	best: 0.7483249 (0)	total: 17.1s	remaining: 0us
bestTest = 0.7483248989
bestIteration = 0
Shrink model to first 1 iterations.
Best F1 Score for fold 15: 0.7070063694267515 at threshold 0.5
----------------


Default metric period is 5 because BalancedAccuracy is/are not implemented for GPU
Metric BalancedAccuracy is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.6948797	test: 0.7598315	best: 0.7598315 (0)	total: 36.2ms	remaining: 18s
300:	learn: 0.7014997	test: 0.7751689	best: 0.7751689 (255)	total: 10s	remaining: 6.62s
499:	learn: 0.7037872	test: 0.7782364	best: 0.7782364 (320)	total: 17.3s	remaining: 0us
bestTest = 0.7782364084
bestIteration = 320
Shrink model to first 321 iterations.
Best F1 Score for fold 16: 0.7469135802469135 at threshold 0.46
----------------


Default metric period is 5 because BalancedAccuracy is/are not implemented for GPU
Metric BalancedAccuracy is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.7022757	test: 0.7114585	best: 0.7114585 (0)	total: 32.4ms	remaining: 16.2s
300:	learn: 0.7057355	test: 0.7007577	best: 0.7114585 (0)	total: 8.54s	remaining: 5.64s
499:	learn: 0.7064957	test: 0.7068927	best: 0.7114585 (0)	total: 15.6s	remaining: 0us
bestTest = 0.7114585395
bestIteration = 0
Shrink model to first 1 iterations.
Best F1 Score for fold 17: 0.6727272727272727 at threshold 0.5
----------------


Default metric period is 5 because BalancedAccuracy is/are not implemented for GPU
Metric BalancedAccuracy is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.6917752	test: 0.7061576	best: 0.7061576 (0)	total: 34.8ms	remaining: 17.4s
300:	learn: 0.7006471	test: 0.7061293	best: 0.7207034 (10)	total: 10.4s	remaining: 6.85s
499:	learn: 0.7028223	test: 0.7053660	best: 0.7207034 (10)	total: 15.5s	remaining: 0us
bestTest = 0.7207034011
bestIteration = 10
Shrink model to first 11 iterations.
Best F1 Score for fold 18: 0.6940874035989717 at threshold 0.46
----------------


Default metric period is 5 because BalancedAccuracy is/are not implemented for GPU
Metric BalancedAccuracy is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.7003384	test: 0.6800204	best: 0.6800204 (0)	total: 31.6ms	remaining: 15.8s
300:	learn: 0.7038954	test: 0.6861695	best: 0.6907636 (245)	total: 10.3s	remaining: 6.81s
499:	learn: 0.7062270	test: 0.6861695	best: 0.6907636 (245)	total: 15.8s	remaining: 0us
bestTest = 0.6907636199
bestIteration = 245
Shrink model to first 246 iterations.
Best F1 Score for fold 19: 0.6931818181818182 at threshold 0.4
----------------


In [68]:
# Summarise the cross-validation results when per-fold values were recorded.
if best_f1_scores:
    average_f1_score = np.mean(best_f1_scores)
    print(f"Average F1 Score across folds: {average_f1_score}")

# Decision threshold for the test set: the mean of the per-fold optimal
# thresholds, or the neutral 0.5 cut-off when none were recorded.
if optimal_thresholds:
    average_best_thresh = float(np.mean(optimal_thresholds))
    print(f"Average Optimal Threshold across folds: {average_best_thresh}")
else:
    average_best_thresh = 0.5

# Predict on test data using the average optimal threshold.
# NOTE(review): `clf` is reassigned in every fold of the loop above, so these
# predictions come from the last fold's model only - confirm that is intended.
y_pred_test_prob = clf.predict_proba(df_test[feat_cols])[:, 1]
y_pred_test = (y_pred_test_prob > average_best_thresh).astype(int)



In [69]:
# Build the submission table: the test ids next to the predicted 0/1 labels.
prediction = df_test[['id']].assign(target=y_pred_test)

# Write the table without the index column.
# NOTE(review): the output path has no file extension - confirm that whatever
# consumes this file expects the name "predictions".
prediction.to_csv("predictions", index=False)
prediction.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
