In [1]:
import numpy as np
import pandas as pd

In [2]:
pip install catboost

Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [3]:
from sklearn.model_selection import StratifiedKFold     #k-fold cross-validation
from sklearn.feature_extraction.text import TfidfVectorizer #feature extraction module; converts raw documents (text) into a matrix of TF-IDF features
from sklearn.decomposition import TruncatedSVD              #decomposition module; used for dimensionality reduction of sparse data
from catboost import CatBoostClassifier, Pool               #gradient boosting library by Yandex that handles categorical features naturally. Pool is used to hold data and can be passed as an argument for training a CatBoost model
from catboost.utils import eval_metric                  #evaluation metrics for CatBoost models, such as accuracy, AUC (Area Under the Curve)
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings("ignore")

In [4]:
RAND_VAL=32
num_folds=20 ## Number of folds
n_est=500 ## Number of estimators

In [6]:
df_train = pd.read_csv('train.csv')
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [8]:
df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [13]:
for feat in df_train.columns:
  df_train[feat] = df_train[feat].fillna('')
for feat in df_test.columns:
  df_test[feat] = df_test[feat].fillna('')
feat_cols=df_train.columns.drop(['target'])
X=df_train[feat_cols]
y=df_train['target']
cat_features = np.where(X.dtypes != np.float64)[0]
print(cat_features) #everything is categorical

[0 1 2 3]


In [16]:
folds = StratifiedKFold(n_splits=num_folds, random_state=RAND_VAL, shuffle=True)

#best_f1_scores = []
optimal_thresholds = []

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, y)):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[valid_idx], y.iloc[valid_idx]

    train_pool = Pool(X_train, y_train, cat_features=cat_features)
    val_pool = Pool(X_val, y_val, cat_features=cat_features)

    clf = CatBoostClassifier(
        eval_metric='BalancedAccuracy',  # or 'Logloss', 'Accuracy'
        custom_metric='AUC',
        #task_type='GPU',
        learning_rate=0.02,
        iterations=n_est
    )

    clf.fit(train_pool, eval_set=val_pool, verbose=300)

    # Predict probabilities
    y_pred_val_prob = clf.predict_proba(X_val)[:, 1]

    # Optimize the threshold for the best F1 score
    #thresholds = np.linspace(0, 1, 101)
    #f1_scores = [f1_score(y_val, y_pred_val_prob > thresh) for thresh in thresholds]
    #best_thresh = thresholds[np.argmax(f1_scores)]
    #best_f1 = max(f1_scores)
    #best_f1_scores.append(best_f1)
    #optimal_thresholds.append(best_thresh)

    #print(f"Best F1 Score for fold {n_fold}: {best_f1} at threshold {best_thresh}")

    print("----------------")


0:	learn: 0.6994607	test: 0.7408958	best: 0.7408958 (0)	total: 10.7ms	remaining: 5.36s
300:	learn: 0.7027665	test: 0.7454339	best: 0.7507868 (118)	total: 3.21s	remaining: 2.12s
499:	learn: 0.7057029	test: 0.7492273	best: 0.7507868 (118)	total: 4.9s	remaining: 0us

bestTest = 0.7507867821
bestIteration = 118

Shrink model to first 119 iterations.
----------------
0:	learn: 0.6979742	test: 0.7630522	best: 0.7630522 (0)	total: 10.8ms	remaining: 5.37s
300:	learn: 0.7027536	test: 0.7805300	best: 0.7805300 (112)	total: 2.58s	remaining: 1.71s
499:	learn: 0.7056482	test: 0.7782258	best: 0.7805300 (112)	total: 5.03s	remaining: 0us

bestTest = 0.7805299539
bestIteration = 112

Shrink model to first 113 iterations.
----------------
0:	learn: 0.7020102	test: 0.6860880	best: 0.6860880 (0)	total: 18.8ms	remaining: 9.39s
300:	learn: 0.7082798	test: 0.6783607	best: 0.6981426 (1)	total: 3.16s	remaining: 2.09s
499:	learn: 0.7109737	test: 0.6852029	best: 0.6981426 (1)	total: 4.81s	remaining: 0us

bestTes

In [17]:
#average_f1_score = np.mean(best_f1_scores)
#average_best_thresh = np.mean(optimal_thresholds)
#print(f"Average F1 Score across folds: {average_f1_score}")
#print(f"Average Optimal Threshold across folds: {average_best_thresh}")

# Predict on test data using the average optimal threshold
y_pred_test_prob = clf.predict_proba(df_test[feat_cols])[:, 1]
y_pred_test = (y_pred_test_prob > 0.5).astype(int)



In [20]:
prediction = pd.DataFrame({
    'id': df_test['id'],
    'target': y_pred_test
})

prediction.to_csv("predictions",index = False)
prediction.head()
prediction['target'].value_counts()

target
0    1916
1    1347
Name: count, dtype: int64