In [28]:
from itertools import product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    fbeta_score,
    make_scorer,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from xgboost import XGBClassifier

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so pandas / scikit-learn / XGBoost
# deprecation and data-handling warnings are hidden as well.
warnings.simplefilter("ignore")

# Seed passed as `random_state` to the train/test split and to the XGBoost
# estimators below.
RSEED = 42

In [2]:
# Explicit column -> dtype mapping handed to `pd.read_csv` when loading
# 'data_clean.csv'. Columns that are not listed here keep whatever dtype
# pandas infers.
dtypes = dict(
    country='category',
    goal='float64',
    blurb='str',
    name='str',
    main_category='category',
    sub_category='category',
    location_type='category',
    duration='float64',
    deadline_month='category',
    deadline_day='category',
    launched_at_month='category',
    launched_at_day='category',
    target='int64',
    baseline='category',
)

In [3]:
# Load the cleaned dataset from 'data_clean.csv', applying the explicit
# per-column dtypes declared in `dtypes`.
df = pd.read_csv('data_clean.csv', dtype=dtypes)

In [3]:
# Load the pre-processed text-feature table; no dtype mapping is given, so
# pandas infers the column types.
# NOTE(review): in the cells shown here `df_text` is only inspected and given a
# `target` column; it is never passed to a model.
df_text = pd.read_csv('./data_text_processed_ngram_2_7.csv')

In [4]:
# Print a structural summary of `df_text` (row range, column count, dtype
# counts and memory usage).
df_text.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177522 entries, 0 to 177521
Columns: 115 entries, blurb to year old girl
dtypes: float64(101), int64(7), object(7)
memory usage: 155.8+ MB


In [9]:
# Copy the label column from `df` onto the text-feature frame. The assignment
# aligns on the row index, so this presumably relies on both CSV files listing
# the same rows in the same order — TODO confirm.
df_text['target'] = df['target']

In [12]:
# Remove the 'state', 'blurb', 'name' and 'baseline' columns before modelling.
# The result is rebound instead of mutating `df` in place, and
# `errors='ignore'` skips columns that are already gone, so re-running this
# cell is a no-op rather than a KeyError.
df = df.drop(columns=['state', 'blurb', 'name', 'baseline'], errors='ignore')

In [13]:
# Convert every column that is still stored with the generic `object` dtype
# into a pandas `category` column; columns of any other dtype are left alone.
for column, column_dtype in df.dtypes.items():
    if column_dtype.name != 'object':
        continue
    df[column] = df[column].astype('category')

In [30]:
# Replace `goal` with its natural logarithm (np.log).
# NOTE: the column is overwritten with a function of itself, so running this
# cell a second time takes the log of the log; re-load `df` before re-running.
# NOTE(review): np.log gives -inf for 0 and NaN for negative input — presumably
# every goal is strictly positive; confirm.
df['goal'] = np.log(df['goal'])

In [31]:
# Separate the label from the feature columns.
y = df['target']
X = df.drop('target', axis=1)

# Hold out 20% of the rows as the test set; the split is seeded with RSEED so
# it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RSEED,
)

In [32]:
# Baseline classifier: histogram tree method with categorical-column support
# switched on; every other hyperparameter is left at the library default.
xgb = XGBClassifier(
    random_state=RSEED,
    tree_method="hist",
    enable_categorical=True,
    use_label_encoder=False,
)

# Train on the training split only (last expression, so the fitted estimator
# is also displayed as the cell output).
xgb.fit(X_train, y_train)

In [33]:
# Predict the held-out split with the baseline model.
y_pred = xgb.predict(X_test)

# Round every prediction to the nearest integer and collect them in a list.
# NOTE(review): presumably `predict` already returns integer class labels, in
# which case the rounding is a no-op — confirm.
predictions = list(map(round, y_pred))


In [34]:
# Report the confusion matrix and the per-class precision / recall / F1 of the
# current `predictions` against the held-out labels.
print(
    "Confusion Matrix:",
    confusion_matrix(y_test, predictions),
    "Classification Report:",
    classification_report(y_test, predictions),
    sep="\n",
)


Confusion Matrix:
[[13228  3466]
 [ 4751 14060]]
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.79      0.76     16694
           1       0.80      0.75      0.77     18811

    accuracy                           0.77     35505
   macro avg       0.77      0.77      0.77     35505
weighted avg       0.77      0.77      0.77     35505



In [35]:
# F-beta score of the current `predictions` on the held-out labels, with
# beta=1.2. `fbeta_score` is imported with the other scikit-learn metrics in
# the notebook's import cell, so this cell carries no import of its own.
print("F1 beta score:")
print(fbeta_score(y_test, predictions, beta=1.2))

F1 beta score:
0.7689631737595329


In [36]:
# Model-selection metric: F-beta with beta=1.2.
scorer = make_scorer(fbeta_score, beta=1.2)

# Hyperparameter grid: 3 values for each of 5 parameters.
grid_params = dict(
    max_depth=[5, 8, 3],
    learning_rate=[0.1, 0.01, 0.2],
    gamma=[0, 0.25, 1.0],
    reg_lambda=[0, 1.0, 10.0],
    scale_pos_weight=[1, 3, 0.5],
)

# Exhaustive search over the grid with 3-fold cross-validation, using every
# available core. The estimator is configured exactly like the baseline model.
gs = GridSearchCV(
    estimator=XGBClassifier(
        random_state=RSEED,
        tree_method="hist",
        enable_categorical=True,
        use_label_encoder=False,
    ),
    param_grid=grid_params,
    scoring=scorer,
    cv=3,
    n_jobs=-1,
    verbose=1,
)


In [37]:
# Run the grid search on the training split only; the test split is not
# involved in model selection. `fit` returns the fitted search object, which
# is kept under the name `gs_results`.
gs_results = gs.fit(X_train, y_train)

Fitting 3 folds for each of 243 candidates, totalling 729 fits


In [38]:
# Show the parameter combination with the best mean cross-validated score.
gs_results.best_params_

{'gamma': 1.0,
 'learning_rate': 0.1,
 'max_depth': 8,
 'reg_lambda': 10.0,
 'scale_pos_weight': 3}

In [39]:
# Estimator built with the best parameters found; no `refit` argument was
# passed to GridSearchCV, so its default refit behaviour applies.
best_model = gs_results.best_estimator_

In [40]:
# Predict the held-out split with the estimator selected by the grid search.
# This rebinds `y_pred` and `predictions`, replacing the baseline model's.
y_pred = best_model.predict(X_test)

# Round every prediction to the nearest integer and collect them in a list.
# NOTE(review): presumably `predict` already returns integer class labels, in
# which case the rounding is a no-op — confirm.
predictions = list(map(round, y_pred))

In [49]:

# Persist the held-out feature columns. `to_csv` only reads the frame, so no
# defensive copy is needed. The row index is not written (index=False) and
# `X_test` carries no `target` column, so the file contains features only.
X_test.to_csv('test_data.csv', index=False)

In [48]:
# Positions (not index labels) of the test rows whose current prediction in
# `y_pred` differs from the true label.
misclassified = np.where(y_pred != y_test)[0]

# Select those rows and attach their true label in a single step. `.assign`
# returns a new frame, which avoids writing a column into a slice of `X_test`
# (the chained-assignment pattern behind SettingWithCopyWarning). Both
# selections use the same positions, so their indexes line up.
misclassified_df = X_test.iloc[misclassified].assign(
    target=y_test.iloc[misclassified]
)

# Save the misclassified rows; the row index is not written.
misclassified_df.to_csv('misclassified.csv', index=False)

In [41]:
# Compute both summaries of the current `predictions` first, then print each
# one under its heading.
matrix = confusion_matrix(y_test, predictions)
report = classification_report(y_test, predictions)

print("Confusion Matrix:")
print(matrix)
print("Classification Report:")
print(report)

Confusion Matrix:
[[ 8305  8389]
 [ 1557 17254]]
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.50      0.63     16694
           1       0.67      0.92      0.78     18811

    accuracy                           0.72     35505
   macro avg       0.76      0.71      0.70     35505
weighted avg       0.75      0.72      0.71     35505



In [44]:
# F-beta score (beta=1.2) of the current `predictions` on the held-out labels,
# printed on the line after its heading.
print(
    "F1 beta score:",
    fbeta_score(y_test, predictions, beta=1.2),
    sep="\n",
)


F1 beta score:
0.798389708944519
