In [1]:
# For running in a google colaborator notebook instance
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import os

from tqdm import tqdm
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.feature_extraction import text
from sklearn.metrics import classification_report, f1_score

In [0]:
PROJECT_DIR = '/content/drive/My Drive/Colab Notebooks/COMP550'
DATA_FILEPATH = os.path.join(PROJECT_DIR, 'metadata_articles_dataframe.pkl')

def create_dataframe_for_training(data):
    """Creates a dataframe for training by concatenating claimant, claim and article content and
    copying labels to a new dataframe"""
    feature_column_name = 'X'
    data_cp = data[['label']].copy()
    for i, row in tqdm(data.iterrows(), total=len(data)):
        all_features = f'{row.claimant} {row.claim} {row.article_content}'
        data_cp.loc[i, feature_column_name] = all_features

    return data_cp

In [4]:
data = pd.read_pickle(DATA_FILEPATH)
data_for_training = create_dataframe_for_training(data)
data_for_training.head()

100%|██████████| 15555/15555 [01:04<00:00, 241.78it/s]


Unnamed: 0_level_0,label,X
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,A line from George Orwell's novel 1984 predic...
1,2,Maine legislature candidate Leslie Gibson ins...
4,1,A 17-year-old girl named Alyssa Carson is bei...
5,2,In 1988 author Roald Dahl penned an open lett...
6,2,Hillary Clinton When it comes to fighting terr...


In [0]:
content = data_for_training['X'].values
y = data_for_training['label'].values

feature_pipeline = Pipeline([
                             ('vect', text.TfidfVectorizer()), 
                             ('norm', preprocessing.Normalizer())
])
X = feature_pipeline.fit_transform(content)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, 
                                                    stratify=y)

In [0]:
# TODO fix for gpu
model = XGBClassifier(silent=False,
                      scale_pos_weight=1,
                      learning_rate=0.01,
                      colsample_bytree = 0.4,
                      subsample = 0.8,
                      objective='multi:softmax',
                      n_estimators=1000,
                      reg_alpha = 0.3,
                      max_depth=4,
                      gamma=5,
                    #   tree_method='gpu_hist',
                    #   gpu_id=0,
                      num_class=3,
                      n_jobs=2,
                      random_state=42,
                      verbosity=1)

eval_set = [(X_train, y_train), (X_test, y_test)]
eval_metric = ["auc","error"]

search_params = {
    'n_estimators': [100, 200],
    'max_depth': [3],
    # 'reg_alpha': [0.3, 0.6],
    'reg_alpha': [0.6],
    'gamma': [1]
}

In [16]:
grid_cv = GridSearchCV(model, search_params, cv=2, verbose=10, n_jobs=2)
%time grid_cv.fit(X_train, y_train)
print(grid_cv.best_params_)

# best reg_alpha = 0.6

Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:  5.9min
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed: 11.9min
[Parallel(n_jobs=2)]: Done   6 out of   8 | elapsed: 23.4min remaining:  7.8min
[Parallel(n_jobs=2)]: Done   8 out of   8 | elapsed: 34.7min remaining:    0.0s
[Parallel(n_jobs=2)]: Done   8 out of   8 | elapsed: 34.7min finished


CPU times: user 18min 16s, sys: 1.7 s, total: 18min 18s
Wall time: 44min 10s
{'gamma': 1, 'max_depth': 3, 'n_estimators': 200, 'reg_alpha': 0.6}


In [0]:
model.set_params(**grid_cv.best_params_)

In [0]:
%time model.fit(X_train, y_train, eval_metric=eval_metric, verbose=True)

CPU times: user 1h 28min 40s, sys: 2.07 s, total: 1h 28min 42s
Wall time: 45min 4s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.4, gamma=5,
              learning_rate=0.01, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=None, n_estimators=1000, n_jobs=-1,
              nthread=None, num_class=3, objective='multi:softprob',
              random_state=42, reg_alpha=0.3, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=False, subsample=0.8, verbosity=1)

In [17]:
preds = grid_cv.predict(X_test)
# preds = model.predict(X_test)
print(f'{classification_report(y_test, preds)}')
#{'gamma': 1, 'max_depth': 3, 'n_estimators': 200, 'reg_alpha': 0.6}

              precision    recall  f1-score   support

           0       0.63      0.71      0.66      1112
           1       0.58      0.64      0.61       968
           2       1.00      0.01      0.02       254

    accuracy                           0.60      2334
   macro avg       0.73      0.45      0.43      2334
weighted avg       0.65      0.60      0.57      2334



In [0]:
# TODO fix for gpu
model = XGBClassifier(silent=False,
                      scale_pos_weight=1,
                      learning_rate=0.01,
                      colsample_bytree = 0.4,
                      subsample = 0.8,
                      objective='multi:softmax',
                      n_estimators=500,
                      reg_alpha = 0.6,
                      max_depth=3,
                      gamma=51,
                    #   tree_method='gpu_hist',
                    #   gpu_id=0,
                      num_class=3,
                      n_jobs=2,
                      random_state=42,
                      verbosity=1)

In [19]:
%time model.fit(X_train, y_train, eval_metric=eval_metric, verbose=True)

CPU times: user 46min 40s, sys: 1.76 s, total: 46min 42s
Wall time: 24min


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.4, gamma=51,
              learning_rate=0.01, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=500, n_jobs=2,
              nthread=None, num_class=3, objective='multi:softprob',
              random_state=42, reg_alpha=0.6, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=False, subsample=0.8, verbosity=1)

In [23]:
preds = model.predict(X_test)
f1_score(y_test, preds, average='weighted')

  'precision', 'predicted', average, warn_for)


0.5605096948832078