# Adaboost

### Import dependencies

In [1]:
import os
import sys
import csv

import pandas as pd
# Pandas display settings used when frames are rendered in this notebook.
# Disabled: remove the cap on the number of rows shown.
#pd.set_option('display.max_rows', None)
# Disabled: custom float formatting.
# NOTE(review): the spec '{:, .2f}' looks invalid (space after the comma);
# it would presumably need to be '{:,.2f}' if re-enabled -- verify.
# pd.options.display.float_format = '{:, .2f}'.format
# Show up to 500 characters per cell and up to 100 columns.
pd.set_option('display.max_colwidth',500)
pd.set_option('display.max_columns', 100)

import numpy as np
from numpy import save, load
from numpy import savez_compressed
from scipy.sparse import csr_matrix
from scipy.sparse import vstack
import copy
import pickle

#from scipy.misc import comb, logsumexp
from sklearn.manifold import TSNE #a tool to visualize high dimensional data
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD # dimensionality reduction using truncated SVD (AKA LSA)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

import xgboost as xgb
from sklearn.model_selection import GridSearchCV

import nltk
from nltk.tokenize import word_tokenize
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.corpus import gutenberg
from nltk.collocations import *
import string #python module
import re # python regex module
from nltk.tokenize import RegexpTokenizer
from nltk.stem import SnowballStemmer
from nltk.tokenize import sent_tokenize

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

# Seed NumPy's global random number generator once, up front, for repeatability.
# NOTE(review): estimators created later without an explicit random_state
# presumably draw from this global stream, which makes their results depend on
# cell execution order -- confirm against the scikit-learn documentation.
np.random.seed(0)

from sklearn.preprocessing import normalize
from functools import reduce

Switched from pickle to npz for storing the model data after finding that npz save/load times are, on average, shorter than pickle's.

In [2]:
# Load the target labels and the feature matrix from their .npz archives.
#
# np.load on an .npz file returns an archive object that keeps the file open,
# so each archive is opened in a `with` block and closed as soon as its array
# has been read. Both arrays are read from the key 'arr_0'.

with np.load('model_target_data.npz') as target_archive:
    # Flatten the labels to 1-D.
    target_y = np.ravel(target_archive['arr_0'])
print(target_y.shape)

with np.load('model_data.npz') as features_archive:
    features_x = features_archive['arr_0']
print(features_x.shape)

# Every feature row needs exactly one label; fail here rather than deep inside
# a multi-hour training run.
assert features_x.shape[0] == target_y.shape[0], (
    "feature rows (%d) and target labels (%d) do not line up"
    % (features_x.shape[0], target_y.shape[0])
)

(75385,)
(75385, 836)


In [5]:
# Print the on-disk size in bytes of each archive: targets first, then features.
for npz_path in ('model_target_data.npz', 'model_data.npz'):
    print(os.path.getsize(npz_path))

25773
463657257


## Model 1

In [9]:
%%time
dtClf = DecisionTreeClassifier(max_depth = 1)
Ada_clf_1 = AdaBoostClassifier(base_estimator = dtClf, n_estimators = 100, learning_rate = 1.0 ) # n_estimators = number of weak learners/trees in the forest of trees
kfold = StratifiedKFold(n_splits=10)
Ada_clf_1_scores = cross_val_predict(Ada_clf_1, features_x, target_y, cv=kfold)

CPU times: user 1h 5min 55s, sys: 1min 30s, total: 1h 7min 25s
Wall time: 1h 7min 26s


In [12]:
## Score Model 1 with the metric provided by the FNC-1 challenge
## (https://github.com/FakeNewsChallenge/fnc-1).
from score import report_score, LABELS, score_submission

# Map integer class ids through LABELS for both the truth and the predictions.
actual = [LABELS[int(label_id)] for label_id in target_y]
predicted = [LABELS[int(label_id)] for label_id in Ada_clf_1_scores]

# score_submission returns a pair; only its first item is used here.
# Scoring the truth against itself yields the denominator for the ratio.
fold_score, _ = score_submission(actual, predicted)
max_fold_score, _ = score_submission(actual, actual)
score = fold_score / max_fold_score

# Keep this run as the best one if it beats the starting value of 0.
best_score, best_fold = 0, None
if score > best_score:
    best_score, best_fold = score, Ada_clf_1_scores

In [13]:
# Print the confusion matrix and overall score for Model 1; the returned
# percentage is shown as the cell output.
report_score(actual, predicted)

-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |   2269    |    814    |   2341    |    157    |
-------------------------------------------------------------
| disagree  |    399    |    527    |    575    |    36     |
-------------------------------------------------------------
|  discuss  |   1957    |    797    |   9971    |    648    |
-------------------------------------------------------------
| unrelated |    25     |    16     |    398    |   54455   |
-------------------------------------------------------------
Score: 28101.5 out of 34214.5	(82.1333060544506%)


82.1333060544506

## Model 2

In [8]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# Model 2: reduce the features to 27 principal components, then fit AdaBoost
# with its default base estimator.
pipe_ab = Pipeline(steps=[
    ('pca', PCA(n_components=27)),
    ('clf', AdaBoostClassifier(random_state=123)),
])

# Candidate hyper-parameters; the 'clf__' prefix addresses the pipeline's
# 'clf' step. 3 x 3 values give 9 candidates.
adaboost_param_grid = dict(
    clf__n_estimators=[150, 200, 250],
    clf__learning_rate=[1.0, 0.5, 0.1],
)

# Exhaustive search over the grid, ranked by accuracy with 10-fold CV.
gs_ab = GridSearchCV(
    estimator=pipe_ab,
    param_grid=adaboost_param_grid,
    scoring='accuracy',
    cv=10,
    return_train_score=True,
    verbose=2,
)

# Run the search; the fitted search object is the cell's displayed value.
gs_ab.fit(features_x, target_y)


Fitting 10 folds for each of 9 candidates, totalling 90 fits
[CV] clf__learning_rate=1.0, clf__n_estimators=150 ...................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .... clf__learning_rate=1.0, clf__n_estimators=150, total= 1.0min
[CV] clf__learning_rate=1.0, clf__n_estimators=150 ...................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.1min remaining:    0.0s


[CV] .... clf__learning_rate=1.0, clf__n_estimators=150, total= 1.0min
[CV] clf__learning_rate=1.0, clf__n_estimators=150 ...................
[CV] .... clf__learning_rate=1.0, clf__n_estimators=150, total= 1.0min
[CV] clf__learning_rate=1.0, clf__n_estimators=150 ...................
[CV] .... clf__learning_rate=1.0, clf__n_estimators=150, total= 1.0min
[CV] clf__learning_rate=1.0, clf__n_estimators=150 ...................
[CV] .... clf__learning_rate=1.0, clf__n_estimators=150, total= 1.0min
[CV] clf__learning_rate=1.0, clf__n_estimators=150 ...................
[CV] .... clf__learning_rate=1.0, clf__n_estimators=150, total= 1.0min
[CV] clf__learning_rate=1.0, clf__n_estimators=150 ...................
[CV] .... clf__learning_rate=1.0, clf__n_estimators=150, total= 1.0min
[CV] clf__learning_rate=1.0, clf__n_estimators=150 ...................
[CV] .... clf__learning_rate=1.0, clf__n_estimators=150, total= 1.0min
[CV] clf__learning_rate=1.0, clf__n_estimators=150 ...................
[CV] .

[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed: 129.1min finished


GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('pca',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=27, random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('clf',
                                        AdaBoostClassifier(algorithm='SAMME.R',
                                                           base_estimator=None,
                                                           learning_rate=1.0,
                                                           n_estimators=50,
                                                           random_state=123))],
                                verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'clf__learning_rate': [

In [11]:
# Display the pipeline configuration that the grid search selected as best.
gs_ab.best_estimator_

Pipeline(memory=None,
         steps=[('pca',
                 PCA(copy=True, iterated_power='auto', n_components=27,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('clf',
                 AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
                                    learning_rate=0.5, n_estimators=200,
                                    random_state=123))],
         verbose=False)

In [20]:
# Report the best cross-validated score with the hyper-parameters that
# produced it. The values are read from the fitted search object instead of
# being typed into the message, so the text stays correct if the grid or the
# data changes.
best_params = gs_ab.best_params_
print("GridSearchcv found best score of " + str(gs_ab.best_score_)
      + " with a learning rate of " + str(best_params['clf__learning_rate'])
      + " and " + str(best_params['clf__n_estimators']) + " trees")

GridSearchcv found best score of 0.8991577167438966 with a learning rate of 0.5 and 200 trees


## Model 3

The base decision trees use a max depth of 6, similar to the xgboost model.

In [3]:
%%time
dtClf_3 = DecisionTreeClassifier(max_depth = 6)
Ada_clf_3 = AdaBoostClassifier(base_estimator = dtClf_3, n_estimators = 100, learning_rate = 1.0 ) # n_estimators = number of weak learners/trees in the forest of trees
kfold = StratifiedKFold(n_splits=10)
Ada_clf_3_scores = cross_val_predict(Ada_clf_3, features_x, target_y, cv=kfold)

CPU times: user 7h 24min, sys: 1min 35s, total: 7h 25min 35s
Wall time: 7h 25min 40s


In [6]:
## Score Model 3 with the metric provided by the FNC-1 challenge
## (https://github.com/FakeNewsChallenge/fnc-1).
from score import report_score, LABELS, score_submission

# Map integer class ids through LABELS for both the predictions and the truth.
predicted_3 = [LABELS[int(a)] for a in Ada_clf_3_scores]
actual_3 = [LABELS[int(a)] for a in target_y]

# Score this model's own lists. Passing Model 1's `actual`/`predicted` here
# would make score_3 repeat Model 1's result, or raise NameError when the
# Model 1 cells have not been run in the current kernel.
fold_score_3, _3 = score_submission(actual_3, predicted_3)
# Scoring the truth against itself yields the denominator for the ratio.
max_fold_score_3, _3 = score_submission(actual_3, actual_3)
score_3 = fold_score_3/max_fold_score_3

best_score_3 = 0
best_fold_3 = None

# Keep this run as the best one if it beats the starting value of 0.
if score_3 > best_score_3:
    best_score_3 = score_3
    best_fold_3 = Ada_clf_3_scores

In [7]:
# Print the confusion matrix and overall score for Model 3; the returned
# percentage is shown as the cell output.
report_score(actual_3, predicted_3)

-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |   3728    |    218    |   1577    |    58     |
-------------------------------------------------------------
| disagree  |    547    |    565    |    395    |    30     |
-------------------------------------------------------------
|  discuss  |   1177    |    75     |   11915   |    206    |
-------------------------------------------------------------
| unrelated |    39     |     1     |    510    |   54344   |
-------------------------------------------------------------
Score: 30791.25 out of 34214.5	(89.99473907261542%)


89.99473907261542