## Feature Selection

In [135]:
import os
import re
import nltk
import numpy as np
import pandas as pd
from sklearn import feature_extraction
from tqdm import tqdm

In [78]:
_wnl = nltk.WordNetLemmatizer()


def normalize_word(w):
    return _wnl.lemmatize(w).lower()


def get_tokenized_lemmas(s):
    return [normalize_word(t) for t in nltk.word_tokenize(s)]


def clean(s):
    # Cleans a string: Lowercasing, trimming, removing non-alphanumeric

    return " ".join(re.findall(r'\w+', s, flags=re.UNICODE)).lower()


def remove_stopwords(l):
    # Removes stopwords from a list of tokens
    return [w for w in l if w not in feature_extraction.text.ENGLISH_STOP_WORDS]


def gen_or_load_feats(feat_fn, headlines, bodies, feature_file):
    if not os.path.isfile(feature_file):
        feats = feat_fn(headlines, bodies)
        np.save(feature_file, feats)

    return np.load(feature_file)

In [79]:
def word_overlap_features(headlines, bodies):
    X = []
    for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
        clean_headline = clean(headline)
        clean_body = clean(body)
        clean_headline = get_tokenized_lemmas(clean_headline)
        clean_body = get_tokenized_lemmas(clean_body)
        features = [
            len(set(clean_headline).intersection(clean_body)) / float(len(set(clean_headline).union(clean_body)))]
        X.append(features)
    return X

In [80]:

def refuting_features(headlines, bodies):
    _refuting_words = [
        'fake',
        'fraud',
        'hoax',
        'false',
        'deny', 'denies',
        # 'refute',
        'not',
        'despite',
        'nope',
        'doubt', 'doubts',
        'bogus',
        'debunk',
        'pranks',
        'retract'
    ]
    X = []
    for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
        clean_headline = clean(headline)
        clean_headline = get_tokenized_lemmas(clean_headline)
        features = [1 if word in clean_headline else 0 for word in _refuting_words]
        X.append(features)
    return X

In [81]:
def polarity_features(headlines, bodies):
    _refuting_words = [
        'fake',
        'fraud',
        'hoax',
        'false',
        'deny', 'denies',
        'not',
        'despite',
        'nope',
        'doubt', 'doubts',
        'bogus',
        'debunk',
        'pranks',
        'retract'
    ]

    def calculate_polarity(text):
        tokens = get_tokenized_lemmas(text)
        return sum([t in _refuting_words for t in tokens]) % 2
    X = []
    for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
        clean_headline = clean(headline)
        clean_body = clean(body)
        features = []
        features.append(calculate_polarity(clean_headline))
        features.append(calculate_polarity(clean_body))
        X.append(features)
    return np.array(X)

In [82]:

def ngrams(input, n):
    input = input.split(' ')
    output = []
    for i in range(len(input) - n + 1):
        output.append(input[i:i + n])
    return output


def chargrams(input, n):
    output = []
    for i in range(len(input) - n + 1):
        output.append(input[i:i + n])
    return output

In [83]:

def append_chargrams(features, text_headline, text_body, size):
    grams = [' '.join(x) for x in chargrams(" ".join(remove_stopwords(text_headline.split())), size)]
    grams_hits = 0
    grams_early_hits = 0
    grams_first_hits = 0
    for gram in grams:
        if gram in text_body:
            grams_hits += 1
        if gram in text_body[:255]:
            grams_early_hits += 1
        if gram in text_body[:100]:
            grams_first_hits += 1
    features.append(grams_hits)
    features.append(grams_early_hits)
    features.append(grams_first_hits)
    return features


def append_ngrams(features, text_headline, text_body, size):
    grams = [' '.join(x) for x in ngrams(text_headline, size)]
    grams_hits = 0
    grams_early_hits = 0
    for gram in grams:
        if gram in text_body:
            grams_hits += 1
        if gram in text_body[:255]:
            grams_early_hits += 1
    features.append(grams_hits)
    features.append(grams_early_hits)
    return features

In [84]:

def hand_features(headlines, bodies):

    def binary_co_occurence(headline, body):
        # Count how many times a token in the title
        # appears in the body text.
        bin_count = 0
        bin_count_early = 0
        for headline_token in clean(headline).split(" "):
            if headline_token in clean(body):
                bin_count += 1
            if headline_token in clean(body)[:255]:
                bin_count_early += 1
        return [bin_count, bin_count_early]

    def binary_co_occurence_stops(headline, body):
        # Count how many times a token in the title
        # appears in the body text. Stopwords in the title
        # are ignored.
        bin_count = 0
        bin_count_early = 0
        for headline_token in remove_stopwords(clean(headline).split(" ")):
            if headline_token in clean(body):
                bin_count += 1
                bin_count_early += 1
        return [bin_count, bin_count_early]

    def count_grams(headline, body):
        # Count how many times an n-gram of the title
        # appears in the entire body, and intro paragraph

        clean_body = clean(body)
        clean_headline = clean(headline)
        features = []
        features = append_chargrams(features, clean_headline, clean_body, 2)
        features = append_chargrams(features, clean_headline, clean_body, 8)
        features = append_chargrams(features, clean_headline, clean_body, 4)
        features = append_chargrams(features, clean_headline, clean_body, 16)
        features = append_ngrams(features, clean_headline, clean_body, 2)
        features = append_ngrams(features, clean_headline, clean_body, 3)
        features = append_ngrams(features, clean_headline, clean_body, 4)
        features = append_ngrams(features, clean_headline, clean_body, 5)
        features = append_ngrams(features, clean_headline, clean_body, 6)
        return features

    X = []
    for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
        X.append(binary_co_occurence(headline, body)
                 + binary_co_occurence_stops(headline, body)
                 + count_grams(headline, body))


    return X

## Fake News Challenge

In [85]:
import sys
import numpy as np
import random
import re
import argparse

from collections import defaultdict
from sklearn.ensemble import GradientBoostingClassifier
from csv import DictReader

#Import from other files
from utils.dataset import DataSet
from utils.generate_test_splits import kfold_split, get_stances_for_folds
from utils.score import report_score, LABELS, score_submission
from utils.system import parse_params, check_version

### Reading Dataset

In [86]:
class DataSet():
    def __init__(self, name="train", path="fnc-1"):
        self.path = path

        print("Reading dataset")
        bodies = name+"_bodies.csv"
        stances = name+"_stances.csv"

        self.stances = self.read(stances)
        
        articles = self.read(bodies)
        self.articles = dict()

        #make the body ID an integer value
        for s in self.stances:
            s['Body ID'] = int(s['Body ID'])
        
        #copy all bodies into a dictionary
        for article in articles:
            self.articles[int(article['Body ID'])] = article['articleBody']

        print("Total stances: " + str(len(self.stances)))
        print("Total bodies: " + str(len(self.articles)))



    def read(self,filename):
        rows = []
        with open(self.path + "/" + filename, "rt", encoding='utf-8-sig') as table:
            r = DictReader(table)
            for line in r:
                rows.append(line)
        return rows

### Generate test splits 

In [87]:
def generate_hold_out_split (dataset, training = 0.8, base_dir="splits"):
    r = random.Random()
    r.seed(1489215)

    article_ids = list(dataset.articles.keys())  # get a list of article ids
    r.shuffle(article_ids)  # and shuffle that list


    training_ids = article_ids[:int(training * len(article_ids))]
    hold_out_ids = article_ids[int(training * len(article_ids)):]

    # write the split body ids out to files for future use
    with open(base_dir+ "/"+ "training_ids.txt", "w+") as f:
        f.write("\n".join([str(id) for id in training_ids]))

    with open(base_dir+ "/"+ "hold_out_ids.txt", "w+") as f:
        f.write("\n".join([str(id) for id in hold_out_ids]))



def read_ids(file,base):
    ids = []
    with open(base+"/"+file,"r") as f:
        for line in f:
           ids.append(int(line))
        return ids


def kfold_split(dataset, training = 0.8, n_folds = 10, base_dir="splits"):
    if not (os.path.exists(base_dir+ "/"+ "training_ids.txt")
            and os.path.exists(base_dir+ "/"+ "hold_out_ids.txt")):
        generate_hold_out_split(dataset,training,base_dir)

    training_ids = read_ids("training_ids.txt", base_dir)
    hold_out_ids = read_ids("hold_out_ids.txt", base_dir)

    folds = []
    for k in range(n_folds):
        folds.append(training_ids[int(k*len(training_ids)/n_folds):int((k+1)*len(training_ids)/n_folds)])

    return folds,hold_out_ids


def get_stances_for_folds(dataset,folds,hold_out):
    stances_folds = defaultdict(list)
    stances_hold_out = []
    for stance in dataset.stances:
        if stance['Body ID'] in hold_out:
            stances_hold_out.append(stance)
        else:
            fold_id = 0
            for fold in folds:
                if stance['Body ID'] in fold:
                    stances_folds[fold_id].append(stance)
                fold_id += 1

    return stances_folds,stances_hold_out

### Score

In [88]:
#Adapted from https://github.com/FakeNewsChallenge/fnc-1/blob/master/scorer.py
#Original credit - @bgalbraith

LABELS = ['agree', 'disagree', 'discuss', 'unrelated']
LABELS_RELATED = ['unrelated','related']
RELATED = LABELS[0:3]

def score_submission(gold_labels, test_labels):
    score = 0.0
    cm = [[0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0]]

    for i, (g, t) in enumerate(zip(gold_labels, test_labels)):
        g_stance, t_stance = g, t
        if g_stance == t_stance:
            score += 0.25
            if g_stance != 'unrelated':
                score += 0.50
        if g_stance in RELATED and t_stance in RELATED:
            score += 0.25

        cm[LABELS.index(g_stance)][LABELS.index(t_stance)] += 1

    return score, cm


def print_confusion_matrix(cm):
    lines = []
    header = "|{:^11}|{:^11}|{:^11}|{:^11}|{:^11}|".format('', *LABELS)
    line_len = len(header)
    lines.append("-"*line_len)
    lines.append(header)
    lines.append("-"*line_len)

    hit = 0
    total = 0
    for i, row in enumerate(cm):
        hit += row[i]
        total += sum(row)
        lines.append("|{:^11}|{:^11}|{:^11}|{:^11}|{:^11}|".format(LABELS[i],
                                                                   *row))
        lines.append("-"*line_len)
    print('\n'.join(lines))


def report_score(actual,predicted):
    score,cm = score_submission(actual,predicted)
    best_score, _ = score_submission(actual,actual)

    print_confusion_matrix(cm)
    print("Score: " +str(score) + " out of " + str(best_score) + "\t("+str(score*100/best_score) + "%)")
    return score*100/best_score


if __name__ == "__main__":
    actual = [0,0,0,0,1,1,0,3,3]
    predicted = [0,0,0,0,1,1,2,3,3]

    report_score([LABELS[e] for e in actual],[LABELS[e] for e in predicted])

-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |     4     |     0     |     1     |     0     |
-------------------------------------------------------------
| disagree  |     0     |     2     |     0     |     0     |
-------------------------------------------------------------
|  discuss  |     0     |     0     |     0     |     0     |
-------------------------------------------------------------
| unrelated |     0     |     0     |     0     |     2     |
-------------------------------------------------------------
Score: 6.75 out of 7.5	(90.0%)


### Systems

In [89]:
def parse_params():
    parser = argparse.ArgumentParser(description='FakeNewsChallenge fnc-1-baseline')
    parser.add_argument('-c', '--clean-cache', action='store_true', default=False, help="clean cache files")
#    params = parser.parse_args()
#    if not params.clean_cache:
#        return

    dr = "features"
    for f in os.listdir(dr):
        if re.search('\.npy$', f):
            fname = os.path.join(dr, f)
            os.remove(fname)
    for f in ['hold_out_ids.txt', 'training_ids.txt']:
        fname = os.path.join('splits', f)
        if os.path.isfile(fname):
            os.remove(fname)
    print("All clear")

def check_version():
    if sys.version_info.major < 3:
        sys.stderr.write('Please use Python version 3 and above\n')
        sys.exit(1)

#### Generate Features

In [90]:
def generate_features(stances,dataset,name):
    h, b, y = [],[],[]

    for stance in stances:
        y.append(LABELS.index(stance['Stance']))
        h.append(stance['Headline'])
        b.append(dataset.articles[stance['Body ID']])

    X_overlap = gen_or_load_feats(word_overlap_features, h, b, "features/overlap."+name+".npy")
    X_refuting = gen_or_load_feats(refuting_features, h, b, "features/refuting."+name+".npy")
    X_polarity = gen_or_load_feats(polarity_features, h, b, "features/polarity."+name+".npy")
    X_hand = gen_or_load_feats(hand_features, h, b, "features/hand."+name+".npy")

    X = np.c_[X_hand, X_polarity, X_refuting, X_overlap]
    return X,y



In [101]:
if __name__ == "__main__":
    check_version()
    parse_params()


    #Load the training dataset and generate folds
    d = DataSet()
    folds,hold_out = kfold_split(d,n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d,folds,hold_out)
    
        # Load the competition dataset
    competition_dataset = DataSet("competition_test")
    X_competition, y_competition = generate_features(competition_dataset.stances, competition_dataset, "competition")

    Xs = dict()
    ys = dict()

    # Load/Precompute all features now
    X_holdout,y_holdout = generate_features(hold_out_stances,d,"holdout")
    for fold in fold_stances:
        Xs[fold],ys[fold] = generate_features(fold_stances[fold],d,str(fold))


    best_score = 0
    best_fold = None
    
# # Classifier for each fold
# ## Uncomment this for loop code when you run this notebook for the first time. Takes over 30 mins to run.
#     for fold in fold_stances:
#         ids = list(range(len(folds)))
#         del ids[fold]

#         X_train = np.vstack(tuple([Xs[i] for i in ids]))
#         y_train = np.hstack(tuple([ys[i] for i in ids]))

#         X_test = Xs[fold]
#         y_test = ys[fold]

#         clf = GradientBoostingClassifier(n_estimators=200, random_state=14128, verbose=False)
#         clf.fit(X_train, y_train)

#         predicted = [LABELS[int(a)] for a in clf.predict(X_test)]
#         actual = [LABELS[int(a)] for a in y_test]

#         fold_score, _ = score_submission(actual, predicted)
#         max_fold_score, _ = score_submission(actual, actual)

#         score = fold_score/max_fold_score

#         print("Score for fold "+ str(fold) + " was - " + str(score))
#         if score > best_score:
#             best_score = score
#             best_fold = clf
#-----------------



    #Run on Holdout set and report the final score on the holdout set
    predicted = [LABELS[int(a)] for a in best_fold.predict(X_holdout)]
    actual = [LABELS[int(a)] for a in y_holdout]

    print("Scores on the dev set")
    report_score(actual,predicted)
    print("")
    print("")

    #Run on competition dataset
    predicted = [LABELS[int(a)] for a in best_fold.predict(X_competition)]
    actual = [LABELS[int(a)] for a in y_competition]

    print("Scores on the test set")
    report_score(actual,predicted)
            

All clear
Reading dataset
Total stances: 49972
Total bodies: 1683








Reading dataset
Total stances: 25413
Total bodies: 904


0it [00:00, ?it/s][A[A[A[A



35it [00:00, 338.25it/s][A[A[A[A



66it [00:00, 328.56it/s][A[A[A[A



104it [00:00, 342.11it/s][A[A[A[A



143it [00:00, 351.79it/s][A[A[A[A



178it [00:00, 349.37it/s][A[A[A[A



214it [00:00, 351.37it/s][A[A[A[A



246it [00:00, 323.48it/s][A[A[A[A



277it [00:00, 301.34it/s][A[A[A[A



306it [00:00, 267.75it/s][A[A[A[A



333it [00:01, 268.03it/s][A[A[A[A



370it [00:01, 289.79it/s][A[A[A[A



400it [00:01, 287.96it/s][A[A[A[A



441it [00:01, 315.96it/s][A[A[A[A



476it [00:01, 324.74it/s][A[A[A[A



510it [00:01, 322.81it/s][A[A[A[A



550it [00:01, 341.55it/s][A[A[A[A



585it [00:01, 334.72it/s][A[A[A[A



620it [00:01, 337.61it/s][A[A[A[A



656it [00:02, 338.21it/s][A[A[A[A



691it [00:02, 324.51it/s][A[A[A[A



730it [00:02, 341.60it/s][A[A[A[A



765it [00:02, 320.11it/s][A[A[A[A



800it [00:02, 326.68it/s][A[A[A[A



834it [00:02, 318.78it/s]

13272it [00:43, 382.40it/s][A[A[A[A



13311it [00:43, 366.97it/s][A[A[A[A



13349it [00:43, 361.39it/s][A[A[A[A



13386it [00:43, 349.45it/s][A[A[A[A



13422it [00:43, 343.02it/s][A[A[A[A



13457it [00:43, 343.10it/s][A[A[A[A



13497it [00:44, 354.98it/s][A[A[A[A



13537it [00:44, 366.26it/s][A[A[A[A



13574it [00:44, 361.84it/s][A[A[A[A



13611it [00:44, 348.35it/s][A[A[A[A



13647it [00:44, 338.12it/s][A[A[A[A



13682it [00:44, 317.73it/s][A[A[A[A



13715it [00:44, 316.11it/s][A[A[A[A



13749it [00:44, 322.02it/s][A[A[A[A



13782it [00:44, 308.72it/s][A[A[A[A



13818it [00:45, 321.71it/s][A[A[A[A



13851it [00:45, 310.22it/s][A[A[A[A



13883it [00:45, 304.25it/s][A[A[A[A



13918it [00:45, 315.98it/s][A[A[A[A



13950it [00:45, 307.40it/s][A[A[A[A



13988it [00:45, 325.35it/s][A[A[A[A



14021it [00:45, 318.32it/s][A[A[A[A



14054it [00:45, 293.32it/s][A[A[A[A



14089it [00

17038it [00:02, 6029.78it/s][A[A[A[A



17642it [00:02, 5979.68it/s][A[A[A[A



18242it [00:03, 5985.08it/s][A[A[A[A



18873it [00:03, 6078.87it/s][A[A[A[A



19482it [00:03, 6011.72it/s][A[A[A[A



20084it [00:03, 5926.99it/s][A[A[A[A



20678it [00:03, 5764.59it/s][A[A[A[A



21313it [00:03, 5926.96it/s][A[A[A[A



21908it [00:03, 5930.02it/s][A[A[A[A



22524it [00:03, 5996.00it/s][A[A[A[A



23125it [00:03, 5972.38it/s][A[A[A[A



23748it [00:03, 6047.35it/s][A[A[A[A



24354it [00:04, 6022.45it/s][A[A[A[A



24963it [00:04, 6041.75it/s][A[A[A[A



25413it [00:04, 6019.15it/s][A[A[A[A



0it [00:00, ?it/s][A[A[A[A



40it [00:00, 376.93it/s][A[A[A[A



76it [00:00, 369.69it/s][A[A[A[A



113it [00:00, 369.57it/s][A[A[A[A



147it [00:00, 358.52it/s][A[A[A[A



183it [00:00, 357.91it/s][A[A[A[A



225it [00:00, 372.04it/s][A[A[A[A



258it [00:00, 355.44it/s][A[A[A[A



291it [00:00, 332.59i

14137it [00:40, 331.24it/s][A[A[A[A



14174it [00:40, 339.21it/s][A[A[A[A



14209it [00:40, 340.06it/s][A[A[A[A



14245it [00:40, 344.72it/s][A[A[A[A



14280it [00:40, 339.95it/s][A[A[A[A



14316it [00:40, 345.69it/s][A[A[A[A



14352it [00:40, 349.05it/s][A[A[A[A



14388it [00:40, 334.42it/s][A[A[A[A



14422it [00:40, 327.75it/s][A[A[A[A



14455it [00:40, 313.37it/s][A[A[A[A



14495it [00:41, 332.13it/s][A[A[A[A



14529it [00:41, 332.95it/s][A[A[A[A



14565it [00:41, 339.73it/s][A[A[A[A



14600it [00:41, 323.07it/s][A[A[A[A



14635it [00:41, 314.50it/s][A[A[A[A



14667it [00:41, 315.22it/s][A[A[A[A



14702it [00:41, 323.22it/s][A[A[A[A



14735it [00:41, 307.81it/s][A[A[A[A



14767it [00:42, 277.97it/s][A[A[A[A



14802it [00:42, 295.76it/s][A[A[A[A



14833it [00:42, 293.58it/s][A[A[A[A



14863it [00:42, 284.64it/s][A[A[A[A



14894it [00:42, 289.16it/s][A[A[A[A



14926it [00

2103it [00:08, 228.40it/s][A[A[A[A



2131it [00:08, 238.61it/s][A[A[A[A



2157it [00:08, 230.38it/s][A[A[A[A



2185it [00:08, 242.10it/s][A[A[A[A



2212it [00:08, 249.14it/s][A[A[A[A



2238it [00:08, 241.24it/s][A[A[A[A



2269it [00:08, 257.51it/s][A[A[A[A



2296it [00:08, 258.40it/s][A[A[A[A



2329it [00:09, 275.58it/s][A[A[A[A



2358it [00:09, 241.82it/s][A[A[A[A



2387it [00:09, 253.45it/s][A[A[A[A



2426it [00:09, 282.23it/s][A[A[A[A



2459it [00:09, 294.04it/s][A[A[A[A



2490it [00:09, 287.22it/s][A[A[A[A



2526it [00:09, 305.24it/s][A[A[A[A



2558it [00:09, 269.71it/s][A[A[A[A



2588it [00:09, 275.37it/s][A[A[A[A



2617it [00:10, 263.69it/s][A[A[A[A



2645it [00:10, 262.05it/s][A[A[A[A



2672it [00:10, 250.69it/s][A[A[A[A



2698it [00:10, 237.24it/s][A[A[A[A



2723it [00:10, 235.91it/s][A[A[A[A



2749it [00:10, 239.60it/s][A[A[A[A



2777it [00:10, 246.04it/s][A[A[

13143it [00:51, 240.03it/s][A[A[A[A



13168it [00:51, 236.71it/s][A[A[A[A



13196it [00:51, 246.74it/s][A[A[A[A



13224it [00:51, 253.09it/s][A[A[A[A



13255it [00:51, 267.61it/s][A[A[A[A



13283it [00:51, 257.94it/s][A[A[A[A



13310it [00:51, 252.28it/s][A[A[A[A



13340it [00:51, 261.31it/s][A[A[A[A



13367it [00:51, 250.04it/s][A[A[A[A



13393it [00:52, 228.54it/s][A[A[A[A



13420it [00:52, 235.53it/s][A[A[A[A



13449it [00:52, 248.75it/s][A[A[A[A



13475it [00:52, 243.75it/s][A[A[A[A



13501it [00:52, 247.58it/s][A[A[A[A



13527it [00:52, 250.72it/s][A[A[A[A



13558it [00:52, 260.34it/s][A[A[A[A



13585it [00:52, 255.41it/s][A[A[A[A



13613it [00:52, 260.20it/s][A[A[A[A



13640it [00:53, 261.88it/s][A[A[A[A



13667it [00:53, 250.31it/s][A[A[A[A



13693it [00:53, 236.66it/s][A[A[A[A



13723it [00:53, 251.96it/s][A[A[A[A



13752it [00:53, 262.05it/s][A[A[A[A



13779it [00

23868it [01:33, 214.60it/s][A[A[A[A



23891it [01:33, 214.98it/s][A[A[A[A



23914it [01:33, 218.47it/s][A[A[A[A



23949it [01:33, 245.23it/s][A[A[A[A



23975it [01:33, 233.47it/s][A[A[A[A



24000it [01:33, 237.32it/s][A[A[A[A



24025it [01:33, 228.39it/s][A[A[A[A



24049it [01:34, 227.85it/s][A[A[A[A



24073it [01:34, 226.83it/s][A[A[A[A



24096it [01:34, 217.18it/s][A[A[A[A



24120it [01:34, 222.24it/s][A[A[A[A



24148it [01:34, 235.24it/s][A[A[A[A



24173it [01:34, 239.35it/s][A[A[A[A



24205it [01:34, 256.02it/s][A[A[A[A



24232it [01:34, 242.56it/s][A[A[A[A



24259it [01:34, 241.38it/s][A[A[A[A



24288it [01:35, 238.89it/s][A[A[A[A



24313it [01:35, 215.65it/s][A[A[A[A



24337it [01:35, 220.99it/s][A[A[A[A



24361it [01:35, 222.95it/s][A[A[A[A



24384it [01:35, 199.01it/s][A[A[A[A



24410it [01:35, 211.40it/s][A[A[A[A



24435it [01:35, 220.68it/s][A[A[A[A



24462it [01

764it [00:02, 322.55it/s][A[A[A[A



806it [00:02, 345.91it/s][A[A[A[A



842it [00:02, 336.41it/s][A[A[A[A



877it [00:02, 328.93it/s][A[A[A[A



911it [00:02, 318.03it/s][A[A[A[A



944it [00:02, 317.00it/s][A[A[A[A



976it [00:03, 309.47it/s][A[A[A[A



1008it [00:03, 302.03it/s][A[A[A[A



1039it [00:03, 301.19it/s][A[A[A[A



1073it [00:03, 311.84it/s][A[A[A[A



1105it [00:03, 307.14it/s][A[A[A[A



1137it [00:03, 309.53it/s][A[A[A[A



1169it [00:03, 305.81it/s][A[A[A[A



1203it [00:03, 314.12it/s][A[A[A[A



1235it [00:03, 310.07it/s][A[A[A[A



1270it [00:04, 319.91it/s][A[A[A[A



1306it [00:04, 328.80it/s][A[A[A[A



1345it [00:04, 342.13it/s][A[A[A[A



1381it [00:04, 344.05it/s][A[A[A[A



1416it [00:04, 329.23it/s][A[A[A[A



1451it [00:04, 334.17it/s][A[A[A[A



1485it [00:04, 324.00it/s][A[A[A[A



1518it [00:04, 317.06it/s][A[A[A[A



1550it [00:04, 303.22it/s][A[A[A[A




3480it [00:14, 221.08it/s][A[A[A[A



3503it [00:14, 219.91it/s][A[A[A[A



3527it [00:15, 224.57it/s][A[A[A[A



3552it [00:15, 231.46it/s][A[A[A[A



3576it [00:15, 232.56it/s][A[A[A[A



3600it [00:15, 229.35it/s][A[A[A[A



3630it [00:15, 245.20it/s][A[A[A[A



3655it [00:15, 245.56it/s][A[A[A[A



3680it [00:15, 246.17it/s][A[A[A[A



3708it [00:15, 253.73it/s][A[A[A[A



3734it [00:15, 252.83it/s][A[A[A[A



3761it [00:15, 257.45it/s][A[A[A[A



3792it [00:16, 270.87it/s][A[A[A[A



3820it [00:16, 263.71it/s][A[A[A[A



3847it [00:16, 244.03it/s][A[A[A[A



3872it [00:16, 243.57it/s][A[A[A[A



3901it [00:16, 255.19it/s][A[A[A[A



3927it [00:16, 236.68it/s][A[A[A[A



3952it [00:16, 234.59it/s][A[A[A[A



3982it [00:16, 250.07it/s][A[A[A[A



4009it [00:16, 252.41it/s][A[A[A[A



4035it [00:17, 236.61it/s][A[A[A[A



4061it [00:17, 238.14it/s][A[A[A[A



4088it [00:17, 245.83it/s][A[A[

536it [00:01, 255.91it/s][A[A[A[A



563it [00:01, 248.68it/s][A[A[A[A



589it [00:02, 248.20it/s][A[A[A[A



616it [00:02, 253.92it/s][A[A[A[A



650it [00:02, 273.52it/s][A[A[A[A



678it [00:02, 268.54it/s][A[A[A[A



706it [00:02, 267.51it/s][A[A[A[A



734it [00:02, 261.20it/s][A[A[A[A



761it [00:02, 253.63it/s][A[A[A[A



790it [00:02, 262.24it/s][A[A[A[A



817it [00:02, 251.56it/s][A[A[A[A



843it [00:03, 231.75it/s][A[A[A[A



869it [00:03, 239.00it/s][A[A[A[A



894it [00:03, 221.62it/s][A[A[A[A



919it [00:03, 226.89it/s][A[A[A[A



943it [00:03, 223.70it/s][A[A[A[A



966it [00:03, 224.28it/s][A[A[A[A



998it [00:03, 245.90it/s][A[A[A[A



1026it [00:03, 252.39it/s][A[A[A[A



1065it [00:03, 281.81it/s][A[A[A[A



1095it [00:04, 279.67it/s][A[A[A[A



1125it [00:04, 263.88it/s][A[A[A[A



1153it [00:04, 246.73it/s][A[A[A[A



1186it [00:04, 266.60it/s][A[A[A[A



1214it [00

3322it [00:11, 330.25it/s][A[A[A[A



3356it [00:11, 322.90it/s][A[A[A[A



3389it [00:11, 321.68it/s][A[A[A[A



3424it [00:11, 324.79it/s][A[A[A[A



3458it [00:11, 324.47it/s][A[A[A[A



3491it [00:11, 323.93it/s][A[A[A[A



3527it [00:11, 328.70it/s][A[A[A[A



3567it [00:12, 344.92it/s][A[A[A[A



3602it [00:12, 343.91it/s][A[A[A[A



3637it [00:12, 344.62it/s][A[A[A[A



3672it [00:12, 342.76it/s][A[A[A[A



3707it [00:12, 326.20it/s][A[A[A[A



3740it [00:12, 324.53it/s][A[A[A[A



3778it [00:12, 338.69it/s][A[A[A[A



3813it [00:12, 330.28it/s][A[A[A[A



3852it [00:12, 345.14it/s][A[A[A[A



3887it [00:13, 344.03it/s][A[A[A[A



3922it [00:13, 338.63it/s][A[A[A[A



3957it [00:13, 323.74it/s][A[A[A[A



3990it [00:13, 304.20it/s][A[A[A[A



4021it [00:13, 304.34it/s][A[A[A[A



4054it [00:13, 310.93it/s][A[A[A[A



4090it [00:13, 323.38it/s][A[A[A[A



4128it [00:13, 336.36it/s][A[A[

4231it [00:20, 241.40it/s][A[A[A[A



4256it [00:20, 227.09it/s][A[A[A[A



4280it [00:20, 224.13it/s][A[A[A[A



4305it [00:20, 229.30it/s][A[A[A[A



4330it [00:20, 232.19it/s][A[A[A[A



4355it [00:20, 237.18it/s][A[A[A[A



4379it [00:20, 232.45it/s][A[A[A[A



4403it [00:20, 223.62it/s][A[A[A[A



4426it [00:20, 217.89it/s][A[A[A[A



4451it [00:21, 225.69it/s][A[A[A[A



4474it [00:21, 214.74it/s][A[A[A[A



4497it [00:21, 218.70it/s][A[A[A[A



4525it [00:21, 229.83it/s][A[A[A[A



4549it [00:21, 218.26it/s][A[A[A[A



4577it [00:21, 233.23it/s][A[A[A[A



4601it [00:21, 233.87it/s][A[A[A[A



4625it [00:21, 225.16it/s][A[A[A[A



4650it [00:21, 229.64it/s][A[A[A[A



4663it [00:21, 212.33it/s][A[A[A[A



0it [00:00, ?it/s][A[A[A[A



33it [00:00, 326.74it/s][A[A[A[A



69it [00:00, 332.57it/s][A[A[A[A



100it [00:00, 325.04it/s][A[A[A[A



141it [00:00, 345.85it/s][A[A[A[A



174it 

915it [00:02, 297.23it/s][A[A[A[A



946it [00:02, 296.96it/s][A[A[A[A



977it [00:03, 297.16it/s][A[A[A[A



1013it [00:03, 313.12it/s][A[A[A[A



1050it [00:03, 326.78it/s][A[A[A[A



1085it [00:03, 331.46it/s][A[A[A[A



1119it [00:03, 330.54it/s][A[A[A[A



1155it [00:03, 333.24it/s][A[A[A[A



1191it [00:03, 340.70it/s][A[A[A[A



1226it [00:03, 341.46it/s][A[A[A[A



1261it [00:03, 329.49it/s][A[A[A[A



1296it [00:04, 333.46it/s][A[A[A[A



1330it [00:04, 333.18it/s][A[A[A[A



1366it [00:04, 340.20it/s][A[A[A[A



1401it [00:04, 308.44it/s][A[A[A[A



1433it [00:04, 307.29it/s][A[A[A[A



1469it [00:04, 319.75it/s][A[A[A[A



1502it [00:04, 313.73it/s][A[A[A[A



1535it [00:04, 317.46it/s][A[A[A[A



1572it [00:04, 327.70it/s][A[A[A[A



1607it [00:05, 326.43it/s][A[A[A[A



1640it [00:05, 316.62it/s][A[A[A[A



1678it [00:05, 332.00it/s][A[A[A[A



1712it [00:05, 315.79it/s][A[A[A[

237it [00:01, 229.32it/s][A[A[A[A



262it [00:01, 234.06it/s][A[A[A[A



286it [00:01, 218.58it/s][A[A[A[A



313it [00:01, 229.18it/s][A[A[A[A



337it [00:01, 231.84it/s][A[A[A[A



361it [00:01, 219.58it/s][A[A[A[A



387it [00:01, 230.14it/s][A[A[A[A



414it [00:01, 239.23it/s][A[A[A[A



439it [00:01, 228.82it/s][A[A[A[A



465it [00:02, 233.75it/s][A[A[A[A



489it [00:02, 234.89it/s][A[A[A[A



513it [00:02, 225.28it/s][A[A[A[A



538it [00:02, 231.46it/s][A[A[A[A



568it [00:02, 247.37it/s][A[A[A[A



594it [00:02, 191.41it/s][A[A[A[A



622it [00:02, 210.14it/s][A[A[A[A



646it [00:02, 213.08it/s][A[A[A[A



669it [00:02, 204.47it/s][A[A[A[A



696it [00:03, 218.82it/s][A[A[A[A



721it [00:03, 224.86it/s][A[A[A[A



745it [00:03, 188.43it/s][A[A[A[A



768it [00:03, 195.65it/s][A[A[A[A



789it [00:03, 187.09it/s][A[A[A[A



809it [00:03, 158.61it/s][A[A[A[A



827it [00:03, 14

594it [00:02, 229.67it/s][A[A[A[A



618it [00:02, 218.07it/s][A[A[A[A



641it [00:02, 219.56it/s][A[A[A[A



665it [00:02, 223.50it/s][A[A[A[A



692it [00:02, 230.52it/s][A[A[A[A



716it [00:03, 220.75it/s][A[A[A[A



739it [00:03, 214.19it/s][A[A[A[A



761it [00:03, 213.28it/s][A[A[A[A



783it [00:03, 210.85it/s][A[A[A[A



805it [00:03, 211.86it/s][A[A[A[A



830it [00:03, 219.16it/s][A[A[A[A



857it [00:03, 230.89it/s][A[A[A[A



886it [00:03, 244.99it/s][A[A[A[A



911it [00:03, 244.71it/s][A[A[A[A



936it [00:04, 236.06it/s][A[A[A[A



960it [00:04, 236.82it/s][A[A[A[A



984it [00:04, 204.09it/s][A[A[A[A



1008it [00:04, 212.31it/s][A[A[A[A



1033it [00:04, 220.19it/s][A[A[A[A



1056it [00:04, 218.47it/s][A[A[A[A



1080it [00:04, 223.79it/s][A[A[A[A



1103it [00:04, 218.38it/s][A[A[A[A



1126it [00:04, 219.36it/s][A[A[A[A



1149it [00:05, 214.75it/s][A[A[A[A



1176it [0

78it [00:00, 256.73it/s][A[A[A[A



107it [00:00, 265.67it/s][A[A[A[A



134it [00:00, 265.12it/s][A[A[A[A



158it [00:00, 255.55it/s][A[A[A[A



183it [00:00, 253.55it/s][A[A[A[A



207it [00:00, 243.92it/s][A[A[A[A



230it [00:00, 233.18it/s][A[A[A[A



253it [00:01, 225.00it/s][A[A[A[A



284it [00:01, 244.72it/s][A[A[A[A



311it [00:01, 250.93it/s][A[A[A[A



341it [00:01, 260.02it/s][A[A[A[A



371it [00:01, 270.31it/s][A[A[A[A



399it [00:01, 258.36it/s][A[A[A[A



426it [00:01, 239.32it/s][A[A[A[A



451it [00:01, 226.18it/s][A[A[A[A



480it [00:01, 241.74it/s][A[A[A[A



511it [00:02, 253.74it/s][A[A[A[A



537it [00:02, 253.32it/s][A[A[A[A



563it [00:02, 250.57it/s][A[A[A[A



589it [00:02, 246.60it/s][A[A[A[A



614it [00:02, 245.70it/s][A[A[A[A



645it [00:02, 261.66it/s][A[A[A[A



672it [00:02, 233.98it/s][A[A[A[A



698it [00:02, 236.61it/s][A[A[A[A



723it [00:02, 207

3199it [00:11, 303.34it/s][A[A[A[A



3234it [00:11, 314.37it/s][A[A[A[A



3266it [00:11, 306.99it/s][A[A[A[A



3303it [00:11, 320.84it/s][A[A[A[A



3336it [00:11, 318.29it/s][A[A[A[A



3372it [00:12, 322.78it/s][A[A[A[A



3406it [00:12, 327.02it/s][A[A[A[A



3445it [00:12, 343.15it/s][A[A[A[A



3480it [00:12, 319.71it/s][A[A[A[A



3513it [00:12, 298.71it/s][A[A[A[A



3544it [00:12, 299.19it/s][A[A[A[A



3579it [00:12, 312.74it/s][A[A[A[A



3611it [00:12, 301.23it/s][A[A[A[A



3642it [00:12, 293.58it/s][A[A[A[A



3682it [00:13, 318.68it/s][A[A[A[A



3715it [00:13, 311.28it/s][A[A[A[A



3752it [00:13, 325.54it/s][A[A[A[A



3786it [00:13, 313.84it/s][A[A[A[A



3825it [00:13, 333.21it/s][A[A[A[A



3848it [00:13, 283.09it/s][A[A[A[A



0it [00:00, ?it/s][A[A[A[A



25it [00:00, 247.67it/s][A[A[A[A



47it [00:00, 238.25it/s][A[A[A[A



66it [00:00, 219.70it/s][A[A[A[A



89it [

469it [00:01, 331.33it/s][A[A[A[A



503it [00:01, 317.10it/s][A[A[A[A



535it [00:01, 314.43it/s][A[A[A[A



569it [00:01, 320.10it/s][A[A[A[A



602it [00:01, 314.66it/s][A[A[A[A



634it [00:01, 302.37it/s][A[A[A[A



665it [00:02, 291.93it/s][A[A[A[A



696it [00:02, 296.98it/s][A[A[A[A



733it [00:02, 315.05it/s][A[A[A[A



767it [00:02, 320.27it/s][A[A[A[A



803it [00:02, 330.67it/s][A[A[A[A



837it [00:02, 321.69it/s][A[A[A[A



870it [00:02, 308.95it/s][A[A[A[A



902it [00:02, 289.84it/s][A[A[A[A



943it [00:02, 317.56it/s][A[A[A[A



985it [00:03, 340.53it/s][A[A[A[A



1022it [00:03, 348.58it/s][A[A[A[A



1058it [00:03, 344.63it/s][A[A[A[A



1094it [00:03, 334.14it/s][A[A[A[A



1128it [00:03, 322.54it/s][A[A[A[A



1161it [00:03, 311.73it/s][A[A[A[A



1199it [00:03, 328.30it/s][A[A[A[A



1233it [00:03, 324.72it/s][A[A[A[A



1266it [00:03, 313.12it/s][A[A[A[A



1305it [

4099it [00:16, 235.24it/s][A[A[A[A



4123it [00:16, 209.51it/s][A[A[A[A



4148it [00:16, 216.82it/s][A[A[A[A



4174it [00:16, 226.31it/s][A[A[A[A



4198it [00:16, 230.01it/s][A[A[A[A



4224it [00:16, 237.83it/s][A[A[A[A



4249it [00:16, 241.20it/s][A[A[A[A



4273it [00:16, 253.75it/s][A[A[A[A



0it [00:00, ?it/s][A[A[A[A



40it [00:00, 398.68it/s][A[A[A[A



70it [00:00, 362.40it/s][A[A[A[A



102it [00:00, 346.27it/s][A[A[A[A



134it [00:00, 336.77it/s][A[A[A[A



163it [00:00, 320.33it/s][A[A[A[A



196it [00:00, 320.24it/s][A[A[A[A



234it [00:00, 336.09it/s][A[A[A[A



266it [00:00, 329.42it/s][A[A[A[A



298it [00:00, 326.30it/s][A[A[A[A



330it [00:01, 302.81it/s][A[A[A[A



360it [00:01, 291.37it/s][A[A[A[A



393it [00:01, 301.95it/s][A[A[A[A



424it [00:01, 286.46it/s][A[A[A[A



453it [00:01, 280.89it/s][A[A[A[A



490it [00:01, 301.89it/s][A[A[A[A



521it [00:01, 301

3338it [00:13, 236.54it/s][A[A[A[A



3368it [00:13, 249.67it/s][A[A[A[A



3394it [00:14, 237.82it/s][A[A[A[A



3421it [00:14, 243.52it/s][A[A[A[A



3450it [00:14, 255.52it/s][A[A[A[A



3476it [00:14, 238.77it/s][A[A[A[A



3501it [00:14, 233.77it/s][A[A[A[A



3525it [00:14, 227.30it/s][A[A[A[A



3552it [00:14, 238.49it/s][A[A[A[A



3577it [00:14, 232.47it/s][A[A[A[A



3602it [00:14, 236.61it/s][A[A[A[A



3626it [00:15, 233.72it/s][A[A[A[A



3650it [00:15, 233.39it/s][A[A[A[A



3674it [00:15, 224.07it/s][A[A[A[A



3697it [00:15, 206.78it/s][A[A[A[A



3722it [00:15, 216.97it/s][A[A[A[A



3745it [00:15, 208.74it/s][A[A[A[A



3767it [00:15, 205.48it/s][A[A[A[A



3789it [00:15, 207.33it/s][A[A[A[A



3819it [00:15, 224.43it/s][A[A[A[A



3849it [00:16, 241.74it/s][A[A[A[A



3874it [00:16, 240.08it/s][A[A[A[A



3901it [00:16, 246.35it/s][A[A[A[A



3927it [00:16, 249.13it/s][A[A[

1973it [00:05, 317.97it/s][A[A[A[A



2007it [00:05, 323.11it/s][A[A[A[A



2042it [00:06, 326.24it/s][A[A[A[A



2075it [00:06, 312.65it/s][A[A[A[A



2107it [00:06, 304.94it/s][A[A[A[A



2141it [00:06, 307.53it/s][A[A[A[A



2174it [00:06, 313.82it/s][A[A[A[A



2206it [00:06, 314.44it/s][A[A[A[A



2238it [00:06, 312.05it/s][A[A[A[A



2275it [00:06, 327.12it/s][A[A[A[A



2314it [00:06, 342.97it/s][A[A[A[A



2349it [00:07, 328.87it/s][A[A[A[A



2383it [00:07, 314.62it/s][A[A[A[A



2421it [00:07, 330.75it/s][A[A[A[A



2457it [00:07, 337.42it/s][A[A[A[A



2492it [00:07, 303.48it/s][A[A[A[A



2532it [00:07, 326.65it/s][A[A[A[A



2566it [00:07, 319.36it/s][A[A[A[A



2601it [00:07, 326.30it/s][A[A[A[A



2636it [00:07, 323.86it/s][A[A[A[A



2669it [00:08, 311.23it/s][A[A[A[A



2710it [00:08, 326.76it/s][A[A[A[A



2748it [00:08, 340.85it/s][A[A[A[A



2786it [00:08, 350.68it/s][A[A[

Score for fold 6 was - 0.7740591783970123
Score for fold 0 was - 0.790634959548909
Score for fold 7 was - 0.8065337293169283
Score for fold 5 was - 0.7642120765832106
Score for fold 2 was - 0.8175341669089852
Score for fold 8 was - 0.820952380952381
Score for fold 9 was - 0.7873290538654758
Score for fold 3 was - 0.8108217514505465
Score for fold 1 was - 0.7939656376588909
Score for fold 4 was - 0.7953927600515095
Scores on the dev set
-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    118    |     3     |    556    |    85     |
-------------------------------------------------------------
| disagree  |    14     |     3     |    130    |    15     |
-------------------------------------------------------------
|  discuss  |    58     |     5     |   1527    |    210    |
-------------------------------------------------------------
| 

In [268]:
def generate_fn_features(dataset,name):
    h, b = [],[]

    for d in dataset:
        h.append(d[0]) #title
        b.append(d[1]) #text

    X_overlap = gen_or_load_feats(word_overlap_features, h, b, "features/overlap."+name+".npy")
    X_refuting = gen_or_load_feats(refuting_features, h, b, "features/refuting."+name+".npy")
    X_polarity = gen_or_load_feats(polarity_features, h, b, "features/polarity."+name+".npy")
    X_hand = gen_or_load_feats(hand_features, h, b, "features/hand."+name+".npy")

    X = np.c_[X_hand, X_polarity, X_refuting, X_overlap]
    return X

In [287]:
df_Orig = pd.read_csv('fake_real_dataset.csv')
df = pd.read_csv('fake_real_dataset.csv')

In [288]:
def text_column(tuple1):
    if(pd.notna(tuple1[1])):
        if(tuple1[1].strip(' \t\n\r') == ''):
            return tuple1[0]
        else:
            return tuple1[1] 
    else:
            return tuple1[1]

In [289]:
df['text'] = df[['title', 'text']].apply(text_column, axis=1)

In [306]:
def title_column(tuple1):
    if(pd.notna(tuple1[0])):
        if(tuple1[0].strip(' \t\n\r') == ''):
            return re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", tuple1[1])
        else:
            return re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", tuple1[0])
    else:
            return re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", tuple1[0])

In [300]:
fn_dataset = df[['title', 'text']]
fn_dataset.shape

(100, 2)

In [301]:
fn_dataset = fn_dataset.values
len(fn_dataset)

100

In [302]:
X_fn = generate_fn_features(fn_dataset, "FakeNewsStance")

In [303]:
X_fn.shape

(10, 44)

In [276]:
   fn_predicted = [LABELS[int(a)] for a in best_fold.predict(X_fn)]

In [277]:
fn_predicted

['discuss',
 'discuss',
 'discuss',
 'discuss',
 'unrelated',
 'discuss',
 'agree',
 'discuss',
 'discuss',
 'discuss']

In [265]:
#     # Load the Fake News Original dataset
# #    competition_dataset = DataSet("competition_test")
# #    X_competition, y_competition = generate_features(competition_dataset.stances, competition_dataset, "competition")
    
# #    dataset = ["title","text"]
    
#     fn_dataset = df[['title', 'text']]
    
#     fn_dataset = fn_dataset.values
    
#     X_fn = generate_fn_features(fn_dataset, "fakenews")
    
#     # Predict on Fake News Original dataset
#     fn_predicted = [LABELS[int(a)] for a in best_fold.predict(X_fn)]