In [6]:
import csv, logging, re, nltk

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn import feature_extraction
import tqdm
import pandas as pd

class FeatureData(object):
    def __init__(self, article_file_path, stances_file_path):
        self.number_of_classes = 4
        self.classes = ['agree', 'disagree', 'discuss', 'unrelated']
        self.articles = self._get_articles(article_file_path)  # list of dictionaries
        self.stances = self._get_stances(stances_file_path)
        self.number_of_stances = len(self.stances)
        self.number_of_articles = len(self.articles)

    def get_clean_articles(self):
        """Returns a dictionary with Body ID's as keys and article bodies as values."""
        clean_articles = []
        print('Retrieving clean articles...')

        for item in tqdm.tqdm(self.articles):
            cleaned_article = clean(item['articleBody'])
            tokens = tokenize_text(cleaned_article)
            no_stop_word_tokens = remove_stopwords(tokens)
            lemmatized_tokens = get_tokenized_lemmas(no_stop_word_tokens)
            clean_articles.append({'articleBody': ' '.join(lemmatized_tokens),
                                   'Body ID': item['Body ID']})
        return {article['Body ID']: article['articleBody'] for article in clean_articles}

    #We need the stop words for POS tagging to work propperly
    def get_original_articles(self):
        clean_articles = []
        print('Retrieving original articles...')
        for item in tqdm.tqdm(self.articles):
            #cleaned_article = clean(item['articleBody'])
            cleaned_article = item['articleBody'].encode('ascii', 'ignore')
            clean_articles.append({'articleBody':cleaned_article,
                                   'Body ID': item['Body ID']})
        return {article['Body ID']: article['articleBody'] for article in clean_articles}

    def get_clean_stances(self):
        """Retrieves a list of dictionaries containing the fully cleaned Headlines and the Body ID and Stance for
        each headline."""
        clean_headlines = []
        print('Retrieving clean stances...')

        for item in tqdm.tqdm(self.stances):
            cleaned_headline = clean(item['Headline'])
            tokens = tokenize_text(cleaned_headline)
            no_stop_word_tokens = remove_stopwords(tokens)
            lemmatized_tokens = get_tokenized_lemmas(no_stop_word_tokens)
            clean_headlines.append({'Headline': ' '.join(lemmatized_tokens),
                                    'originalHeadline': cleaned_headline,
                                    'Body ID': item['Body ID'],
                                    'Stance': item['Stance']})

        return clean_headlines

    def _get_articles(self, path):
        # Body ID, articleBody
        articles = []
        dt=pd.read_csv(path)
        articles=dt.T.to_dict().values()
        return articles

    def _get_stances(self, path):
        # Headline, Body ID, Stance
        stances = []
        dt1=pd.read_csv(path)
        stances=dt1.T.to_dict().values()
        return stances


def normalize_word(w):
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(w).lower()


def clean(text):
    return " ".join(re.findall(r'\w+', text, flags=re.UNICODE)).lower()


def tokenize_text(text):
    return [token for token in word_tokenize(text)]


def remove_stopwords(list_of_tokens):
    return [word for word in list_of_tokens if word not in feature_extraction.text.ENGLISH_STOP_WORDS]


def get_tokenized_lemmas(tokens):
    return [normalize_word(token) for token in tokens]


In [2]:
feature_data = FeatureData('C:/Users/binni/Downloads/NLP-Fake-News-Challenge-master/NLP-Fake-News-Challenge-master/data/competition_test_bodies.csv', 'C:/Users/binni/Downloads/NLP-Fake-News-Challenge-master/NLP-Fake-News-Challenge-master/data/competition_test_stances.csv')

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
from nltk.stem import PorterStemmer
from nltk import word_tokenize, pos_tag, ne_chunk, sent_tokenize

In [4]:
from gensim import models
from gensim.models.phrases import Phraser
from sklearn.metrics.pairwise import cosine_similarity



In [17]:
_max_ngram_size=3
_refuting_words = ['fake', 'fraud', 'hoax', 'false', 'deny', 'denies', 'not', 'despite', 'nope', 'doubt','doubts', 'bogus', 'debunk', 'pranks', 'retract']

In [5]:
_stances=feature_data.get_clean_stances()
_articles=feature_data.get_clean_articles()
_original_articles=feature_data.get_original_articles()


Retrieving clean stances...


100%|███████████████████████████████████| 25413/25413 [00:52<00:00, 488.70it/s]


Retrieving clean articles...


100%|████████████████████████████████████████| 904/904 [00:19<00:00, 46.41it/s]


Retrieving original articles...


100%|█████████████████████████████████████| 904/904 [00:00<00:00, 25111.43it/s]


In [33]:
ngrams = []
i=0
for stance in tqdm.tqdm(_stances):
    stance_vectorizer = CountVectorizer(input=stance['Headline'], ngram_range=(1, _max_ngram_size),binary=True)
    a=stance_vectorizer.fit_transform([stance['Headline']]).toarray()
    print ("Stance_vectorizer",a)
    vocab = stance_vectorizer.get_feature_names()
    print("vocab",vocab)
    vectorizer = CountVectorizer(input=_articles[stance['Body ID']], vocabulary=vocab,ngram_range=(1, _max_ngram_size))
    ngram_counts = vectorizer.fit_transform([_articles[stance['Body ID']]]).toarray()
    print("ngram_counts",ngram_counts)
    features = vectorizer.get_feature_names()
    print("features",features)
    aggregated_counts = [0 for _ in range(_max_ngram_size)]
    for index in np.nditer(np.nonzero(ngram_counts[0]), ['zerosize_ok']):
        aggregated_counts[len(features[index].split()) - 1] += ngram_counts[0][index]
    print("aggregated_counts",aggregated_counts)
    standardized_counts = [1.0*count/len(stance['Headline'].split()) for count in aggregated_counts]
    ngrams.append(standardized_counts)
    print("ngrams",ngrams)
    i=i+1
    if (i==4):
        break

        

  0%|                                                | 0/25413 [00:00<?, ?it/s]

Stance_vectorizer [[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]]
vocab ['bag', 'bag round', 'bag round car', 'bean', 'bean bag', 'bean bag round', 'car', 'car window', 'cop', 'cop bean', 'cop bean bag', 'eye', 'eye cop', 'eye cop bean', 'ferguson', 'ferguson riot', 'ferguson riot pregnant', 'loses', 'loses eye', 'loses eye cop', 'pregnant', 'pregnant woman', 'pregnant woman loses', 'riot', 'riot pregnant', 'riot pregnant woman', 'round', 'round car', 'round car window', 'window', 'woman', 'woman loses', 'woman loses eye']
ngram_counts [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
features ['bag', 'bag round', 'bag round car', 'bean', 'bean bag', 'bean bag round', 'car', 'car window', 'cop', 'cop bean', 'cop bean bag', 'eye', 'eye cop', 'eye cop bean', 'ferguson', 'ferguson riot', 'ferguson riot pregnant', 'loses', 'loses eye', 'loses eye cop', 'pregnant', 'pregnant woman', 'pregnant woman loses', 'riot', 'riot pregnant', 'riot pregnant wom

  0%|                                        | 1/25413 [00:00<47:00,  9.01it/s]

Stance_vectorizer [[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]]
vocab ['conservative', 'conservative sure', 'conservative sure gitmo', 'crazy', 'crazy conservative', 'crazy conservative sure', 'detainee', 'detainee killed', 'detainee killed james', 'foley', 'gitmo', 'gitmo detainee', 'gitmo detainee killed', 'james', 'james foley', 'killed', 'killed james', 'killed james foley', 'sure', 'sure gitmo', 'sure gitmo detainee']
ngram_counts [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
features ['conservative', 'conservative sure', 'conservative sure gitmo', 'crazy', 'crazy conservative', 'crazy conservative sure', 'detainee', 'detainee killed', 'detainee killed james', 'foley', 'gitmo', 'gitmo detainee', 'gitmo detainee killed', 'james', 'james foley', 'killed', 'killed james', 'killed james foley', 'sure', 'sure gitmo', 'sure gitmo detainee']
aggregated_counts [0, 0, 0]
ngrams [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]


  0%|                                        | 2/25413 [00:00<46:22,  9.13it/s]

Stance_vectorizer [[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]]
vocab ['attack', 'bear', 'bear attack', 'bieber', 'bieber ringtone', 'bieber ringtone saved', 'guy', 'guy say', 'guy say justin', 'justin', 'justin bieber', 'justin bieber ringtone', 'ringtone', 'ringtone saved', 'ringtone saved bear', 'russian', 'russian guy', 'russian guy say', 'saved', 'saved bear', 'saved bear attack', 'say', 'say justin', 'say justin bieber']
ngram_counts [[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
features ['attack', 'bear', 'bear attack', 'bieber', 'bieber ringtone', 'bieber ringtone saved', 'guy', 'guy say', 'guy say justin', 'justin', 'justin bieber', 'justin bieber ringtone', 'ringtone', 'ringtone saved', 'ringtone saved bear', 'russian', 'russian guy', 'russian guy say', 'saved', 'saved bear', 'saved bear attack', 'say', 'say justin', 'say justin bieber']
aggregated_counts [1, 0, 0]
ngrams [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.1111111111111111, 0.0, 0.0]]
Stance_vectorizer [[1 1 1 




In [None]:
print(_original_articles[1])

In [None]:
feature_names=[]
if True:
    print('Retrieving headline ngrams...')
    ngrams = np.array(ngrams)
    features.append(ngrams)
    ngram_headings = [('ngram_' + str(count)) for count in range(1,_max_ngram_size + 1)]
    feature_names.append(ngram_headings)
    print(feature_names)
    print(ngram_headings)

In [7]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
all_words = []; atricle_words = []
for stance in tqdm.tqdm(_stances):
    if stance['Stance'] == 'unrelated':
        pass
    body_words = []; headline_words = []
    headline = tokenizer.tokenize(stance['originalHeadline'])
    body = (tokenizer.tokenize(_original_articles[stance['Body ID']].decode('utf-8')))[:4]
    for s in headline:
        s = word_tokenize(s)
        headline_words = headline_words + s
        all_words.append(s)
    for s in body:
        s = word_tokenize(s)
        body_words = body_words + s
        all_words.append(s)
    atricle_words.append([headline_words, body_words])

model = models.Word2Vec(all_words, size=100, min_count=1)
cosine_similarities = []
        # Generate sentence vectors and computer cosine similarity
for headline, body in atricle_words:
    h_vector = sum([model.wv[word] for word in headline])
    b_vector = sum([model.wv[word] for word in body])
    cosine_similarities.append(cosine_similarity(h_vector.reshape(1,-1), b_vector.reshape(1,-1)))

   


100%|████████████████████████████████████| 25413/25413 [05:08<00:00, 82.29it/s]


In [11]:
features = []
i=0
for stance in tqdm.tqdm(_stances):
    count = [1 if refute_word in stance['Headline'] else 0 for refute_word in _refuting_words]
    features.append(count)
    

100%|█████████████████████████████████| 25413/25413 [00:00<00:00, 55447.77it/s]


In [17]:
_refuting_words = ['fake', 'fraud', 'hoax', 'false', 'deny', 'denies', 'not','despite', 'nope', 'nowhere', 'doubt', 'doubts', 'bogus', 'debunk', 'pranks','retract', 'nothing', 'never', 'none', 'budge']

def determine_polarity(text):
    tokens = tokenize_text(text)
    return sum([token in _refuting_words for token in tokens]) % 2

polarities = []
for stance in tqdm.tqdm(_stances):
    headline_polarity = determine_polarity(stance['Headline'])

    body_polarity = determine_polarity(_articles.get(stance['Body ID']))

    polarities.append([headline_polarity, body_polarity])

100%|████████████████████████████████████| 25413/25413 [03:46<00:00, 53.81it/s]


In [15]:
stemmer = PorterStemmer()
i=0
def get_tags(text):
    return pos_tag(word_tokenize(text))
def filter_pos(named_tags, tag):
    return " ".join([stemmer.stem(name[0]) for name in named_tags if name[1].startswith(tag)])
named_cosine = []
tags = ["NN"]
for stance in tqdm.tqdm(_stances):
    #print(stance)
    stance_cosine = []
    head = get_tags(stance['originalHeadline'])
    #print(head)
    body = get_tags(_original_articles.get(stance['Body ID']).decode('utf-8')[:255])
    #print(body)
    print("tags",tags)
    for tag in tags:
        head_f = filter_pos(head, tag)
        body_f = filter_pos(body, tag)
        #print("hf",head_f)
        #print("bf",body_f)
        if head_f and body_f:
            vect = TfidfVectorizer(min_df=1)
            tfidf = vect.fit_transform([head_f,body_f])
            print(np.shape(tfidf))
            print("tfid",tfidf)
            #print("HELLO")
            #print("INVERSE",tfidf*tfidf.T)
            cosine = (tfidf * tfidf.T).todense().tolist()
            print("cosine",cosine)
            #print(len(cosine))
            if len(cosine) == 2:
                stance_cosine.append(cosine[1][0])
            else:
                stance_cosine.append(0)
        else:
            stance_cosine.append(0)
        print("stance_cosine",stance_cosine)
    named_cosine.append(stance_cosine)
    print("named_cosine",named_cosine)
    i=i+1
    if(i==10):
        break

  0%|                                                | 0/25413 [00:00<?, ?it/s]

tags ['NN']
(2, 25)
tfid   (0, 20)	0.3535533905932738
  (0, 23)	0.3535533905932738
  (0, 6)	0.3535533905932738
  (0, 5)	0.3535533905932738
  (0, 1)	0.3535533905932738
  (0, 21)	0.3535533905932738
  (0, 2)	0.3535533905932738
  (0, 22)	0.3535533905932738
  (1, 19)	0.20851441405707477
  (1, 18)	0.41702882811414954
  (1, 17)	0.41702882811414954
  (1, 3)	0.20851441405707477
  (1, 9)	0.20851441405707477
  (1, 15)	0.20851441405707477
  (1, 16)	0.20851441405707477
  (1, 13)	0.20851441405707477
  (1, 12)	0.20851441405707477
  (1, 11)	0.20851441405707477
  (1, 0)	0.20851441405707477
  (1, 24)	0.20851441405707477
  (1, 4)	0.20851441405707477
  (1, 10)	0.20851441405707477
  (1, 7)	0.20851441405707477
  (1, 8)	0.20851441405707477
  (1, 14)	0.20851441405707477
cosine [[1.0000000000000002, 0.0], [0.0, 0.9999999999999998]]
stance_cosine [0.0]
named_cosine [[0.0]]


  0%|                                        | 1/25413 [00:00<51:00,  8.30it/s]

tags ['NN']
(2, 15)
tfid   (0, 3)	0.5773502691896257
  (0, 5)	0.5773502691896257
  (0, 9)	0.5773502691896257
  (1, 4)	0.17149858514250882
  (1, 11)	0.17149858514250882
  (1, 12)	0.34299717028501764
  (1, 2)	0.17149858514250882
  (1, 13)	0.5144957554275265
  (1, 1)	0.5144957554275265
  (1, 8)	0.34299717028501764
  (1, 0)	0.17149858514250882
  (1, 10)	0.17149858514250882
  (1, 6)	0.17149858514250882
  (1, 14)	0.17149858514250882
  (1, 7)	0.17149858514250882
cosine [[1.0, 0.0], [0.0, 0.9999999999999999]]
stance_cosine [0.0]
named_cosine [[0.0], [0.0]]
tags ['NN']
(2, 17)
tfid   (0, 8)	0.5
  (0, 9)	0.5
  (0, 14)	0.5
  (0, 2)	0.5
  (1, 0)	0.21320071635561044
  (1, 12)	0.21320071635561044
  (1, 16)	0.4264014327112209
  (1, 15)	0.21320071635561044
  (1, 1)	0.21320071635561044
  (1, 13)	0.21320071635561044
  (1, 6)	0.4264014327112209
  (1, 5)	0.21320071635561044
  (1, 10)	0.21320071635561044
  (1, 7)	0.21320071635561044
  (1, 4)	0.21320071635561044
  (1, 11)	0.21320071635561044
  (1, 3)	0.4264

  0%|                                        | 3/25413 [00:00<48:54,  8.66it/s]

tags ['NN']
(2, 20)
tfid   (0, 19)	0.5
  (0, 1)	0.5
  (0, 9)	0.5
  (0, 7)	0.5
  (1, 5)	0.25000000000000006
  (1, 10)	0.25000000000000006
  (1, 13)	0.25000000000000006
  (1, 17)	0.25000000000000006
  (1, 6)	0.25000000000000006
  (1, 16)	0.25000000000000006
  (1, 11)	0.25000000000000006
  (1, 12)	0.25000000000000006
  (1, 3)	0.25000000000000006
  (1, 4)	0.25000000000000006
  (1, 14)	0.25000000000000006
  (1, 0)	0.25000000000000006
  (1, 15)	0.25000000000000006
  (1, 2)	0.25000000000000006
  (1, 8)	0.25000000000000006
  (1, 18)	0.25000000000000006
cosine [[1.0, 0.0], [0.0, 1.0000000000000002]]
stance_cosine [0.0]
named_cosine [[0.0], [0.0], [0.0], [0.0]]
tags ['NN']
(2, 24)
tfid   (0, 19)	0.5
  (0, 0)	0.5
  (0, 21)	0.5
  (0, 6)	0.5
  (1, 1)	0.39223227027636803
  (1, 16)	0.19611613513818402
  (1, 8)	0.19611613513818402
  (1, 17)	0.19611613513818402
  (1, 12)	0.19611613513818402
  (1, 11)	0.19611613513818402
  (1, 2)	0.19611613513818402
  (1, 5)	0.39223227027636803
  (1, 9)	0.19611613513818

  0%|                                        | 5/25413 [00:00<46:10,  9.17it/s]

tags ['NN']
(2, 17)
tfid   (0, 8)	0.5
  (0, 0)	0.5
  (0, 9)	0.5
  (0, 6)	0.5
  (1, 3)	0.27735009811261463
  (1, 12)	0.27735009811261463
  (1, 10)	0.27735009811261463
  (1, 2)	0.27735009811261463
  (1, 13)	0.27735009811261463
  (1, 1)	0.27735009811261463
  (1, 15)	0.27735009811261463
  (1, 16)	0.27735009811261463
  (1, 14)	0.27735009811261463
  (1, 4)	0.27735009811261463
  (1, 11)	0.27735009811261463
  (1, 5)	0.27735009811261463
  (1, 7)	0.27735009811261463
cosine [[1.0, 0.0], [0.0, 1.0000000000000007]]
stance_cosine [0.0]
named_cosine [[0.0], [0.0], [0.0], [0.0], [0.0], [0.0]]
tags ['NN']
(2, 6)
tfid   (0, 0)	0.5773502691896257
  (0, 2)	0.5773502691896257
  (0, 1)	0.5773502691896257
  (1, 4)	0.5773502691896257
  (1, 5)	0.5773502691896257
  (1, 3)	0.5773502691896257
cosine [[1.0, 0.0], [0.0, 1.0]]
stance_cosine [0.0]
named_cosine [[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0]]


  0%|                                        | 7/25413 [00:00<43:45,  9.68it/s]

tags ['NN']
(2, 17)
tfid   (0, 16)	0.47107781233161794
  (0, 1)	0.33517574332792605
  (0, 15)	0.47107781233161794
  (0, 3)	0.47107781233161794
  (0, 5)	0.47107781233161794
  (1, 1)	0.20119467558491574
  (1, 6)	0.2827720964717407
  (1, 7)	0.2827720964717407
  (1, 9)	0.2827720964717407
  (1, 4)	0.2827720964717407
  (1, 10)	0.2827720964717407
  (1, 0)	0.2827720964717407
  (1, 14)	0.2827720964717407
  (1, 2)	0.2827720964717407
  (1, 12)	0.2827720964717407
  (1, 8)	0.2827720964717407
  (1, 13)	0.2827720964717407
  (1, 11)	0.2827720964717407
cosine [[1.0, 0.06743557494279506], [0.06743557494279506, 1.0000000000000007]]
stance_cosine [0.06743557494279506]
named_cosine [[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.06743557494279506]]


  0%|                                        | 8/25413 [00:00<54:30,  7.77it/s]

tags ['NN']
(2, 14)
tfid   (0, 8)	0.5773502691896257
  (0, 10)	0.5773502691896257
  (0, 12)	0.5773502691896257
  (1, 9)	0.26726124191242445
  (1, 5)	0.5345224838248489
  (1, 1)	0.26726124191242445
  (1, 4)	0.26726124191242445
  (1, 6)	0.26726124191242445
  (1, 2)	0.26726124191242445
  (1, 7)	0.26726124191242445
  (1, 11)	0.26726124191242445
  (1, 13)	0.26726124191242445
  (1, 0)	0.26726124191242445
  (1, 3)	0.26726124191242445
cosine [[1.0, 0.0], [0.0, 1.0000000000000007]]
stance_cosine [0.0]
named_cosine [[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.06743557494279506], [0.0]]


  0%|                                        | 9/25413 [00:00<50:51,  8.33it/s]

tags ['NN']
(2, 12)
tfid   (0, 11)	0.5773502691896257
  (0, 7)	0.5773502691896257
  (0, 0)	0.5773502691896257
  (1, 10)	0.23570226039551584
  (1, 1)	0.4714045207910317
  (1, 5)	0.23570226039551584
  (1, 9)	0.23570226039551584
  (1, 4)	0.23570226039551584
  (1, 3)	0.4714045207910317
  (1, 2)	0.4714045207910317
  (1, 6)	0.23570226039551584
  (1, 8)	0.23570226039551584
cosine [[1.0, 0.0], [0.0, 1.0]]
stance_cosine [0.0]
named_cosine [[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.06743557494279506], [0.0], [0.0]]



