In [1]:
import csv, logging, re, nltk

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn import feature_extraction
import tqdm
import pandas as pd

class FeatureData(object):
    def __init__(self, article_file_path, stances_file_path):
        self.number_of_classes = 4
        self.classes = ['agree', 'disagree', 'discuss', 'unrelated']
        self.articles = self._get_articles(article_file_path)  # list of dictionaries
        self.stances = self._get_stances(stances_file_path)
        self.number_of_stances = len(self.stances)
        self.number_of_articles = len(self.articles)

    def get_clean_articles(self):
        """Returns a dictionary with Body ID's as keys and article bodies as values."""
        clean_articles = []
        print('Retrieving clean articles...')

        for item in tqdm.tqdm(self.articles):
            cleaned_article = clean(item['articleBody'])
            tokens = tokenize_text(cleaned_article)
            no_stop_word_tokens = remove_stopwords(tokens)
            lemmatized_tokens = get_tokenized_lemmas(no_stop_word_tokens)
            clean_articles.append({'articleBody': ' '.join(lemmatized_tokens),
                                   'Body ID': item['Body ID']})
        return {article['Body ID']: article['articleBody'] for article in clean_articles}

    #We need the stop words for POS tagging to work propperly
    def get_original_articles(self):
        clean_articles = []
        print('Retrieving original articles...')
        for item in tqdm.tqdm(self.articles):
            #cleaned_article = clean(item['articleBody'])
            cleaned_article = item['articleBody'].encode('ascii', 'ignore')
            clean_articles.append({'articleBody':cleaned_article,
                                   'Body ID': item['Body ID']})
        return {article['Body ID']: article['articleBody'] for article in clean_articles}

    def get_clean_stances(self):
        """Retrieves a list of dictionaries containing the fully cleaned Headlines and the Body ID and Stance for
        each headline."""
        clean_headlines = []
        print('Retrieving clean stances...')

        for item in tqdm.tqdm(self.stances):
            cleaned_headline = clean(item['Headline'])
            tokens = tokenize_text(cleaned_headline)
            no_stop_word_tokens = remove_stopwords(tokens)
            lemmatized_tokens = get_tokenized_lemmas(no_stop_word_tokens)
            clean_headlines.append({'Headline': ' '.join(lemmatized_tokens),
                                    'originalHeadline': cleaned_headline,
                                    'Body ID': item['Body ID'],
                                    'Stance': item['Stance']})

        return clean_headlines

    def _get_articles(self, path):
        # Body ID, articleBody
        articles = []
        dt=pd.read_csv(path)
        articles=dt.T.to_dict().values()
        return articles

    def _get_stances(self, path):
        # Headline, Body ID, Stance
        stances = []
        dt1=pd.read_csv(path)
        stances=dt1.T.to_dict().values()
        return stances


def normalize_word(w):
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(w).lower()


def clean(text):
    return " ".join(re.findall(r'\w+', text, flags=re.UNICODE)).lower()


def tokenize_text(text):
    return [token for token in word_tokenize(text)]


def remove_stopwords(list_of_tokens):
    return [word for word in list_of_tokens if word not in feature_extraction.text.ENGLISH_STOP_WORDS]


def get_tokenized_lemmas(tokens):
    return [normalize_word(token) for token in tokens]


In [2]:
feature_data = FeatureData('C:/Users/binni/Downloads/NLP-Fake-News-Challenge-master/NLP-Fake-News-Challenge-master/data/competition_test_bodies.csv', 'C:/Users/binni/Downloads/NLP-Fake-News-Challenge-master/NLP-Fake-News-Challenge-master/data/competition_test_stances.csv')

In [18]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
from nltk.stem import PorterStemmer
from nltk import word_tokenize, pos_tag, ne_chunk, sent_tokenize

In [4]:
from gensim import models
from gensim.models.phrases import Phraser
from sklearn.metrics.pairwise import cosine_similarity



In [8]:
_max_ngram_size=3
_refuting_words = ['fake', 'fraud', 'hoax', 'false', 'deny', 'denies', 'not', 'despite', 'nope', 'doubt','doubts', 'bogus', 'debunk', 'pranks', 'retract']

In [5]:
_stances=feature_data.get_clean_stances()
_articles=feature_data.get_clean_articles()
_original_articles=feature_data.get_original_articles()


Retrieving clean stances...


100%|███████████████████████████████████| 25413/25413 [00:56<00:00, 446.66it/s]


Retrieving clean articles...


100%|████████████████████████████████████████| 904/904 [00:19<00:00, 18.96it/s]


Retrieving original articles...


100%|█████████████████████████████████████| 904/904 [00:00<00:00, 32284.48it/s]


In [None]:
ngrams = []
for stance in tqdm.tqdm(_stances):
    stance_vectorizer = CountVectorizer(input=stance['Headline'], ngram_range=(1, _max_ngram_size),binary=True)
    stance_vectorizer.fit_transform([stance['Headline']]).toarray()
    vocab = stance_vectorizer.get_feature_names()
    vectorizer = CountVectorizer(input=_articles[stance['Body ID']], vocabulary=vocab,ngram_range=(1, _max_ngram_size))
    ngram_counts = vectorizer.fit_transform([_articles[stance['Body ID']]]).toarray()
    features = vectorizer.get_feature_names()
    aggregated_counts = [0 for _ in range(_max_ngram_size)]
    for index in np.nditer(np.nonzero(ngram_counts[0]), ['zerosize_ok']):
        aggregated_counts[len(features[index].split()) - 1] += ngram_counts[0][index]
    standardized_counts = [1.0*count/len(stance['Headline'].split()) for count in aggregated_counts]
    ngrams.append(standardized_counts)

        

In [None]:
print(_original_articles[1])

In [None]:
feature_names=[]
if True:
    print('Retrieving headline ngrams...')
    ngrams = np.array(ngrams)
    features.append(ngrams)
    ngram_headings = [('ngram_' + str(count)) for count in range(1,_max_ngram_size + 1)]
    feature_names.append(ngram_headings)
    print(feature_names)
    print(ngram_headings)

In [7]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
all_words = []; atricle_words = []
for stance in tqdm.tqdm(_stances):
    if stance['Stance'] == 'unrelated':
        pass
    body_words = []; headline_words = []
    headline = tokenizer.tokenize(stance['originalHeadline'])
    body = (tokenizer.tokenize(_original_articles[stance['Body ID']].decode('utf-8')))[:4]
    for s in headline:
        s = word_tokenize(s)
        headline_words = headline_words + s
        all_words.append(s)
    for s in body:
        s = word_tokenize(s)
        body_words = body_words + s
        all_words.append(s)
    atricle_words.append([headline_words, body_words])

model = models.Word2Vec(all_words, size=100, min_count=1)
cosine_similarities = []
        # Generate sentence vectors and computer cosine similarity
for headline, body in atricle_words:
    h_vector = sum([model.wv[word] for word in headline])
    b_vector = sum([model.wv[word] for word in body])
    cosine_similarities.append(cosine_similarity(h_vector.reshape(1,-1), b_vector.reshape(1,-1)))

   


100%|████████████████████████████████████| 25413/25413 [05:08<00:00, 82.29it/s]


In [11]:
features = []
i=0
for stance in tqdm.tqdm(_stances):
    count = [1 if refute_word in stance['Headline'] else 0 for refute_word in _refuting_words]
    features.append(count)
    

100%|█████████████████████████████████| 25413/25413 [00:00<00:00, 55447.77it/s]


In [17]:
_refuting_words = ['fake', 'fraud', 'hoax', 'false', 'deny', 'denies', 'not','despite', 'nope', 'nowhere', 'doubt', 'doubts', 'bogus', 'debunk', 'pranks','retract', 'nothing', 'never', 'none', 'budge']

def determine_polarity(text):
    tokens = tokenize_text(text)
    return sum([token in _refuting_words for token in tokens]) % 2

polarities = []
for stance in tqdm.tqdm(_stances):
    headline_polarity = determine_polarity(stance['Headline'])

    body_polarity = determine_polarity(_articles.get(stance['Body ID']))

    polarities.append([headline_polarity, body_polarity])

100%|████████████████████████████████████| 25413/25413 [03:46<00:00, 53.81it/s]


In [19]:
stemmer = PorterStemmer()
def get_tags(text):
    return pos_tag(word_tokenize(text.encode('ascii', 'ignore')))
def filter_pos(named_tags, tag):
    return " ".join([stemmer.stem(name[0]) for name in named_tags if name[1].startswith(tag)])
named_cosine = []
tags = ["NN"]
for stance in tqdm.tqdm(_stances):
    print(stance)
    stance_cosine = []
    head = get_tags(stance['originalHeadline'])
    print(head)
    body = get_tags(_original_articles.get(stance['Body ID'])[:255])
    print(body)
    print("tags",tags)
    for tag in tags:
        head_f = filter_pos(head, tag)
        body_f = filter_pos(body, tag)
        print("hf",head_f)
        print("bf",body_f)
        if head_f and body_f:
            vect = TfidfVectorizer(min_df=1)
            tfidf = vect.fit_transform([head_f,body_f])
            print(tfidf)
            cosine = (tfidf * tfidf.T).todense().tolist()
            print(cosine)
            if len(cosine) == 2:
                stance_cosine.append(cosine[1][0])
            else:
                stance_cosine.append(0)
        else:
            stance_cosine.append(0)
        print(stance_cosine)
    named_cosine.append(stance_cosine)
    print(named_cosine)
    break

  0%|                                                | 0/25413 [00:00<?, ?it/s]

{'Headline': 'ferguson riot pregnant woman loses eye cop bean bag round car window', 'originalHeadline': 'ferguson riots pregnant woman loses eye after cops fire bean bag round through car window', 'Body ID': 2008, 'Stance': 'unrelated'}





NameError: name 'pos_tag' is not defined