In [1]:
import csv, logging, re, nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn import feature_extraction
import tqdm
import pandas as pd

class FeatureData(object):
    def __init__(self, article_file_path, stances_file_path):
        self.number_of_classes = 4
        self.classes = ['agree', 'disagree', 'discuss', 'unrelated']
        self.articles = self._get_articles(article_file_path)  # list of dictionaries
        self.stances = self._get_stances(stances_file_path)
        self.number_of_stances = len(self.stances)
        self.number_of_articles = len(self.articles)
        
        

    def get_clean_articles(self):
        clean_articles = []
        print('Retrieving clean articles...')

        for item in tqdm.tqdm(self.articles):
            cleaned_article = clean(item['articleBody'])
            tokens = tokenize_text(cleaned_article)
            no_stop_word_tokens = remove_stopwords(tokens)
            lemmatized_tokens = get_tokenized_lemmas(no_stop_word_tokens)
            clean_articles.append({'articleBody': ' '.join(lemmatized_tokens),
                                   'Body ID': item['Body ID']})
        return {article['Body ID']: article['articleBody'] for article in clean_articles}

    #We need the stop words for POS tagging to work propperly
    def get_original_articles(self):
        clean_articles = []
        print('Retrieving original articles...')
        for item in tqdm.tqdm(self.articles):
            #cleaned_article = clean(item['articleBody'])
            cleaned_article = item['articleBody'].encode('ascii', 'ignore')
            clean_articles.append({'articleBody':cleaned_article,
                                   'Body ID': item['Body ID']})
        return {article['Body ID']: article['articleBody'] for article in clean_articles}

    def get_clean_stances(self):
        clean_headlines = []
        print('Retrieving clean stances...')

        for item in tqdm.tqdm(self.stances):
            cleaned_headline = clean(item['Headline'])
            tokens = tokenize_text(cleaned_headline)
            no_stop_word_tokens = remove_stopwords(tokens)
            lemmatized_tokens = get_tokenized_lemmas(no_stop_word_tokens)
            clean_headlines.append({'Headline': ' '.join(lemmatized_tokens),
                                    'originalHeadline': cleaned_headline,
                                    'Body ID': item['Body ID'],
                                    'Stance': item['Stance']})

        return clean_headlines

    def _get_articles(self, path):
        # Body ID, articleBody
        articles = []
        dt=pd.read_csv(path)
        articles=dt.T.to_dict().values()
        return articles

    def _get_stances(self, path):
        # Headline, Body ID, Stance
        stances = []
        dt1=pd.read_csv(path)
        stances=dt1.T.to_dict().values()
        return stances


def normalize_word(w):
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(w).lower()


def clean(text):
    return " ".join(re.findall(r'\w+', text, flags=re.UNICODE)).lower()


def tokenize_text(text):
    return [token for token in word_tokenize(text)]


def remove_stopwords(list_of_tokens):
    return [word for word in list_of_tokens if word not in feature_extraction.text.ENGLISH_STOP_WORDS]


def get_tokenized_lemmas(tokens):
    return [normalize_word(token) for token in tokens]


In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
from nltk.stem import PorterStemmer
from nltk import word_tokenize, pos_tag, ne_chunk, sent_tokenize
from gensim import models
from gensim.models.phrases import Phraser
from sklearn.metrics.pairwise import cosine_similarity
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.chunk import tree2conlltags
import textacy as textacy
from textacy.doc import Doc
from textacy.extract import direct_quotations
import spacy
en=textacy.load_spacy('en_core_web_sm')



In [None]:
class FeatureGenerator(object):


    def __init__(self, clean_articles, clean_stances, original_articles, load_data=True):
        self._articles = clean_articles  # dictionary {article ID: body}
        self._original_articles = original_articles
        self._stances = clean_stances  # list of dictionaries
        self._max_ngram_size = 3
        self._refuting_words = ['fake', 'fraud', 'hoax', 'false', 'deny', 'denies', 'not', 'despite', 'nope', 'doubt',
                                'doubts', 'bogus', 'debunk', 'pranks', 'retract']
    def get_features_from_file(features_directory, use=[]):
        features = []
        feature_names = []
        for feature_csv in os.listdir(features_directory):
            for feature in use:
                if np.count_nonzero([feature_csv.startswith(feature)]):
                    ##print(feature)
                    with open(os.path.join(features_directory, feature_csv)) as f:
                        content = np.loadtxt(fname=f, delimiter=',', skiprows=1)
                        del_indices = []
                        i=0
                        if len(content.shape) == 1:
                            content = content.reshape(content.shape[0], 1)
                            feature_names.append(basename(feature) + str(0))
                        else:
                            for i in range(len(content)):
                                if i in use[feature]:
                                    feature_names.append(basename(feature) + str(i))
                                else:
                                    del_indices.append(i)
                            content = np.delete(content, del_indices, 1)
                       # print(feature_names)
                        features.append(content)
                       # print(features)
        test = np.concatenate(features, axis=1)
        return test, feature_names
        
    
    def get_features(self, features_directory="features"):
        feature_names = []
        features = []
        if True:
            print('Retrieving headline ngrams...')
            ngrams = np.array(self._get_ngrams())
            features.append(ngrams)
            ngram_headings = [('ngram_' + str(count)) for count in range(1, self._max_ngram_size + 1)]
            feature_names.append(ngram_headings)
            self._feature_to_csv(ngrams, ngram_headings,"C:/Users/binni/minor/"+features_directory+'/ngrams.csv')

        if True:
            print('Retrieving word2Vec...')
            word2Vec = np.array(self._get_word2vec()).reshape(len(self._stances), 1)
            features.append(word2Vec)
            feature_names.append("word2Vec")
            self._feature_to_csv(word2Vec, ["word2Vec"], "C:/Users/binni/minor/"+features_directory + '/word2Vec.csv')

        if True:
            print('Retrieving refuting words...')
            refuting = np.array(self._get_refuting_words())
            features.append(refuting)
            [feature_names.append(word + '_refuting') for word in self._refuting_words]
            self._feature_to_csv(refuting, self._refuting_words, "C:/Users/binni/minor/"+features_directory+'/refuting.csv')

        if True:
            print('Retrieving polarity...')
            polarity = np.array(self._polarity_feature())
            features.append(polarity)
            feature_names.append('headline_polarity')
            feature_names.append('article_polarity')
            self._feature_to_csv(polarity, ['headline_polarity', 'article_polarity'],"C:/Users/binni/minor/"+ features_directory+'/polarity.csv')
        if True:
            print('Retrieving named entity cosine...')
            named_cosine = np.array(self._named_entity_feature()).reshape(len(self._stances), 1)
            features.append(named_cosine)
            feature_names.append('named_cosine')
            self._feature_to_csv(named_cosine, ['named_cosine'],"C:/Users/binni/minor/"+ features_directory+'/named_cosine.csv')

        if True:
            print('Retrieving VADER...')
            vader = np.array(self._vader_feature()).reshape(len(self._stances), 2)
            features.append(vader)
            feature_names.append('vader_pos')
            feature_names.append('vader_neg')
            self._feature_to_csv(vader, ['vader'],"C:/Users/binni/minor/"+ features_directory+'/vader.csv')

        if True:
            print('Retrieving jaccard similarities...')
            jaccard = np.array(self._get_jaccard_similarity()).reshape(len(self._stances), 1)
            features.append(jaccard)
            feature_names.append('jaccard_similarity')
            self._feature_to_csv(jaccard, ['jaccard_similarity'],"C:/Users/binni/minor/"+ features_directory+'/jaccard_similarity.csv')

        if True:
            print('Retrieving quote analysis...')
            quotes = np.array(self._get_quotes()).reshape(len(self._stances), 1)
            features.append(quotes)
            feature_names.append('quote_analysis')
            self._feature_to_csv(quotes, ['quote_analysis'],"C:/Users/binni/minor/"+features_directory+'/quote_analysis.csv')

        if True:
            lengths = np.array(self._length_feature()).reshape(len(self._stances), 1)
            features.append(lengths)
            feature_names.append('lengths')
            self._feature_to_csv(lengths, ['lengths'], "C:/Users/binni/minor/"+features_directory + '/lengths.csv')

    def _feature_to_csv(self, feature, feature_headers, output_path):
        header = ','.join(feature_headers)
        np.savetxt(fname=output_path, X=feature, delimiter=',', header=header, comments='')

    def _get_ngrams(self):
        ngrams = []

        for stance in tqdm.tqdm(self._stances):
            # Retrieves the vocabulary of ngrams for the headline.
            stance_vectorizer = CountVectorizer(input=stance['Headline'], ngram_range=(1, self._max_ngram_size),
                                                binary=True)
            stance_vectorizer.fit_transform([stance['Headline']]).toarray()

            # Search the article text and count headline ngrams.
            vocab = stance_vectorizer.get_feature_names()
            vectorizer = CountVectorizer(input=self._articles[stance['Body ID']], vocabulary=vocab,
                                         ngram_range=(1, self._max_ngram_size))
            ngram_counts = vectorizer.fit_transform([self._articles[stance['Body ID']]]).toarray()
            features = vectorizer.get_feature_names()

            aggregated_counts = [0 for _ in range(self._max_ngram_size)]

            # Create a list of the aggregated counts of each ngram size.
            for index in np.nditer(np.nonzero(ngram_counts[0]), ['zerosize_ok']):
                aggregated_counts[len(features[index].split()) - 1] += ngram_counts[0][index]

            # attempt to standardize ngram counts across headlines and bodies of varying length by dividing total
            # ngram hits by the length of the headline. These will need to be normalized later so they lie
            # between 0 and 1.
            standardized_counts = [1.0*count/len(stance['Headline'].split()) for count in aggregated_counts]

            ngrams.append(standardized_counts)
            #print ngrams

        return ngrams

    def _get_word2vec(self):
        # Gather sentences
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        all_words = []; atricle_words = []

        for stance in tqdm.tqdm(self._stances):
            if stance['Stance'] == 'unrelated':
                pass
            body_words = []; headline_words = []
            headline = tokenizer.tokenize(stance['originalHeadline'])
            body = (tokenizer.tokenize(self._original_articles[stance['Body ID']].decode('utf-8')))[:4]
            for s in headline:
                s = word_tokenize(s)
                headline_words = headline_words + s
                all_words.append(s)
            for s in body:
                s = word_tokenize(s)
                body_words = body_words + s
                all_words.append(s)
            atricle_words.append([headline_words, body_words])

        # Train Word2Vec
        model = models.Word2Vec(all_words, size=100, min_count=1)

        cosine_similarities = []
        # Generate sentence vectors and computer cosine similarity
        for headline, body in atricle_words:
            h_vector = sum([model.wv[word] for word in headline])
            b_vector = sum([model.wv[word] for word in body])
            cosine_similarities.append(cosine_similarity(h_vector.reshape(1,-1), b_vector.reshape(1,-1)))

        return cosine_similarities

    def _get_refuting_words(self):
        """ Retrieves headlines of the articles and indicates a count of each of the refuting words in the headline.
        Returns a list containing the number of refuting words found (at lease once) in the headline. """

        features = []

        for stance in tqdm.tqdm(self._stances):
            # print "[DEBUG] stance ", stance
            count = [1 if refute_word in stance['Headline'] else 0 for refute_word in self._refuting_words]
            # print "[DEBUG] count ", count
            features.append(count)
        # print "[DEBUG] features", features
        return features

    def _polarity_feature(self):
        _refuting_words = ['fake', 'fraud', 'hoax', 'false', 'deny', 'denies', 'not',
                           'despite', 'nope', 'nowhere', 'doubt', 'doubts', 'bogus', 'debunk', 'pranks',
                           'retract', 'nothing', 'never', 'none', 'budge']

        def determine_polarity(text):
            tokens = tokenize_text(text)
            return sum([token in _refuting_words for token in tokens]) % 2

        polarities = []
        for stance in tqdm.tqdm(self._stances):
            headline_polarity = determine_polarity(stance['Headline'])
            body_polarity = determine_polarity(self._articles.get(stance['Body ID']))
            polarities.append([headline_polarity, body_polarity])

        return polarities

    def _named_entity_feature(self):
        """ Retrieves a list of Named Entities from the Headline and Body.
        Returns a list containing the cosine similarity between the counts of the named entities """
        stemmer = PorterStemmer()
        def get_tags(text):
            return pos_tag(word_tokenize(text))

        def filter_pos(named_tags, tag):
            return " ".join([stemmer.stem(name[0]) for name in named_tags if name[1].startswith(tag)])

        named_cosine = []
        tags = ["NN"]
        for stance in tqdm.tqdm(self._stances):
            stance_cosine = []
            head = get_tags(stance['originalHeadline'])
            body = get_tags(self._original_articles.get(stance['Body ID']).decode('utf-8')[:255])

            for tag in tags:
                head_f = filter_pos(head, tag)
                body_f = filter_pos(body, tag)

                if head_f and body_f:
                    vect = TfidfVectorizer(min_df=1)
                    tfidf = vect.fit_transform([head_f,body_f])
                    cosine = (tfidf * tfidf.T).todense().tolist()
                    if len(cosine) == 2:
                        stance_cosine.append(cosine[1][0])
                    else:
                        stance_cosine.append(0)
                else:
                    stance_cosine.append(0)
            named_cosine.append(stance_cosine)
        return named_cosine

    def _vader_feature(self):
        sid = SentimentIntensityAnalyzer()
        features = []

        for stance in tqdm.tqdm(self._stances):
            headVader = sid.polarity_scores(stance["Headline"])
            bodyVader = sid.polarity_scores(sent_tokenize(self._original_articles.get(stance['Body ID']).decode('utf-8'))[0])
            features.append(abs(headVader['pos']-bodyVader['pos']))
            features.append(abs(headVader['neg']-bodyVader['neg']))
        return features

    def _get_jaccard_similarity(self):
        """ Get the jaccard similarities for each headline and article body pair. Jaccard similarity is defined as
        J(A, B) = |A intersect B| / |A union B|. Try to normalize by only considering the first"""
        similarities = []
        for stance in tqdm.tqdm(self._stances):
            headline = set(stance['Headline'].split())
            body = set(self._articles.get(stance['Body ID']).split()[:255])
            jaccard = float(len(headline.intersection(body))) / len(headline.union(body))
            similarities.append(jaccard)

        return similarities

    def _get_quotes(self):
        quote_count = []
        for stance in tqdm.tqdm(self._stances):
            body = self._original_articles.get(stance['Body ID']).decode('utf-8', 'replace')
            doc = Doc(content=body,lang=en)
            quotes = direct_quotations(doc)
            quote_counter = 0

            for q in quotes:
                quote_counter = quote_counter + len(q[2])
            quote_counter = quote_counter / len(body)
            quote_count.append(quote_counter)

        return quote_count

    def _length_feature(self):
        lengths = []
        for stance in tqdm.tqdm(self._stances):
            lengths.append(len(self._original_articles.get(stance['Body ID'])))
        return lengths

    def _get_punctuation_frequency(self):
        frequencies = []

        for stance in tqdm.tqdm(self._stances):
            question_marks = 0
            exclamation_marks = 0
            article_body = self._original_articles[stance['Body ID']]

            for character in article_body:
                if character == '?':
                    question_marks += 1
                elif character == '!':
                    exclamation_marks += 1

            frequency = (question_marks + exclamation_marks) / len(article_body.split())
            frequencies.append(frequency)

        return frequencies


In [None]:
feature_data = FeatureData('C:\\Users\\binni\\minor\\data\\train_bodies.csv', 'C:\\Users\\binni\\minor\\data\\train_stances.csv')
feature_generator = FeatureGenerator(feature_data.get_clean_articles(), feature_data.get_clean_stances(), feature_data.get_original_articles())


Retrieving clean articles...


100%|██████████████████████████████████████| 1683/1683 [00:50<00:00, 33.16it/s]


Retrieving clean stances...


 34%|███████████▊                       | 16929/49972 [00:31<00:59, 556.17it/s]

In [None]:
features = feature_generator.get_features("feature_generator")

In [5]:
class Model(object):
    def __init__(self, modelType, features):
        self._stance_map = {'unrelated': 0, 'discuss': 1, 'agree': 2, 'disagree': 3}
        self._model_type = modelType
        self._features_for_X1 = features
        self._feature_col_names = []

    def get_data(self, body_file, stance_file, features_directory):
        feature_data = FeatureData(body_file, stance_file)
        X_train, self._feature_col_names = FeatureGenerator.get_features_from_file(use=self._features_for_X1,
                                                        features_directory=features_directory)

        y_train = np.asarray([self._stance_map[stance['Stance']] for stance in feature_data.stances])

        
        min_max_scaler = preprocessing.MinMaxScaler()
        X_train = min_max_scaler.fit_transform(X_train)

        return {'X':X_train, 'y':y_train}
    def get_trained_classifier(self, X_train, y_train):
        
        if self._model_type == 'svm':
            classifier = svm.SVC(decision_function_shape='ovr', cache_size=1000)
        elif self._model_type == 'nn':
            classifier = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(30,), random_state=1)

        classifier.fit(X_train, y_train)
        return classifier
    def test_classifier(self, classifier, X_test):
        return classifier.predict(X_test)


In [64]:
X1_features = {
    #'refuting': [0,2,3,8,12,13],
    'ngrams': [0, 1, 2],
    #'polarity': [0],
    'named': [],
    #'vader': [0,1],
    'jaccard': [],
    'quote_analysis': [],
    'lengths': [],
    'punctuation_frequency': [],
    'word2Vec': []
}
X2_features = {
    #'refuting': [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],
    'ngrams': [1],
    'polarity': [1],
    #'named': [],
    #'vader': [0,1],
    #'jaccard': [],
    'quote_analysis': [],
    'lengths': [],
    'punctuation_frequency': [],
    #'word2Vec': []
}

model1_type = 'nn'
model2_type = 'nn'
doStratify = False
doKfold = False
numFolds = 10

In [7]:
def stratify(X, y):

  
    disagree_indices = np.where(y == 3)[0]
    agree_indices = np.where(y == 2)[0]
    discuss_indices = np.where(y == 1)[0]
    unrelated_indices = np.where(y == 0)[0]

    num_disagree = disagree_indices.shape[0]

   
    reduced_agree_indices = agree_indices[:len(agree_indices)]
    reduced_discuss_indices = discuss_indices[:len(discuss_indices)]
    reduced_unrelated_indices = unrelated_indices[:(num_disagree + len(agree_indices) + len(discuss_indices))]
    X_stratified = np.concatenate([X[disagree_indices], X[reduced_agree_indices], X[reduced_discuss_indices],
                                   X[reduced_unrelated_indices]], axis=0)
    y_stratified = np.concatenate([y[disagree_indices], y[reduced_agree_indices], y[reduced_discuss_indices],
                                   y[reduced_unrelated_indices]], axis=0)

    return {'X': X_stratified, 'y': y_stratified}

In [8]:
def map_stances(y):
    stance_map = {0: 'unrelated', 1: 'discuss', 2: 'agree', 3: 'disagree'}
    return [stance_map.get(key) for key in y]

In [48]:
from os.path import basename
from sklearn import svm, preprocessing
import os, re, string, tqdm, nltk
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold

In [10]:
model1 = Model(model1_type, X1_features)
model2 = Model(model2_type, X2_features)

In [11]:

train1 = model1.get_data('C:/Users/binni/minor/data/train_bodies.csv', 'C:/Users/binni/minor/data/train_stances.csv', 'features')
test1  = model1.get_data('C:/Users/binni/minor/data/competition_test_bodies.csv', 'C:/Users/binni/minor/data/competition_test_stances.csv', 'test_features')
train2 = model2.get_data('C:/Users/binni/minor/data/train_bodies.csv', 'C:/Users/binni/minor/data/train_stances.csv', 'features')
test2  = model2.get_data('C:/Users/binni/minor/data/competition_test_bodies.csv', 'C:/Users/binni/minor/data/competition_test_stances.csv', 'test_features')



In [12]:
def split_data(data1, data2, doStratify):
    X1 = data1['X']; X2 = data2['X']
    y1 = data1['y']; y2 = data2['y']

    if doStratify:
        stratified = stratify(X1, y1)
        X1 = stratified['X']
        y1 = stratified['y']
        X2 = stratified['X']
        y2 = stratified['y']

    return X1, y1, X2, y2


In [13]:
X1_train, y1_train, X1_test, y1_test = split_data(train1, test1, doStratify)
X2_train, y2_train, X2_test, y_test = split_data(train2, test2, doStratify)

In [14]:
y1_train1 = [int(s != 0) for s in y1_train]

In [16]:
y1_train

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


In [15]:
X2_train_filtered = X2_train[np.nonzero(y1_train1)]
y2_train_filtered = y2_train[np.nonzero(y1_train1)]

In [16]:
X2_train_filtered

array([[0.12240453, 0.00574713, 1.        , 0.        , 0.68661899],
       [0.08280093, 0.03448276, 0.        , 0.        , 0.        ],
       [0.09833745, 0.02955665, 0.        , 0.        , 0.        ],
       ...,
       [0.10022506, 0.06896552, 0.        , 0.06620763, 0.32223034],
       [0.12476405, 0.01253918, 0.        , 0.        , 0.05381519],
       [0.02334108, 0.03448276, 0.        , 0.27901786, 0.        ]])

In [17]:
clf1 = model1.get_trained_classifier(X1_train, y1_train)

In [18]:
clf2 = model2.get_trained_classifier(X2_train_filtered, y2_train_filtered)

In [19]:
y_predicted  = model1.test_classifier(clf1, X1_test)

In [20]:
y_predicted

array([1, 1, 1, ..., 0, 0, 0])

In [21]:
y2_predicted = model2.test_classifier(clf2, X2_test)

In [22]:
tmp_test = map_stances([int(s != 0) for s in y_test])
tmp_predicted = map_stances(y_predicted)

In [23]:
tmp_predicted

['discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'unrelated',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'unrelated',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'agree',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'unrelated',
 'unrelated',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'unrelated',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'unrelated',
 'unrelated',
 'discuss',
 'discuss',
 'discuss',
 'di

In [24]:
LABELS = ['agree', 'disagree', 'discuss', 'unrelated']
LABELS_RELATED = ['unrelated','related']
RELATED = LABELS[0:3]

In [25]:
def print_confusion_matrix(cm):
    lines = []
    header = "|{:^11}|{:^11}|{:^11}|{:^11}|{:^11}|".format('', *LABELS)
    line_len = len(header)
    lines.append("-"*line_len)
    lines.append(header)
    lines.append("-"*line_len)

    hit = 0
    total = 0
    for i, row in enumerate(cm):
        hit += row[i]
        total += sum(row)
        lines.append("|{:^11}|{:^11}|{:^11}|{:^11}|{:^11}|".format(LABELS[i],
                                                                   *row))
        lines.append("-"*line_len)
    print('\n'.join(lines))

In [26]:
def score_submission(gold_labels, test_labels):
    score = 0.0
    cm = [[0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0]]

    for i, (g, t) in enumerate(zip(gold_labels, test_labels)):
       # print(i,g,t)
        g_stance, t_stance = g, t
        if g_stance == t_stance:
            score += 0.25
            if g_stance != 'unrelated':
                score += 0.50
        if g_stance in RELATED and t_stance in RELATED:
            score += 0.25

        cm[LABELS.index(g_stance)][LABELS.index(t_stance)] += 1

    return score, cm

In [27]:
def report_score(actual, predicted):
    score, cm = score_submission(actual, predicted)
    ##print(score,cm)
    best_score, _ = score_submission(actual, actual)

    print_confusion_matrix(cm)
    print("Score: " +str(score) + " out of " + str(best_score) + "\t("+str(score*100/best_score) + "%)")
    return score*100/best_score


In [50]:
def precision(actual, predicted, stance_map):
    pairs = zip(actual, predicted)
    print("Precision")
    scores = {stance: None for stance in stance_map.items()}
    for stance, index in stance_map.items():
        truePositive = np.count_nonzero([x[1] == index for x in pairs if x[0] == index])
        falsePositive = np.count_nonzero([x[1] == index for x in pairs if x[0] != index])
        try:
            precision = 100 * float(truePositive) / (truePositive + falsePositive + 1)
            scores[stance] = precision
            #print(stance + ": " + str(precision))
        except ZeroDivisionError:
            pass
           # print("Zero")

    return scores

In [51]:
def recall(actual, predicted, stance_map):
    print("Recall")
    pairs = zip(actual, predicted)
    scores = {stance: None for stance in stance_map.items()}
    for stance, index in stance_map.items():
        truePositive = np.count_nonzero([x[1] == index for x in pairs if x[0] == index])
        falseNegative = np.count_nonzero([x[1] != index for x in pairs if x[0] == index])
        try:
            recall = 100 * float(truePositive) / (truePositive + falseNegative + 1)
            scores[stance] = recall
            #print(stance + ": " + str(recall))
        except ZeroDivisionError:
            pass
            #print("Zero")

    return scores


In [53]:
def accuracy(actual, predicted, stance_map):
    print("Accuracy")
    pairs = zip(actual, predicted)
    scores = {stance: None for stance in stance_map.items()}
    for stance, index in stance_map.items():
        accurate = np.count_nonzero([x[1] == index and x[1] == x[0] for x in pairs])
        total = np.count_nonzero([x[0] == index for x in pairs])
        try:
            accuracy = 100 * float(accurate)/total
            scores[stance] = accuracy
            #print(stance + ": " + str(accuracy))
        except ZeroDivisionError:
           # print("Zero"
            pass

    return scores

In [29]:
tmp_competition_score = report_score(tmp_test, tmp_predicted)

-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |     0     |     0     |     0     |     0     |
-------------------------------------------------------------
| disagree  |     0     |     0     |     0     |     0     |
-------------------------------------------------------------
|  discuss  |    333    |     5     |   12562   |    527    |
-------------------------------------------------------------
| unrelated |     0     |     0     |    326    |   13101   |
-------------------------------------------------------------
Score: 15921.75 out of 16783.75	(94.86407983913011%)


In [41]:
for i, stance in enumerate(y_predicted):
    print(i,stance)

0 1
1 1
2 1
3 1
4 1
5 1
6 1
7 1
8 1
9 1
10 1
11 1
12 1
13 1
14 1
15 1
16 1
17 1
18 1
19 1
20 1
21 1
22 1
23 1
24 1
25 1
26 1
27 1
28 0
29 1
30 1
31 1
32 1
33 1
34 1
35 1
36 1
37 1
38 1
39 1
40 1
41 1
42 1
43 1
44 1
45 1
46 1
47 1
48 1
49 1
50 1
51 1
52 1
53 1
54 1
55 1
56 1
57 1
58 0
59 0
60 1
61 1
62 1
63 1
64 1
65 1
66 1
67 1
68 1
69 1
70 0
71 1
72 1
73 1
74 1
75 1
76 1
77 0
78 1
79 1
80 1
81 1
82 1
83 1
84 1
85 1
86 1
87 1
88 1
89 1
90 1
91 1
92 1
93 1
94 1
95 1
96 1
97 1
98 0
99 1
100 1
101 0
102 1
103 1
104 1
105 1
106 1
107 1
108 1
109 1
110 1
111 1
112 1
113 1
114 1
115 1
116 1
117 1
118 1
119 1
120 1
121 1
122 1
123 1
124 1
125 1
126 1
127 1
128 1
129 1
130 1
131 1
132 1
133 1
134 1
135 1
136 1
137 1
138 1
139 1
140 1
141 1
142 1
143 1
144 1
145 1
146 1
147 1
148 1
149 1
150 1
151 1
152 1
153 1
154 1
155 1
156 1
157 1
158 1
159 1
160 1
161 1
162 1
163 1
164 1
165 1
166 1
167 1
168 1
169 1
170 1
171 1
172 1
173 1
174 1
175 1
176 1
177 1
178 1
179 1
180 0
181 1
182 1
183 0
184 1


KeyboardInterrupt: 

In [46]:
y2_predicted[73]

1

In [56]:
for i, stance in enumerate(y_predicted):
    if stance != 0:
        y_predicted[i] = y2_predicted[i]
precision(y_test, y_predicted, model1._stance_map)
recall(y_test, y_predicted, model1._stance_map)
accuracy(y_test, y_predicted, model1._stance_map)

Precision
Recall
Accuracy


{('agree', 2): None,
 ('disagree', 3): None,
 ('discuss', 1): None,
 ('unrelated', 0): None}

In [88]:
def plot_coefficients(classifier, feature_names, i, k):
    top_features=len(feature_names)/2
    coef = classifier.coefs_[0]

    top_positive_coefficients = np.argsort(coef)[-top_features:]
    top_negative_coefficients = np.argsort(coef)[:top_features]
    top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])

   
    plt.figure(figsize=(30, 20))
    colors = ['#cccccc' if c < 0 else 'teal' for c in coef[top_coefficients]]
    plt.bar(np.arange(2 * top_features), coef[top_coefficients], color=colors)
    feature_names = np.array(feature_names)
    plt.xticks(np.arange(0, 1 + 2 * top_features), feature_names[top_coefficients], rotation='70')
    plt.savefig("graphs/plot-NN_model" + str(i) + "_kfold" + str(k) + ".png")

In [46]:
def kfold_system(X1_features, X2_features, doStratify, numFolds, m1_type, m2_type):
    # init models
    model1 = Model(m1_type, X1_features)
    model2 = Model(m2_type, X2_features)

    # Get training and testing data
    data = model1.get_data('C:/Users/binni/minor/data/combined_bodies.csv', 'C:/Users/binni/minor/data/combined_stances.csv', 'combined_features')
    data2 = model2.get_data('C:/Users/binni/minor/data/combined_bodies.csv', 'C:/Users/binni/minor/data/combined_stances.csv', 'combined_features')

    X1, y1, X2, y2 = split_data(data, data2, doStratify)

    # For loop parameters
    kfold = StratifiedKFold(n_splits=numFolds)
    precision_scores = []; recall_scores = []; 
    accuracy_scores = []; competition_scores = []
    k=0

    for train_indices, test_indices in kfold.split(X1, y1):
        X1_train = X1[train_indices]
        y1_train = [int(s != 0) for s in y1[train_indices]]
        X2_train = X2[train_indices]
        y2_train = y2[train_indices]

        
        X1_test = X1[test_indices]
        X2_test = X2[test_indices]
        y_test  = y2[test_indices]

        
        X2_train_filtered = X2_train[np.nonzero(y1_train)]
        y2_train_filtered = y2_train[np.nonzero(y1_train)]

        
        clf1 = model1.get_trained_classifier(X1_train, y1_train)

        
        clf2 = model2.get_trained_classifier(X2_train_filtered, y2_train_filtered)
       
        y_predicted = model1.test_classifier(clf1, X1_test)
       

        y2_predicted = model2.test_classifier(clf2, X2_test)
        

       
        for i, stance in enumerate(y_predicted):
            if stance != 0:
                y_predicted[i] = y2_predicted[i]

        

        precision_scores.append(precision(y_test, y_predicted, model1._stance_map))
        recall_scores.append(recall(y_test, y_predicted, model1._stance_map))
        accuracy_scores.append(accuracy(y_test, y_predicted, model1._stance_map))

        y_test= map_stances(y_test)
        y_predicted = map_stances(y_predicted)
        competition_score = report_score(y_test, y_predicted)
        competition_scores.append(competition_score)
        k+=1

    print('\nKfold precision averages: ', score_average(precision_scores, model1))
    print('Kfold recall averages: ', score_average(recall_scores, model1))
    print ('Kfold accuracy averages: ', score_average(accuracy_scores, model1))
    print ('competition score averages: ', sum(competition_scores) / len(competition_scores))




In [62]:
def score_average(scores, model1):
    
    score_sums = {stance: 0 for stance in model1._stance_map.items()}
    invalid_counts = {stance: 0 for stance in
                      model1._stance_map.items()}  # Count number of zero division errors and exclude from averages

    for result in scores:
        for stance in model1._stance_map.items():
            if result[stance] != None:
                score_sums[stance] += result[stance]
            else:
                invalid_counts[stance] += 1

    return {stance: score_sums[stance]/(len(scores) - invalid_counts[stance]) for stance in model1._stance_map.items()}

In [92]:
for stance, index in  model1._stance_map.items():
    print(stance,index)

unrelated 0
discuss 1
agree 2
disagree 3


In [None]:
kfold_system(X1_features, X2_features, doStratify, numFolds, model1_type, model2_type)