# Resources Used  
**For MG eyes**

- http://www.fakenewschallenge.org/
- https://github.com/Cisco-Talos/fnc-1
- https://tedboy.github.io/nlps/generated/word2vec.html
- https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.indexing.html
- https://radimrehurek.com/gensim/models/keyedvectors.html
- https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec
- https://radimrehurek.com/gensim/auto_examples/core/run_core_concepts.html#vector
- https://radimrehurek.com/gensim/utils.html#gensim.utils.simple_preprocess
- https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec  

- https://www.nltk.org/howto/sentiment.html


# Dependancies

pip install notebook --upgrade

pip install pip --upgrade

In [None]:
from tensorflow.python.client import device_lib

def get_available_devices():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos]

print(get_available_devices())

In [None]:
import platform; print(platform.platform())
import sys; print("Python", sys.version)
import numpy; print("NumPy", numpy.__version__)
import scipy; print("SciPy", scipy.__version__)
import sklearn; print("Scikit-Learn", sklearn.__version__)

In [1]:
import os
import sys

import pandas as pd
#pd.set_option('display.max_rows', None)
# pd.options.display.float_format = '{:, .2f}'.format
pd.set_option('display.max_colwidth',500)
pd.set_option('display.max_columns', 100)

import numpy as np
from numpy import save, load
from numpy import savez_compressed
from scipy.sparse import csr_matrix
from scipy.sparse import vstack
import copy
import pickle

#from scipy.misc import comb, logsumexp
from sklearn.manifold import TSNE #a tool to visualize high dimensional data
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD # dimensionality reduction using truncated SVD (AKA LSA)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import preprocessing

import nltk
from nltk.tokenize import word_tokenize
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.corpus import gutenberg
from nltk.collocations import *
import string #python module
import re # python regex module
from nltk.tokenize import RegexpTokenizer
from nltk.stem import SnowballStemmer
from nltk.tokenize import sent_tokenize

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

np.random.seed(0)

from sklearn.preprocessing import normalize
from functools import reduce

In [None]:
nltk.download('punkt') # a sentance tokenizer
nltk.download('gutenberg') # a text corpora and lexical resources
nltk.download('stopwords')

# A First Glance

In [None]:
os.listdir()

In [None]:
os.listdir('fnc-1')

In [None]:
# Alias csv file paths
Train_Bodies = 'fnc-1/train_bodies.csv'
Train_Stances = 'fnc-1/train_stances.csv'

Test_Bodies = 'fnc-1/competition_test_bodies.csv'
Test_Stances = 'fnc-1/competition_test_stances.csv'

In [None]:
# read in bodies to pandas
TrainBodies_df = pd.read_csv(Train_Bodies)
# rename column
TrainBodies_df.rename(columns = {'Body ID':'Body_ID'}, inplace=True)
# inspect df
print(TrainBodies_df.info())
print()
print("TrainBodies_df")
TrainBodies_df.head()

In [None]:
# check df for missing values
TrainBodies_df.isna().sum()

In [None]:
# read in Stances data
TrainStances_df = pd.read_csv(Train_Stances)
# rename column
TrainStances_df.rename(columns={'Body ID':'Body_ID'}, inplace=True)
# inspect df
print(TrainStances_df.info())
print()
#print(f"There are {TrainStances_df['Body_ID'].nunique()} unique Body_ID values in the Train Stances dataset")
print()
print("TrainStances_df")
TrainStances_df.head()

In [None]:
TrainStances_df.Stance.unique()

In [None]:
print("Number of unique Headlines: %s" % TrainStances_df.Headline.nunique())
print("Number of unique Body_IDs: %s" % TrainStances_df.Body_ID.nunique())

In [None]:
TrainStances_df.isna().sum()

In [None]:
TrainStances_df['Body_ID'].value_counts()


In [None]:
print("Body_ID min: %s" % TrainStances_df['Body_ID'].min())
print("Body_ID min: %s" %TrainStances_df['Body_ID'].max())

In [None]:
TrainStances_df.Stance.unique()

In [None]:
TestBodies_df = pd.read_csv(Test_Bodies)
TestBodies_df.rename(columns = {'Body ID':'Body_ID'}, inplace=True)
print(TestBodies_df.info())
print()
display(TestBodies_df.head(1))

In [None]:
TestStances_df = pd.read_csv(Test_Stances)
TestStances_df.rename(columns = {'Body ID':'Body_ID'}, inplace=True)
print(TestStances_df.info())
print()
display(TestStances_df.head(1))


In [None]:
# is there anything to note about distribution of body_ID? 
# instantiate a figure and axes object
fig, ax = plt.subplots()
x = TrainStances_df['Body_ID']
ax.hist(x, density=True )

## Summary

We are provided with three csv files, one of which is redundant.  
In the other two, we have Train_Bodies which is 1,683 unique article bodies and their associated ID number.  In the second file labeled Train_Stances, there are 49,972 total observations consisting of 1,648 unique Headlines' with the 1,683 unique Body_IDs'.  This makes sense given the first part of the challenge is to classify the Headline and Body as Related or Unrelated.

# Preprocessing

## Cleaning

 - punctuation
 - lowercase all
 - tokenize
 - remove stop words

In [None]:
# join TrainBodies_df and TrainStances_df

df_train = TrainStances_df.merge(TrainBodies_df, how = 'left', on = 'Body_ID', validate= 'm:1')
print(df_train.shape)
df_train.head(1)

In [None]:
# join TestBodies_df with TestStances_df
df_test = TestStances_df.merge(TestBodies_df, how = 'left', on = 'Body_ID', validate= 'm:1')
print(df_test.shape)
df_test.head(1)


In [None]:
# stack train and test sets
objs = [df_train, df_test]
data = pd.concat(objs, axis = 0, join='outer')
print(data.shape)
display(data.head())

In [None]:
# replace target lables with numeric target values

df_1 = copy.deepcopy(data)
df_1.Stance.replace({'agree':0, 'disagree':1, 'discuss':2, 'unrelated':3}, inplace=True)
df_1.head()

In [None]:
df_1.info()

In [None]:
# lowercase all text
df_2 = copy.deepcopy(df_1)
df_2['Headline'] = df_2['Headline'].str.lower()
df_2['articleBody'] = df_2['articleBody'].str.lower()

In [None]:
# remove punctuation and tokenize words
#tokenizer = RegexpTokenizer(r'\w+')
tokenizer = RegexpTokenizer (r"(?u)\b\w\w+\b")
df_2['Headline_tokens'] = df_2['Headline'].map(tokenizer.tokenize)
df_2['articleBody_tokens'] = df_2['articleBody'].map(tokenizer.tokenize)
df_2.head(3)

##### might want to keep one or more punctuation values in another notebook iteration, eg, $

In [None]:
print(string.punctuation)

In [None]:
df_2.info()

In [None]:
stopwords_list = stopwords.words('english')
stopwords_list += ["''", '""', '...', '``',"_"]
stopwords_list

remove stopwords

In [None]:
df_2['Headline_tokens'] = df_2['Headline_tokens'].apply(lambda x: [item for item in x if item not in stopwords_list])
df_2['articleBody_tokens'] = df_2['articleBody_tokens'].apply(lambda x: [item for item in x if item not in stopwords_list])
df_2.head()

stem

In [None]:
# alias stemmer method
stemmer = nltk.stem.SnowballStemmer('english')
df_2['Headline_tokens'] = df_2.apply(lambda row: [stemmer.stem(item) for item in row.Headline_tokens], axis=1)
df_2['articleBody_tokens'] = df_2.apply(lambda row: [stemmer.stem(item) for item in row.articleBody_tokens], axis=1)

In [None]:
df_2.head()

# Basic Count Features

## generate grams and terms

In [None]:
df_2.info()

In [None]:
# https://github.com/Cisco-Talos/fnc-1/blob/master/tree_model/ngram.py

def getUnigram(words):
    #assert type(words) == []
    return words

def getBigram(words, join_string, skip=0):
    L = len(words)
    if L > 1:
        lst = []
        for i in range(L-1):
            for k in range(1, skip+2):
                if i + k < L:
                    lst.append(join_string.join([words[i], words[i+k]]))
        return lst
    else:
        # set it as unigram
        lst = getUnigram(words)
        return lst
                    
def getTrigram(words, join_string, skip=0):
    #assert type(words) == []
    L = len(words)
    if L > 2:
        lst = []
        for i in range(L-2):
            for k1 in range(1, skip+2):
                for k2 in range(1, skip+2):
                    if i+k1 < L and i+k1+k2 < L:
                        lst.append(join_string.join([words[i], words[i+k1], words[i+k1+k2]]))
        return lst
    else:
        #set as bigram
        lst = getBigram(words, join_string, skip)
        return lst
    
def getFourgram(words, join_string):

    #assert type(words) == list
    L = len(words)
    if L > 3:
        lst = []
        for i in xrange(L-3):
            lst.append( join_string.join([words[i], words[i+1], words[i+2], words[i+3]]) )
        return lst
    else:
        # set it as bigram
        lst = getTrigram(words, join_string)
    return lst



def getBiterm(words, join_string):
    """
        Input: a list of words, e.g., ['I', 'am', 'Denny', 'boy']
        Output: a list of biterm, e.g., ['I_am', 'I_Denny', 'I_boy', 'am_Denny', 'am_boy', 'Denny_boy']
        I use _ as join_string for this example.
    """
   # assert type(words) == list
    L = len(words)
    if L > 1:
        lst = []
        for i in range(L-1):
            for j in range(i+1,L):
                lst.append( join_string.join([words[i], words[j]]) )
        return lst
    
    else:
        # set it as unigram
        lst = getUnigram(words)
    return lst
    
def getTriterm(words, join_string):
    """
        Input: a list of words, e.g., ['I', 'am', 'Denny']
        Output: a list of triterm, e.g., ['I_am_Denny', 'I_Denny_am', 'am_I_Denny',
        'am_Denny_I', 'Denny_I_am', 'Denny_am_I']
        I use _ as join_string for this example.
    """
   # assert type(words) == list
    L = len(words)
    if L > 2:
        lst = []
        for i in xrange(L-2):
            for j in xrange(i+1,L-1):
                for k in xrange(j+1,L):
                    lst.append( join_string.join([words[i], words[j], words[k]]) )
        return lst
    else:
        # set it as biterm
        lst = getBiterm(words, join_string)
    return lst

In [None]:
# generate unigram
df_2["Headline_unigram"] = df_2["Headline_tokens"].map(lambda x: getUnigram(x))
df_2["articleBody_unigram"] = df_2["articleBody_tokens"].map(lambda x: getUnigram(x))

# generate bigram
join_str = "_"
df_2["Headline_bigram"] = df_2["Headline_unigram"].map(lambda x: getBigram(x, join_str))
df_2["articleBody_bigram"] = df_2["articleBody_unigram"].map(lambda x: getBigram(x, join_str))
        
# generate trigram
join_str = "_"
df_2["Headline_trigram"] = df_2["Headline_unigram"].map(lambda x: getTrigram(x, join_str))
df_2["articleBody_trigram"] = df_2["articleBody_unigram"].map(lambda x: getTrigram(x, join_str))

In [None]:
df_2.head(1)

In [None]:
# generate basic counting features

'''
def try_divide(x, y, val=0.0):
    """ 
        Try to divide two numbers
    """
    if y != 0.0:
        val = float(x) / y
    return val
'''
# calc percent of text corpus that is unique ( unique grams / ttl grams)

grams = ["unigram", "bigram", "trigram"]
feat_names = ["Headline", "articleBody"]

for feat_name in feat_names:
    for gram in grams:
        df_2["count_of_%s_%s" % (feat_name, gram)] = list(df_2.apply(lambda x: len(x[feat_name + "_" + gram]), axis=1))
        df_2["count_of_unique_%s_%s" % (feat_name, gram)] = \
              list(df_2.apply(lambda x: len(set(x[feat_name + "_" + gram])), axis=1))
        df_2["ratio_of_unique_%s_%s" % (feat_name, gram)] = \
            df_2["count_of_unique_%s_%s"%(feat_name,gram)] / df_2["count_of_%s_%s"%(feat_name,gram)]
            #map(try_divide, df_2["count_of_unique_%s_%s"%(feat_name,gram)], df_2["count_of_%s_%s"%(feat_name,gram)])

In [None]:
# overlapping n-grams count

for gram in grams:
    # find grams in each Headline n-gram that are also inside its coresponding articleBody n-gram
    df_2["count_of_Headline_%s_in_articleBody" % gram] = \
        list(df_2.apply(lambda x: sum([1. for w in x["Headline_" + gram] if w in set(x["articleBody_" + gram])]), axis=1))
    
    # return the ratio of overlapping grams to total grams
    df_2["ratio_of_Headline_%s_in_articleBody" % gram] = \
        df_2["count_of_Headline_%s_in_articleBody" % gram] / df_2["count_of_Headline_%s" % gram]
        #map(try_divide, df["count_of_Headline_%s_in_articleBody" % gram], df["count_of_Headline_%s" % gram])
        


In [None]:
df_2.head(1)

In [None]:
# number of sentences in headline and body
for feat_name in feat_names:
    df_2['len_sent_%s' % feat_name] = df_2[feat_name].apply(lambda x: len(sent_tokenize(x)))

In [None]:
feat_names_bcf = [ n for n in df_2.columns \
                if "count" in n \
                or "ratio" in n \
                or "len_sent" in n]

In [None]:
feat_names_bcf

In [None]:
pd.set_option('display.max_colwidth',50)
df_2.head(1)

In [None]:
# convert basic count features to numpy array

basic_count_feats = df

In [None]:
#train = data[~data['target'].isnull()]
#print ('train:')
#print (train[['Headline_unigram','Body ID', 'count_of_Headline_unigram']])
xBasicCountsTrain = df_2[feat_names].values
outfilename_bcf_train = "train.basic.pkl"
with open(outfilename_bcf_train, "wb") as outfile:
    pickle.dump(feat_names, outfile, -1)
    pickle.dump(xBasicCountsTrain, outfile, -1)
print ('basic counting features for training saved in %s' % outfilename_bcf_train)

# TF-IDF

In [None]:
df_2.shape

In [None]:
def cat_text(x):
    res = '%s %s' % (' '.join(x['Headline_unigram']), ' '.join(x['articleBody_unigram']))
    return res

In [None]:
# concatenate Headline and Body so we can fit a tfidf vectorizer that will learn the combined vocabulary

df_2['all_text'] = list(df_2.apply(cat_text, axis = 1))

In [None]:
df_2[0:1]

In [None]:
new_list = []
for row in df_2['Headline']:
    for item in row.split():
        new_list.append(item)
      

In [None]:
ttl_words = []
for row in df_2.all_text:
    for item in row.split():
        ttl_words.append(item)
len(ttl_words)

In [None]:
ttl_words = []
for row in df_2.all_text:
    for item in row.split():
        ttl_words.append(item)
set_ttl_words = set(ttl_words)
len(set_ttl_words)

In [None]:
df_2['all_text'].nunique()

In [None]:
# count number of words in all_text

#helper = copy.deepcopy(df_2['all_text'])
count = df_2['all_text'].str.split().apply(len).value_counts()
count.sort_index()


In [None]:
type(count)

In [None]:
df_2.shape

In [None]:
df_2.head(1)

In [None]:
# fit a TfidfVectorizer on the concatenated strings (fit learns the vocabulary and idf)

vec = TfidfVectorizer(ngram_range = (1, 3), max_df= 0.8, min_df= 2)
vec.fit(df_2['all_text'])
vocabulary = vec.vocabulary_

In [None]:


# fit and transform Headline using the learned vocabulary on the combined Headline + body corpus

vecH = TfidfVectorizer(ngram_range=(1,3), max_df=0.8, min_df= 2, vocabulary=vocabulary)
xHeadlineTfidf = vecH.fit_transform(df_2['Headline_unigram'].map(lambda x: ' '.join(x)))
print (xHeadlineTfidf.shape)


outfilename_htfidf_train = "MG-train.headline.tfidf.pkl"
with open (outfilename_htfidf_train, 'wb') as outfile:
    pickle.dump(xHeadlineTfidf, outfile, -1)





In [None]:
# fit and transform articleBody using the learned vocabulary on the combined Headline + body corpus

vecB = TfidfVectorizer(ngram_range=(1, 3), max_df=0.8, min_df=2, vocabulary=vocabulary)
xBodyTfidf = vecB.fit_transform(df_2['articleBody_unigram'].map(lambda x: ' '.join(x)))
print (xBodyTfidf.shape)

outfilename_btfidf_train = "MG-train.body.tfidf.pkl"
with open(outfilename_btfidf_train, "wb") as outfile:
    pickle.dump(xBodyTfidf, outfile, -1)
    


In [None]:
def cosine_sim(x, y):
    try:
        if type(x) is np.ndarray: x = x.reshape(1, -1)
        if type(y) is np.ndarray: y = y.reshape(1, -1)
        d = cosine_similarity(x, y)
        d = d[0][0]
    except:
        print (x)
        print (y)
        d = 0.
    return d

In [None]:
# calculate cosine similarity between Headline and articleBody

#load_xHeadlineTfidf = pickle.load(open("train.headline.tfidf.pkl", 'rb'))
#load_bodyTfidf = pickle.load(open("train.body.tfidf.pkl", 'rb'))

#simTfidf_train = cosine_similarity(xHeadlineTfidf, xBodyTfidf)
simTfidf_train = np.asarray(list(map(cosine_sim, xHeadlineTfidf, xBodyTfidf)))[:, np.newaxis]

print(simTfidf_train.shape)

outfilename_simtfidf_train = "MG-train.sim.tfidf.pkl"
with open(outfilename_simtfidf_train, "wb") as outfile:
    pickle.dump(simTfidf_train, outfile, -1)


# Latent Semantic Analysis

Applying Singular Value Decomposition (SVD) to the tf-idf features to reduce dimensionality and find latent topics.  
Take tf-idf features and apply SVD.  THen take cosine similarities between the SVD features of Headline and articleBody.  This similarity metric is very telling of whether the body and headline are related or not. 

In [None]:
type(xHeadlineTfidf)

In [None]:
print(xHeadlineTfidf.shape)
print(type(xHeadlineTfidf))
print()
print(xBodyTfidf.shape)
print(type(xBodyTfidf))

## SVD

In [None]:
from scipy.sparse import vstack
xHBTfidf = vstack((xHeadlineTfidf, xBodyTfidf)).toarray() # toarray() converts the csr_matrix objects to numpy arrays

In [None]:
type(xHBTfidf)

In [None]:
xHBTfidf.shape

In [None]:
svd = TruncatedSVD(n_components=100, n_iter=15, random_state = 42)

In [None]:
svd.fit(xHBTfidf) # fit to the combined train-test set (or the full training set for cv process)
print ('xHeadlineTfidf.shape:')
print (xHeadlineTfidf.shape)

xHeadlineSvd = svd.transform(xHeadlineTfidf)
print ('xHeadlineSvd.shape:')
print (xHeadlineSvd.shape)

xHeadlineSvdTrain = xHeadlineSvd
outfilename_hsvd_train = "train.headline.svd.pkl"
with open(outfilename_hsvd_train, "wb") as outfile:
    pickle.dump(xHeadlineSvdTrain, outfile, -1)

In [None]:
xBodySvd = svd.transform(xBodyTfidf)
print ('xBodySvd.shape:')
print (xBodySvd.shape)

xBodySvdTrain = xBodySvd
outfilename_bsvd_train = "train.body.svd.pkl"
with open(outfilename_bsvd_train, "wb") as outfile:
    pickle.dump(xBodySvdTrain, outfile, -1)

In [None]:
#sim_svd_train = cosine_similarity(xHeadlineSvd, xBodySvd)
simSvd_train = np.asarray(list(map(cosine_sim, xHeadlineSvd, xBodySvd)))[:, np.newaxis]
print ('sim_svd_train shape:')
print (simSvd_train.shape)

In [None]:
outfilename_simsvd_train = "train.sim.svd.pkl"
with open(outfilename_simsvd_train, "wb") as outfile:
    pickle.dump(simSvd_train, outfile, -1)

# Word2Vec

## using some talos code

In [None]:
pd.set_option('display.max_colwidth',100)

In [None]:
df_2['Headline_unigram_vec'] = df_2['Headline_tokens']
df_2['articleBody_unigram_vec'] = df_2['articleBody_tokens']

In [None]:
df_2.head(1)

In [None]:
import gensim
from gensim.models import Word2Vec



In [None]:
# load pre-trained model
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [None]:
#import nibabel
#nibabel.load('GoogleNews-vectors-negative300.bin.gz').get_data()

In [None]:
Headline_unigram_array = df_2['Headline_unigram_vec'].values
print("df_2 Headline_unigram_vec type: %s" % type(df_2['Headline_unigram_vec']))
print("df_2 Headline_unigram_array type: %s" % type('Headline_unigram_array'))
print()

headlineVec = np.array(list(map(lambda x: reduce(np.add, [model[y] for y in x if y in model], [0.]*300), Headline_unigram_array)))
headlineVec_norm = normalize(headlineVec)
print("headline vec type: %s" % type(headlineVec))
print("headline vec shape:" +  str(headlineVec.shape))
print()
print("headlineVec_norm vec type: %s" % type(headlineVec_norm))
print("headlineVec_norm vec shape:" + str(headlineVec_norm.shape))

In [None]:
headlineVecTrain = headlineVec_norm
outfilename_hvec_train = "train.headline.word2vec.pkl"
with open(outfilename_hvec_train, "wb") as outfile:
    pickle.dump(headlineVecTrain, outfile, -1)
print ('headline word2vec features of training set saved in %s' % outfilename_hvec_train)

In [None]:
Body_unigram_array = df_2['articleBody_unigram_vec'].values
print("df_2 articleBody_unigram_vec type: %s" % type(df_2['articleBody_unigram_vec']))
print("df_2 Body_unigram_array type: %s" % type('Body_unigram_array'))
print()

BodyVec = np.array(list(map(lambda x: reduce(np.add, [model[y] for y in x if y in model], [0.]*300), Body_unigram_array)))
#bodyVec = np.array(bodyVec)
BodyVec_norm = normalize(BodyVec)

print("BodyVec type: %s" % type(BodyVec))
print("BodyVec shape:" +  str(BodyVec.shape))
print()
print("bodyVec_norm type: %s" % type(BodyVec_norm))
print("bodyVec_norm shape:" + str(BodyVec_norm.shape))

In [None]:
# save train dataset
bodyVecTrain = BodyVec_norm
outfilename_bvec_train = "train.body.word2vec.pkl"
with open(outfilename_bvec_train, "wb") as outfile:
    pickle.dump(bodyVecTrain, outfile, -1)
print ('body word2vec features of training set saved in %s' % outfilename_bvec_train)

In [None]:
# compute cosine similarity between headline/body word2vec features
simVec_w2v = np.asarray(list(map(cosine_sim, headlineVec_norm, BodyVec_norm)))[:, np.newaxis]
print(type(simVec_w2v))
print(simVec_w2v.shape)
print("simVec_w2v num dimensions:" + str(simVec_w2v.ndim))
print(simVec_w2v[0:2])

In [None]:
simVecTrain = simVec_w2v
outfilename_simvec_train = "train.sim.word2vec.pkl"
with open(outfilename_simvec_train, "wb") as outfile:
    pickle.dump(simVecTrain, outfile, -1)
print ('word2vec sim. features of training set saved in %s' % outfilename_simvec_train)

# Sentiment Features

- Use [NLTK Sentiment Analyzer](https://www.nltk.org/_modules/nltk/sentiment/vader.html) with [VADERSentiment](https://github.com/mgavish/vaderSentiment) to assign a sentiment polarity score to Headline and articelBody separately.
- negative score means a negative opinion.
- Do headline and articleBody have same sentiment?
- 

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
nltk.download('vader_lexicon')

In [None]:
# calculate polarity score of each sentance in a Headline observation and return the average

sid = SentimentIntensityAnalyzer() # https://www.nltk.org/howto/sentiment.html

def compute_sentiment(sentences):
    result = []
    for sentence in sentences:
        ss = sid.polarity_scores(sentence) # https://www.nltk.org/howto/sentiment.html
        result.append(ss)
    return pd.DataFrame(result).mean()

In [None]:
df_2['headline_sentmts'] = df_2['Headline'].apply(lambda x: sent_tokenize(x)) # nltk's method sent_tokenize()
df_2.head(1)

In [None]:
df_2 = pd.concat([df_2, df_2['headline_sentmts'].apply(lambda x: compute_sentiment(x))], axis=1)


In [None]:
df_2.head(1)

In [None]:
df_2.rename(columns={'compound':'h_compound', 'neg':'h_neg', 'neu':'h_neu', 'pos':'h_pos'}, inplace=True)
df_2.head(1)

In [None]:
headlineSenti = df_2[['h_compound','h_neg','h_neu','h_pos']].values
print ('headlineSenti.shape:' + str(headlineSenti.shape))

In [None]:
headlineSentiTrain = headlineSenti
outfilename_hsenti_train = "train.headline.senti.pkl"
with open(outfilename_hsenti_train, "wb") as outfile:
    pickle.dump(headlineSentiTrain, outfile, -1)
print ('headline sentiment features of training set saved in %s' % outfilename_hsenti_train)

In [None]:
df_2['body_sents'] = df_2['articleBody'].map(lambda x: sent_tokenize(x))
df_2 = pd.concat([df_2, df_2['body_sents'].apply(lambda x: compute_sentiment(x))], axis=1)
df_2.rename(columns={'compound':'b_compound', 'neg':'b_neg', 'neu':'b_neu', 'pos':'b_pos'}, inplace=True)
bodySenti = df_2[['b_compound','b_neg','b_neu','b_pos']].values
print ('bodySenti.shape:' + str(bodySenti.shape))

In [None]:
cols = list(df_2.columns)
cols.sort

In [None]:
bodySentiTrain = bodySenti
outfilename_bsenti_train = "train.body.senti.pkl"
with open(outfilename_bsenti_train, "wb") as outfile:
    pickle.dump(bodySentiTrain, outfile, -1)
print ('body sentiment features of training set saved in %s' % outfilename_bsenti_train)

In [None]:
pd.set_option('display.max_colwidth',100)
# df_2.to_csv('df_2_afterAllFeatureGeneration.csv')
df_2 = pd.read_csv('df_2_afterAllFeatureGeneration.csv')

In [None]:
df_2.head(1)

# XGBoost

In [22]:
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold

In [None]:
# https://stackoverflow.com/questions/15463387/pickle-putting-more-than-1-object-in-a-file/15463472

# load features from pkl files

# basic count features
with open('train.basic.pkl', 'rb') as infile:
    feat_names = pickle.load(infile)
    xBasicCountsTrain = pickle.load(infile)
'''
# tfidf vectorized headline
with open('MG-train.headline.tfidf.pkl', 'rb') as tfidf_head:
    headline_tfidf = pickle.load(tfidf_head)
    headline_tfidf = headline_tfidf.toarray()
    
# tfidf vectorized body
with open('MG-train.body.tfidf.pkl', 'rb') as tfidf_in:
    body_tfidf = pickle.load(tfidf_in)
    body_tfidf = body_tfidf.toarray()
''' 
# cosine similarity between tfidf headline and body
with open('MG-train.sim.tfidf.pkl', 'rb') as tfidf_sim:
    sim_tfidf = pickle.load(tfidf_sim)

# svd of headline
with open('train.headline.svd.pkl', 'rb') as svd_head:
    headline_svd = pickle.load(svd_head)
    
# svd of body
with open('train.body.svd.pkl', 'rb') as svd_body:
    body_svd = pickle.load(svd_body)

# svd of tfidf cosine similarity
with open('train.sim.svd.pkl', 'rb') as svd_sim:
    sim_svd = pickle.load(svd_sim)
    
# w2v headline
with open('train.headline.word2vec.pkl', 'rb') as w2v_head:
    headline_w2v = pickle.load(w2v_head)
    
# w2v body
with open('train.body.word2vec.pkl', 'rb') as w2v_body:
    body_w2v = pickle.load(w2v_body)
    
# headlinen sentiment scores
with open('train.headline.senti.pkl', 'rb') as senti_head:
    headline_senti = pickle.load(senti_head)
    
# body sentiment scores
with open('train.body.senti.pkl', 'rb') as senti_body:
    body_senti = pickle.load(senti_body)

In [None]:
print('train.basic.pkl: ' + str(os.path.getsize('train.basic.pkl')))
print('MG-train.headline.tfidf.pkl: ' + str(os.path.getsize('MG-train.headline.tfidf.pkl')))
print('MG-train.body.tfidf.pkl: ' + str(os.path.getsize('MG-train.body.tfidf.pkl')))
print('MG-train.sim.tfidf.pkl: ' + str(os.path.getsize('MG-train.sim.tfidf.pkl')))
print('train.body.svd.pkl: ' + str(os.path.getsize('train.body.svd.pkl')))
print('train.sim.svd.pkl: ' + str(os.path.getsize('train.sim.svd.pkl')))
print('train.headline.word2vec.pkl: ' + str(os.path.getsize('train.headline.word2vec.pkl')))
print('train.body.word2vec.pkl: ' + str(os.path.getsize('train.body.word2vec.pkl')))
print('train.headline.senti.pkl: ' + str(os.path.getsize('train.headline.senti.pkl')))
print('train.body.senti.pkl: ' + str(os.path.getsize('train.body.senti.pkl')))

In [None]:
# combine features into numpy array

#arrays = [xBasicCountsTrain, headline_tfidf, body_tfidf, sim_tfidf, headline_svd, body_svd, sim_svd, headline_w2v, body_w2v, headline_senti,body_senti]

arrays = [xBasicCountsTrain,  sim_tfidf, headline_svd, body_svd, sim_svd, headline_w2v, body_w2v, headline_senti,body_senti]

In [None]:
for array in arrays:
    print(array.ndim)
    print(array.shape)
    print(type(array))

In [None]:
%%time
model_data = np.hstack(arrays)

In [None]:
model_data.shape

In [None]:
with open('all_model_data.pkl', 'wb') as all_data:
    pickle.dump(model_data, all_data, protocol = 4)

In [None]:
%%time
np.savez_compressed('model_data.npz', model_data)

In [None]:
print('test')

In [None]:
# import features and target data

target_y = copy.deepcopy(df_1['Stance']).to_numpy().reshape(-1,1)
target_y.shape
np.savez_compressed('model_target_data.npz', target_y)

In [3]:
# load data
from numpy import load
target_y = load('model_target_data.npz')
target_y = target_y['arr_0']
print(target_y.shape)
features_x =  load('model_data.npz')
features_x = features_x['arr_0']
print(features_x.shape)

(75385, 1)
(75385, 836)


In [10]:
X_train = features_x[0:60300]
y_train = target_y[0:60300]

X_test = features_x[60300:75386]
y_test = target_y[60300:75386]

In [6]:
import csv
import sys

## Score using scorer.py (provided in https://github.com/FakeNewsChallenge/fnc-1) on TEST set
#from scorer import score_submission, print_confusion_matrix, score_defaults, SCORE_REPORT
from score import report_score, LABELS, score_submission

In [9]:
boost_clf = xgb.XGBClassifier()

In [11]:
boost_clf.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [13]:
predicted = [LABELS[int(a)] for a in boost_clf.predict(X_test)]
actual = [LABELS[int(a)] for a in y_test]

In [17]:
fold_score, _ = score_submission(actual, predicted)
max_fold_score, _ = score_submission(actual, actual)
score = fold_score/max_fold_score
print("Score for fold "+ str(fold) + " was - " + str(score))
if score > best_score:
    best_score = score
    best_fold = clf

NameError: name 'fold' is not defined

In [None]:

%%time
# instantiate XGBoost classifier
boost_clf = xgb.XGBClassifier()
## use stratefied kfold for classification task
kfold = StratifiedKFold(n_splits=10, random_state=1)

#boost_scores = cross_val_score(boost_clf, features_x, target_y, scoring='roc_auc', cv=kfold)
boost_scores = cross_val_predict(boost_clf, X_train, y_train, cv=kfold)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb

In [None]:
# GridSearchcv
params_xgb = {

    'max_depth': [6],
    'colsample_bytree': [0.6],
    'subsample': [1.0],
    'eta': [0.1],
    'silent': [1],
    #'objective': 'multi:softmax',
    'objective': 'multi:softprob',
    'eval_metric':'mlogloss',
    'num_class': 4
}

In [None]:
xgb_data = copy.deepcopy(df_2)

In [None]:
bcf_in = open('train.basic.pkl', 'rb')
basic_count_features = pickle.load(bcf_in)
basic_count_features = [basic_count_features]

In [None]:
type(basic_count_features)

In [None]:
print(basic_count_features[0:3])

# Deep Learning Predictions

1D CNN on Headline and articleBody (at word level).  Output of CNN sent to MLP with 4 class outputs (agree, disagree, discuss, unrelated)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, MaxPooling2D, Conv2D, Activation, Dropout, GlobalAveragePooling2D
from keras import optimizers
from keras import backend


In [None]:
nn_model = Sequential()
nn_model.add(Conv1D)