In [None]:
import sys
from this import s
import numpy as np

from sklearn.ensemble import GradientBoostingClassifier
from feature_engineering import refuting_features, polarity_features, hand_features, gen_or_load_feats, tfidf_features
from feature_engineering import word_overlap_features
from utils.dataset import DataSet
from utils.generate_test_splits import kfold_split, get_stances_for_folds
from utils.score import report_score, LABELS, score_submission

from utils.system import parse_params, check_version
from csv import DictReader
import pandas

import gensim
from gensim.scripts.glove2word2vec import glove2word2vec
from tqdm import tqdm
from nltk import tokenize
from tensorflow.keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.models import Sequential
from keras.layers import LSTM,Dense,Dropout,Embedding,CuDNNLSTM,Bidirectional, Flatten



The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/margaretbrewster/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/margaretbrewster/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/margaretbrewster/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/margaretbrewster/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [16]:


hyperparam = {
    'batch_size': 200,
    'max_vocab_size': 20000,
    'embedding_dim': 100,
    'dropout_rate': 0.3,
    'learning_rate': 0.1,
    'n_epochs': 10,
    'max_length': 100
}

## Returns the feature array used to train the sequential model
def generate_features(stances,dataset,name):
    h, b, y = [],[],[]

    for stance in stances:
        y.append(LABELS.index(stance['Stance']))
        h.append(stance['Headline'])
        b.append(dataset.articles[stance['Body ID']])

    X_overlap = gen_or_load_feats(word_overlap_features, h, b, "features/overlap."+name+".npy")
    X_refuting = gen_or_load_feats(refuting_features, h, b, "features/refuting."+name+".npy")
    X_polarity = gen_or_load_feats(polarity_features, h, b, "features/polarity."+name+".npy")
    X_hand = gen_or_load_feats(hand_features, h, b, "features/hand."+name+".npy")
    X_tfidf = gen_or_load_feats(tfidf_features, h, b, "features/tfidf."+name+".npy")
    X = np.c_[X_hand, X_polarity, X_refuting, X_overlap, X_tfidf]

    return X,y


In [17]:
# check_version()
# parse_params()

#Load the training dataset and generate folds
d = DataSet()
folds,hold_out = kfold_split(d,n_folds=10)
fold_stances, hold_out_stances = get_stances_for_folds(d,folds,hold_out)

# Load the competition dataset
competition_dataset = DataSet("competition_test")
X_competition, y_competition = generate_features(competition_dataset.stances, competition_dataset, "competition")

h, b = [], []
for stance in competition_dataset.stances:
    h.append(stance['Headline'])
    b.append(stance['Body ID'])

answers = {'Headline': h, 'Body ID': b, 'Stance': []}

Xs = dict()
ys = dict()

# Load/Precompute all features now
X_holdout,y_holdout = generate_features(hold_out_stances,d,"holdout")
for fold in fold_stances:
    Xs[fold],ys[fold] = generate_features(fold_stances[fold],d,str(fold))


best_score = 0
best_fold = None

Reading dataset
Total stances: 49972
Total bodies: 1683
Reading dataset
Total stances: 25413
Total bodies: 904


In [7]:
articles = d.articles.values()
sentences = []
for article in articles:
    sentences += tokenize.sent_tokenize(article)

In [18]:
# build tokenizer
word_seq = [text_to_word_sequence(sent) for sent in sentences]
token = Tokenizer(num_words=hyperparam['max_vocab_size'])
token.fit_on_texts([' '.join(seq[:hyperparam['max_length']]) for seq in word_seq])

In [19]:
#build glove embedding vector
embedding_vector = {}
f = open('./glove/glove.6B.100d.txt')
for line in tqdm(f):
    value = line.split(' ')
    word = value[0]
    coef = np.array(value[1:],dtype = 'float32')
    embedding_vector[word] = coef

400001it [00:10, 38954.78it/s]


In [20]:
## create embedding matrix from gloVe word vectors
## gloVe is avaliable for download at https://nlp.stanford.edu/projects/glove/
vocab_size = len(token.word_index.items()) + 1

embedding_matrix = np.zeros((vocab_size,100))
for word,i in tqdm(token.word_index.items()):
    embedding_value = embedding_vector.get(word)
    if embedding_value is not None:
        embedding_matrix[i] = embedding_value

100%|█████████████████████████████████| 27368/27368 [00:00<00:00, 307702.67it/s]


In [30]:
model = Sequential()
model.add(Embedding(
    input_dim=len(embedding_matrix),
    output_dim=hyperparam['embedding_dim'],
    weights=[embedding_matrix],
    trainable = True
))
model.add(Bidirectional(LSTM(100, return_sequences=False, name='Bidrectional_lstm_layer1')))
model.add(Flatten())
model.add(Dense(32,activation = 'relu'))
# model.add(Dense(1,activation = 'sigmoid'))
# model.add(Dense(1,activation = 'relu'))

##we tested drop out rates  from 0 to 0.6
model.add(Dropout(rate=0.1, name='dropout_1')) 
model.add(Dense(4,activation='softmax'))
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics = ['accuracy'])

In [None]:
# Classifier for each fold
for fold in fold_stances:
    ids = list(range(len(folds)))
    del ids[fold]

    X_train = np.vstack(tuple([Xs[i] for i in ids]))
    y_train = np.hstack(tuple([ys[i] for i in ids]))
    
    print(y_train)

    X_test = Xs[fold]
    y_test = ys[fold]
    
    ## divide into a larger train set and smaller test set 
    X_val = np.array(X_test[:(len(X_test) // 6)])
    y_val = np.array(y_test[:(len(X_test) // 6)])
    x_test = np.array(X_test[(len(X_test) // 6):])
    y_test = np.array(y_test[(len(X_test) // 6):])
    
    
    
#  new model
    model.fit(X_train, y_train,
              batch_size=hyperparam['batch_size'],
              epochs=10,
              validation_data=(X_val, y_val),
              verbose=1)
  
    predicted = [LABELS[np.argmax(a, axis = 0)] for a in model.predict(x_test)]
    actual = [LABELS[int(a)] for a in y_test]
    
    report_score(actual,predicted)

    fold_score, _ = score_submission(actual, predicted)
    max_fold_score, _ = score_submission(actual, actual)

    score = fold_score/max_fold_score

    print("Score for fold "+ str(fold) + " was - " + str(score))
    if score > best_score:
        best_score = score
        best_fold = model

[3 3 3 ... 3 3 3]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

In [None]:
#Run on Holdout set and report the final score on the holdout set
predicted = [LABELS[np.argmax(a, axis = -1)] for a in best_fold.predict(X_holdout)]
actual = [LABELS[int(a)] for a in y_holdout]

print(best_fold.predict(X_holdout)) 
print(predicted) 

print(y_competition) 

print("Scores on the dev set")
report_score(actual,predicted)

#Run on competition dataset
predicted = [LABELS[np.argmax(a, axis = 0)] for a in best_fold.predict(X_competition)]
print(X_competition) 

answers["Stance"] = predicted
answers = pandas.DataFrame(answers)
answers.to_csv('answer.csv', index=False, encoding='utf-8')
actual = [LABELS[int(a)] for a in y_competition]
print("Scores on the test set")
report_score(actual,predicted)