In [12]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import xapian
import nltk
import os
import io
from zipfile import ZipFile
import shutil
import csv
import re
import pandas as pd
import numpy as np
import json
import spacy 
import en_core_web_lg
import sklearn
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.corpus import stopwords
from nltk.chunk import conlltags2tree, tree2conlltags
from collections import defaultdict
from collections import Counter
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.externals import joblib
from joblib import dump, load
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
vectorizer = DictVectorizer()
nlp_large = en_core_web_lg.load() 

In [13]:

dbpath = "index_V6_nostop"


print("### GET TRAINING DATA ###")

train_path = os.path.join(os.path.dirname(os.path.realpath('__file__')), "train.json")
with open(train_path) as json_file:  
    train = json.load(json_file)

rowid = list(train.keys())
claim = []
label = []
evidence = []
for idx in rowid:
    claim.append(train[idx]['claim'])
    label.append(train[idx]['label'])
    evidence.append(train[idx]['evidence'])
    
len_claim = len(claim)    

print("training set:",len_claim)

print("### GET DEVELOPMENT DATA ###")

dev_path = os.path.join(os.path.dirname(os.path.realpath('__file__')), "devset.json")
with open(dev_path) as json_file:  
    development = json.load(json_file)
    
drowid = list(development.keys())
dclaim = []
dlabel = []
devidence = []
for idx in drowid:
    dclaim.append(development[idx]['claim'])
    dlabel.append(development[idx]['label'])
    devidence.append(development[idx]['evidence'])

len_dclaim= len(dclaim)    
print("development set:", len_dclaim)

print("### GET TEST DATA ###")

test_path = os.path.join(os.path.dirname(os.path.realpath('__file__')), "test-unlabelled.json")
with open(test_path) as json_file:  
    test = json.load(json_file)
    
trowid = list(test.keys())
tclaim = []
for idx in trowid:
    tclaim.append(test[idx]['claim'])

len_tclaim = len(tclaim)    
print("test set:", len_tclaim)


### GET TRAINING DATA ###
training set: 145449
### GET DEVELOPMENT DATA ###
development set: 5001
### GET TEST DATA ###
test set: 14997


In [30]:
Counter(label)

Counter({'SUPPORTS': 80035, 'REFUTES': 29775, 'NOT ENOUGH INFO': 35639})

In [14]:
print("### CONNECT TO INDEX ###")

def getstopper():
    stopper = xapian.SimpleStopper()
    for s in stopwords.words('english'):
        stopper.add(s)
    return stopper

def get_doc_id(match):
    for term in match.document.termlist():
        term = term.term.decode("utf-8") 
        m = re.match("Q(.*)", term)
        if m:
            return m[1]
    return None

# Prepare enquiry object

# Open the database we're going to search.
db = xapian.Database(dbpath)

# Set up a QueryParser with a stemmer and suitable prefixes
queryparser = xapian.QueryParser()
queryparser.set_stemmer(xapian.Stem("en"))
queryparser.set_stemming_strategy(queryparser.STEM_SOME)
queryparser.set_stopper(getstopper())
queryparser.add_prefix('keywords', 'K')

# Use an Enquire object on the database to run the query
enquire = xapian.Enquire(db)

def get_match(query,pagesize):
    query = queryparser.parse_query(query)
    enquire.set_query(query)
    matches = enquire.get_mset(0, pagesize)

    query_results = []
    #doc_title = []
    for match in matches:
        result = dict(
            found_doc = get_doc_id(match),
            rank = match.rank + 1, 
            percent = match.percent,
            weight = match.weight,
            docid = match.docid,
            text = match.document.get_data()
        )
        query_results.append(result)
        #doc_title.append(get_doc_id(match))
    return query_results #, doc_title



### CONNECT TO INDEX ###


In [15]:

print("### NORMALIZATION ###")
    
def replace_punctuation(sentence):
    text = re.sub(r"(-LRB-|-LSB-|-RSB-|-RRB-|-COLON-|-lrb-|-lsb-|-rsb-|-rrb-|-colon-|``|'')*", "", sentence)
    text = re.sub(r"[`']+", " ", text)
    text = re.sub(r"[.,:;]*", "", text)
    text = text.replace("_"," ")
    text = text.replace("\\n","")
    text = text.strip()
    return text

def get_entity(document):
    doc = nlp_large(document)
    combine_entity = ''
    for entity in doc.ents:
        combine_entity = combine_entity + ' ' + str(entity)
    return combine_entity.strip()

def remove_stopwords(wordlist):
    filtered = [w for w in wordlist if w not in stopwords.words('english')]
    return filtered

def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma  

def normalize_counter(document):  
    entities = get_entity(document)
    combine = document + ' ' + entities
    filtered = nltk.word_tokenize(combine)
    for i in range(len(filtered)):
         filtered[i] = lemmatize(filtered[i].lower())
    word_counter = Counter(filtered)
    return word_counter 


### NORMALIZATION ###


In [16]:
print("### GET FEATURE LIST ###")
      
      
feature_list_path = os.path.join(os.path.dirname(os.path.realpath('__file__')), "feature_list_complete.json")
with open(feature_list_path) as json_file:  
    feature_list_dict = json.load(json_file)
    
feature_list = []
claim_index = []
for key in feature_list_dict:
    feature_list.append(feature_list_dict[key])
    claim_index.append(int(key))
    
label_list = []
for i in claim_index:
    label_list.append(label[i])
    
weight_path = os.path.join(os.path.dirname(os.path.realpath('__file__')), "weight_complete.json")
with open(weight_path) as json_file:  
    weight_dict = json.load(json_file)
    
weight_list = []
for i in claim_index:
    weight_list.append(weight_dict[str(i)])   

### GET FEATURE LIST ###


In [17]:
 print("### RANDOM LIST ###")
          
    
import random

combined = list(zip(claim_index, feature_list, label_list, weight_list))
random.shuffle(combined)
claim_index, feature_list, label_list, weight_list = zip(*combined)

training_set = feature_list[:120000]
training_label = label_list[:120000]
heldout_set = feature_list[120000:]
heldout_label = label_list[120000:]

print("### RUN VECTORIZER ###")

vectorizer = DictVectorizer()
training_data = vectorizer.fit_transform(training_set)
heldout_data = vectorizer.transform(heldout_set)


### RANDOM LIST ###
### RUN VECTORIZER ###


In [9]:
print("### TRAIN THE MODEL ###")

### MLP Classifier 2 hidden layers 10 nodes & 5 nodes ###

clf = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(10, 5), random_state=1)
clf.fit(training_data, training_label) 


MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(10, 5), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [10]:

predictions = clf.predict(heldout_data)

print(accuracy_score(heldout_label,predictions))
results = classification_report(heldout_label,predictions,  output_dict=True)

print(results)

with open('result1_10_5.json', 'w') as f:
    json.dump(results, f, indent = 4)


0.8585141251755892
{'NOT ENOUGH INFO': {'precision': 0.9997062279670975, 'recall': 0.9988259465805694, 'f1-score': 0.9992658934077228, 'support': 3407}, 'REFUTES': {'precision': 0.7189859762675297, 'recall': 0.5087786259541984, 'f1-score': 0.5958873491282968, 'support': 2620}, 'SUPPORTS': {'precision': 0.829142403388036, 'recall': 0.9230882569618388, 'f1-score': 0.8735968765251343, 'support': 6787}, 'accuracy': 0.8585141251755892, 'macro avg': {'precision': 0.8492782025408877, 'recall': 0.8102309431655356, 'f1-score': 0.8229167063537179, 'support': 12814}, 'weighted avg': {'precision': 0.8519690860230552, 'recall': 0.8585141251755892, 'f1-score': 0.8502283248425422, 'support': 12814}}


In [35]:
dump(clf, 'MLPClassifier_lbfgs_10_5.joblib') 

['MLPClassifier_lbfgs_10_5.joblib']

In [None]:
### MLP Classifier 2 hidden layers 10 nodes & 10 nodes ###

clf2 = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(10, 10), random_state=1)

clf2.fit(training_data, training_label) 

predictions2 = clf2.predict(heldout_data)

In [12]:
accuracy2 = accuracy_score(heldout_label,predictions2)
results2 = classification_report(heldout_label,predictions2,  output_dict=True)

print(accuracy2)
print(results2)

with open('result1_10_10.json', 'w') as f:
    json.dump(results2, f, indent = 4)

0.8597627594818168
{'NOT ENOUGH INFO': {'precision': 0.998533724340176, 'recall': 0.9994129732902847, 'f1-score': 0.9989731553469268, 'support': 3407}, 'REFUTES': {'precision': 0.7254366812227074, 'recall': 0.5072519083969466, 'f1-score': 0.5970350404312668, 'support': 2620}, 'SUPPORTS': {'precision': 0.8297675647120972, 'recall': 0.9257403860321203, 'f1-score': 0.875130580123964, 'support': 6787}, 'accuracy': 0.8597627594818168, 'macro avg': {'precision': 0.8512459900916601, 'recall': 0.8108017559064505, 'f1-score': 0.8237129253007192, 'support': 12814}, 'weighted avg': {'precision': 0.8533073954527451, 'recall': 0.8597627594818168, 'f1-score': 0.8511974866160639, 'support': 12814}}


In [36]:
dump(clf2, 'MLPClassifier_lbfgs_10_10.joblib') 

['MLPClassifier_lbfgs_10_10.joblib']

In [21]:
### MLP Classifier 1 hidden layers 100 nodes ###

clf3 = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(100,), random_state=1)

clf3.fit(training_data, training_label) 

predictions3 = clf3.predict(heldout_data)

In [22]:
accuracy3 = accuracy_score(heldout_label,predictions3)
results3 = classification_report(heldout_label,predictions3,  output_dict=True)

print(accuracy3)
print(results3)

with open('result1_100.json', 'w') as f:
    json.dump(results3, f, indent = 4)

0.8667863274543468
{'NOT ENOUGH INFO': {'precision': 0.9974055923897377, 'recall': 0.9979809633689068, 'f1-score': 0.9976931949250288, 'support': 3467}, 'REFUTES': {'precision': 0.7014242115971516, 'recall': 0.5522627152583099, 'f1-score': 0.6179699753529017, 'support': 2497}, 'SUPPORTS': {'precision': 0.8494375931698062, 'recall': 0.915036496350365, 'f1-score': 0.8810176400309228, 'support': 6850}, 'accuracy': 0.8667863274543468, 'macro avg': {'precision': 0.8494224657188986, 'recall': 0.8217600583258605, 'f1-score': 0.8322269367696178, 'support': 12814}, 'weighted avg': {'precision': 0.8606296986410552, 'recall': 0.8667863274543468, 'f1-score': 0.861326999334563, 'support': 12814}}


In [31]:
dump(clf3, 'MLPClassifier_lbfgs_100.joblib') 

['MLPClassifier_lbfgs_100.joblib']

In [32]:
MLPClassifier_lbfgs_100 = load('MLPClassifier_lbfgs_100.joblib') 

In [33]:
predictions3_ = MLPClassifier_lbfgs_100.predict(heldout_data)
accuracy3_ = accuracy_score(heldout_label,predictions3_)
results3_ = classification_report(heldout_label,predictions3_,  output_dict=True)

print(accuracy3_)
print(results3_)

0.8637427813329172
{'NOT ENOUGH INFO': {'precision': 0.9988249118683902, 'recall': 0.9979454065159965, 'f1-score': 0.9983849654969902, 'support': 3407}, 'REFUTES': {'precision': 0.71338199513382, 'recall': 0.5595419847328245, 'f1-score': 0.6271657754010697, 'support': 2620}, 'SUPPORTS': {'precision': 0.8432358939496941, 'recall': 0.9138058052158539, 'f1-score': 0.8771036628482534, 'support': 6787}, 'accuracy': 0.8637427813329172, 'macro avg': {'precision': 0.851814266983968, 'recall': 0.8237643988215583, 'f1-score': 0.8342181345821045, 'support': 12814}, 'weighted avg': {'precision': 0.8580536377573581, 'recall': 0.8637427813329172, 'f1-score': 0.8582467979358626, 'support': 12814}}


In [15]:
claim_index_refutes = []
for i in claim_index:
    if label[i] == 'REFUTES':
        claim_index_refutes.append(i)


In [16]:
len(claim_index_refutes)

26395

In [17]:
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler




In [18]:
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(training_data, training_label)

In [22]:
print(Counter(y_resampled))
print(len(y_resampled))

Counter({'NOT ENOUGH INFO': 63993, 'SUPPORTS': 63993, 'REFUTES': 63993})
191979


In [23]:
### MLP Classifier 1 hidden layers 20 nodes ###

clf4 = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(20,), random_state=1)

clf4.fit(training_data, training_label) 

predictions4 = clf4.predict(heldout_data)

In [26]:
accuracy4 = accuracy_score(heldout_label,predictions4)
results4 = classification_report(heldout_label,predictions4,  output_dict=True)

print(accuracy4)
print(results4)

with open('result1_20.json', 'w') as f:
    json.dump(results4, f, indent = 4)

0.8597627594818168
{'NOT ENOUGH INFO': {'precision': 0.9982409850483729, 'recall': 0.9994129732902847, 'f1-score': 0.9988266353769434, 'support': 3407}, 'REFUTES': {'precision': 0.7205333333333334, 'recall': 0.5156488549618321, 'f1-score': 0.6011123470522803, 'support': 2620}, 'SUPPORTS': {'precision': 0.8316950053134963, 'recall': 0.9224988949462207, 'f1-score': 0.8747467691232974, 'support': 6787}, 'accuracy': 0.8597627594818168, 'macro avg': {'precision': 0.8501564412317341, 'recall': 0.8125202410661125, 'f1-score': 0.824895250517507, 'support': 12814}, 'weighted avg': {'precision': 0.853247882820028, 'recall': 0.8597627594818168, 'f1-score': 0.85178890417091, 'support': 12814}}


In [39]:
dump(clf4, 'MLPClassifier_lbfgs_20.joblib') 

['MLPClassifier_lbfgs_20.joblib']

In [27]:
### MLP Classifier 1 hidden layers 20 nodes ###

clf5 = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(20,), random_state=1)

clf5.fit(X_resampled, y_resampled) 

predictions5 = clf5.predict(heldout_data)

In [28]:
accuracy5 = accuracy_score(heldout_label,predictions5)
results5 = classification_report(heldout_label,predictions5,  output_dict=True)

print(accuracy5)
print(results5)

with open('result1_20_bootstrap.json', 'w') as f:
    json.dump(results5, f, indent = 4)

0.8273763071640393
{'NOT ENOUGH INFO': {'precision': 0.9991186839012925, 'recall': 0.9982389198708541, 'f1-score': 0.9986786081339011, 'support': 3407}, 'REFUTES': {'precision': 0.5674525212835625, 'recall': 0.6614503816793893, 'f1-score': 0.6108565385971096, 'support': 2620}, 'SUPPORTS': {'precision': 0.8602894902454373, 'recall': 0.8056578753499337, 'f1-score': 0.8320779121966065, 'support': 6787}, 'accuracy': 0.8273763071640393, 'macro avg': {'precision': 0.8089535651434309, 'recall': 0.821782392300059, 'f1-score': 0.813871019642539, 'support': 12814}, 'weighted avg': {'precision': 0.837326965202936, 'recall': 0.8273763071640393, 'f1-score': 0.8311421054405334, 'support': 12814}}


In [40]:
dump(clf5, 'MLPClassifier_lbfgs_20_bootstrap.joblib') 

['MLPClassifier_lbfgs_20_bootstrap.joblib']

In [19]:
### MLP Classifier 3 hidden layers @ 10 nodes ###

clf6 = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(10,10,10), random_state=1)

clf6.fit(training_data, training_label) 

predictions6 = clf6.predict(heldout_data)

In [20]:
accuracy6 = accuracy_score(heldout_label,predictions6)
results6 = classification_report(heldout_label,predictions6,  output_dict=True)

print(accuracy6)
print(results6)

0.8614015920087404
{'NOT ENOUGH INFO': {'precision': 0.9959689029657357, 'recall': 0.997692529564465, 'f1-score': 0.9968299711815563, 'support': 3467}, 'REFUTES': {'precision': 0.6842900302114804, 'recall': 0.5442531037244693, 'f1-score': 0.60629043051528, 'support': 2497}, 'SUPPORTS': {'precision': 0.8456832087015635, 'recall': 0.908029197080292, 'f1-score': 0.8757479760647661, 'support': 6850}, 'accuracy': 0.8614015920087404, 'macro avg': {'precision': 0.8419807139595932, 'recall': 0.8166582767897421, 'f1-score': 0.8262894592538674, 'support': 12814}, 'weighted avg': {'precision': 0.8548951437198362, 'recall': 0.8614015920087404, 'f1-score': 0.8560004956396721, 'support': 12814}}


In [23]:
from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer(smooth_idf=False,norm=None)
training_matrix_tfidf = transformer.fit_transform(training_data)

from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=2000)
training_matrix_lowrank = svd.fit_transform(training_matrix_tfidf)

heldout_data = vectorizer.transform(heldout_set)
heldout_matrix_tfidf = transformer.fit_transform(heldout_data)
heldout_matrix_lowrank = svd.fit_transform(heldout_matrix_tfidf)


  idf = np.log(n_samples / df) + 1


In [46]:
#MLP Classifier 1 hidden layers 100 nodes on tfidf ###

clf3.fit(training_matrix_tfidf, training_label) 


MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [47]:
predictions7 = clf3.predict(heldout_matrix_tfidf)

accuracy7 = accuracy_score(heldout_label,predictions7)
results7 = classification_report(heldout_label,predictions7,  output_dict=True)

print(accuracy7)
print(results7)

0.8708443889495864
{'NOT ENOUGH INFO': {'precision': 0.9976838448176027, 'recall': 0.9939428901067205, 'f1-score': 0.9958098540673312, 'support': 3467}, 'REFUTES': {'precision': 0.69124218051832, 'recall': 0.6195434521425711, 'f1-score': 0.6534318901795143, 'support': 2497}, 'SUPPORTS': {'precision': 0.8657680426846391, 'recall': 0.9001459854014598, 'f1-score': 0.8826223876324076, 'support': 6850}, 'accuracy': 0.8708443889495864, 'macro avg': {'precision': 0.8515646893401873, 'recall': 0.8378774425502504, 'f1-score': 0.8439547106264177, 'support': 12814}, 'weighted avg': {'precision': 0.8674506560891722, 'recall': 0.8708443889495864, 'f1-score': 0.8685855743024565, 'support': 12814}}


In [48]:
dump(clf3, 'MLPClassifier_lbfgs_100_tfidf.joblib') 

['MLPClassifier_lbfgs_100_tfidf.joblib']

In [26]:
#MLP Classifier 1 hidden layers 100 nodes on tfidf ###

clf3.fit(training_matrix_lowrank, training_label) 


MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [27]:
predictions8 = clf3.predict(heldout_matrix_lowrank)
accuracy8 = accuracy_score(heldout_label,predictions8)
results8 = classification_report(heldout_label,predictions8,  output_dict=True)

print(accuracy8)
print(results8)

0.444982050881848
{'NOT ENOUGH INFO': {'precision': 0.4765450483991065, 'recall': 0.5537929045284107, 'f1-score': 0.512273212379936, 'support': 3467}, 'REFUTES': {'precision': 0.23294723294723294, 'recall': 0.36243492190628757, 'f1-score': 0.28361015355687874, 'support': 2497}, 'SUPPORTS': {'precision': 0.5871428571428572, 'recall': 0.42, 'f1-score': 0.4897021276595745, 'support': 6850}, 'accuracy': 0.444982050881848, 'macro avg': {'precision': 0.43221171282973225, 'recall': 0.4454092754782328, 'f1-score': 0.4285284978654631, 'support': 12814}, 'weighted avg': {'precision': 0.4881988055952485, 'recall': 0.444982050881848, 'f1-score': 0.45564892736232626, 'support': 12814}}


In [None]:
# vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
# bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),
# ...                                     token_pattern=r'\b\w+\b', min_df=1)

<h3> Test Development on Top 5 Sentences By Cosine Similarity</h3>

In [18]:
def get_match2(query,pagesize):
    query = queryparser.parse_query(query)
    enquire.set_query(query)
    matches = enquire.get_mset(0, pagesize)

    query_results = []
    doc_title = []
    for match in matches:
        result = dict(
            found_doc = get_doc_id(match),
            rank = match.rank + 1, 
            percent = match.percent,
            weight = match.weight,
            docid = match.docid,
            text = match.document.get_data()
        )
        query_results.append(result)
        doc_title.append(get_doc_id(match))
    return query_results , doc_title

def similarity(claim, wiki):     
    claim_wiki = [claim,wiki]
    vectorizer = CountVectorizer(claim_wiki)
    try:
        vectorizer.fit(claim_wiki)
        vectors = [vector for vector in vectorizer.transform(claim_wiki).toarray()]
        similarity = cosine_similarity(vectors)[1][0]
    except:
        similarity = 0
    return similarity

In [23]:
def get_entity_list(document):
    doc = nlp_large(document)
    combine_entity = set()
    for entity in doc.ents:
        combine_entity.add(str(entity).lower().strip())
    return list(combine_entity)

def capital(text):
    result = re.findall(r"[A-Z][\S]+", text)
    result = [w.lower() for w in result]
    return result

def entity_capital(text):
    text = replace_punctuation(text)
    entities = get_entity_list(text)
    capitals = capital(text)
    entities.extend(capitals)
    entities = set(entities)
    entities = remove_stopwords(entities)
    return entities

def document_entities(result):
    docid = result['found_doc']
    pattern = re.compile(r"("+docid+"\s\d+\s)")
    doc = result['text'].decode("utf-8")
    doc = replace_punctuation(doc)
    doc = re.sub(pattern,"",doc)
    doc_entities = entity_capital(doc)
    return doc_entities

def intersection(list1, list2): 
    list3 = [value for value in list1 if value in list2] 
    return len(list3) 

def similarity_list(claim, wiki):     
    claim_wiki = [' '.join(claim).strip(),' '.join(wiki).strip()]
    vectorizer = CountVectorizer(claim_wiki)
    try:
        vectorizer.fit(claim_wiki)
        vectors = [vector for vector in vectorizer.transform(claim_wiki).toarray()]
        similarity = cosine_similarity(vectors)[1][0]
    except:
        similarity = 0
    return similarity

In [37]:
print("### GET FEATURE LIST ###")
      
      
feature_list_path2 = os.path.join(os.path.dirname(os.path.realpath('__file__')), "features_fulldoc.json")
with open(feature_list_path) as json_file:  
    feature_list_dict2 = json.load(json_file)
    
feature_list2 = []
claim_index2 = []
for key in feature_list_dict2:
    feature_list2.append(feature_list_dict2[key])
    claim_index2.append(int(key))
    
label_list2 = []
for i in claim_index2:
    label_list2.append(label[i])
    
weight_path2 = os.path.join(os.path.dirname(os.path.realpath('__file__')), "weight_fulldoc.json")
with open(weight_path2) as json_file:  
    weight_dict2 = json.load(json_file)
    
weight_list2 = []
for i in claim_index2:
    weight_list2.append(weight_dict2[str(i)])   

### GET FEATURE LIST ###


In [38]:
 print("### RANDOM LIST ###")
          
    
import random

combined2 = list(zip(claim_index2, feature_list2, label_list2, weight_list2))
random.shuffle(combined2)
claim_index2, feature_list2, label_list2, weight_list2 = zip(*combined2)

training_set2 = feature_list2[:120000]
training_label2 = label_list2[:120000]
heldout_set2 = feature_list2[120000:]
heldout_label2 = label_list2[120000:]

print("### RUN VECTORIZER ###")

vectorizer2 = DictVectorizer()
training_data2 = vectorizer2.fit_transform(training_set2)
heldout_data2 = vectorizer2.transform(heldout_set2)

### RANDOM LIST ###
### RUN VECTORIZER ###


In [None]:
### MLP Classifier 1 hidden layers 100 nodes ###

clf_100 = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(100,), random_state=1)

clf_100.fit(training_data2, training_label2) 

predictions_100 = clf_100.predict(heldout_data2)

accuracy_100 = accuracy_score(heldout_label2,predictions_100)
results_100 = classification_report(heldout_label2,predictions_100,  output_dict=True)

print(accuracy_100)
print(results_100)

<h3> Document and Sentence Selection </h3>

In [19]:
import spacy
import en_vectors_web_lg

In [20]:
nlp_vector = en_vectors_web_lg.load() 

In [None]:
features_dev = defaultdict(Counter)
evidence_dev = []
evidence_weight = []
features_sentence = []
for i in range(len(dclaim)):
#for i in range(0,20):    
    norm_query = replace_punctuation(dclaim[i].lower())  
    claim_entities = entity_capital(dclaim[i])
    claim_vector = nlp_vector(norm_query)
#     query = norm_query + ' ' + ' '.join(claim_entities).strip()
#     query = ' '.join(remove_stopwords(query.split())).strip()
    query_results, doc_title  = get_match2(norm_query,50)

    #print(doc_title)
    docid_chosen = []
    
    # get from IR first #
    for ir in range(0,2):
        docid_chosen.append(doc_title[ir])
        
    len_ir = len(docid_chosen)
    
    #print(docid_chosen)    
    
    # get from title and entities #
    
    title_similarity = []
    doc_title_title = []
    entity_similarity = []
    doc_title_entity = []
    for j in range(len(doc_title)):
        norm_doc_title = doc_title[j].replace("_"," ")
        title_entities = entity_capital(norm_doc_title)
        cos_sim_entity = similarity_list(claim_entities,title_entities) 
        cos_sim_token = similarity(norm_query,norm_doc_title.lower()) 
        if cos_sim_token > 0:
            title_similarity.append(cos_sim_token)
            doc_title_title.append(doc_title[j])
        if cos_sim_entity > 0:
            entity_similarity.append(cos_sim_entity)
            doc_title_entity.append(doc_title[j])
        
        #print(doc_title[j], title_entities, cos_sim_entity, cos_sim_token) 
    
    if len(entity_similarity) > 0:
        
        combined_entity = list(zip(entity_similarity,doc_title_entity))
        
        # rerank based on entities and capital words
    
        combined_entity = sorted(combined_entity, key=lambda x: x[0], reverse=True)   
#         print("")       
#         print(combined_entity)
        limit = min(len(entity_similarity),2)
        entity_filtered = combined_entity[:limit]
        entity_similarity, docid_entity = list(zip(*entity_filtered))   

        for ent in range(0,limit):
            if docid_entity[ent] not in docid_chosen:
                docid_chosen.append(docid_entity[ent])

        len_entity = len(docid_chosen)
        
    if len(title_similarity) > 0:    
        combined_title = list(zip(title_similarity,doc_title_title))

        # rerank based on title tokens

        combined_title = sorted(combined_title, key=lambda x: x[0], reverse=True)   
        
#         print("")      
#         print("sorted title")
#         print(combined_title)
        limit = min(len(title_similarity),2)
        title_filtered = combined_title[:limit]
        title_similarity, docid_title = list(zip(*title_filtered))

        #print(docid_title)

        for tl in range(0,limit):
            if docid_title[tl] not in docid_chosen:
                docid_chosen.append(docid_title[tl])   

        len_title = len(docid_chosen)
    
    results = []
#     print(dclaim[i],devidence[i])
#     print(devidence[i])
#     print(docid_chosen)
    
    for m in range(len(query_results)):
        if query_results[m]['found_doc'] in docid_chosen:             
            results.append(query_results[m])       
    #print(results)
    evidence_list = []
    sim_score = []
    sentence_list = []
    weight_list = []
    for k in range(len(results)):
        doc_id = results[k]['found_doc']
        weight = results[k]['weight']        
        text = results[k]['text'].decode("utf-8").lower()           
        textsplit = text.split('\\n')  
#         print(len(textsplit))
#         print(textsplit)
        if len(textsplit) > 0:
            for l in range(len(textsplit)-1):
                norm_split = replace_punctuation(textsplit[l]).strip()  
                sentence_vector = nlp_vector(norm_split)
                confidence = claim_vector.similarity(sentence_vector)
                sent_id = textsplit[l].split(maxsplit=2)[1]         
                evidence_doc_sent = [doc_id,int(sent_id)]
                evidence_list.append(evidence_doc_sent)
                sim_score.append(confidence)
                sentence_list.append(norm_split)  
                weight_list.append(weight)
                            
                    
#     print(len(evidence_list))
#     print("len sim score", len(sim_score))
    
    combine_sent = list(zip(sim_score, evidence_list, sentence_list, weight_list))
#    print(combine_sent)
    combine_sent = sorted(combine_sent, key=lambda x: x[0], reverse=True)
    filtered = list(filter(lambda elems: elems[0] >= 0.9, combine_sent))
    limit = min(len(filtered),5)
    if limit != 0:
        source = "threshold simscore 0.9"
        filtered = filtered[:limit]
        sim_score, evidence_list, sentence_list, weight_list = list(zip(*filtered))
    else:
        filtered2 = list(filter(lambda elems: elems[0] >= 0.8, combine_sent))
        limit = min(len(filtered2),5)
        if limit != 0:
            source = "threshold simscore 0.8"
            filtered2 = filtered2[:limit]
            sim_score, evidence_list, sentence_list, weight_list = list(zip(*filtered2))
        else:
            filtered3 = list(filter(lambda elems: elems[0] >= 0.7, combine_sent))
            limit = min(len(filtered3),5)
            if limit != 0:
                source = "threshold simscore 0.7"
                filtered3 = filtered3[:limit]
                sim_score, evidence_list, sentence_list, weight_list = list(zip(*filtered3))
            else:
                filtered4 = list(filter(lambda elems: elems[0] >= 0.6, combine_sent))
                limit = min(len(filtered4),5)
                if limit != 0:
                    source = "threshold simscore 0.6"
                    filtered4 = filtered4[:limit]
                    sim_score, evidence_list, sentence_list, weight_list = list(zip(*filtered4))
                else:
                    source = "highest simscore"
                    combine_sent = combine_sent[:5]
                    sim_score, evidence_list, sentence_list, weight_list = list(zip(*combine_sent))

#     print(evidence_list)
    for idx in range(len(sentence_list)):
#         print(sim_score[idx], evidence_list[idx])
       
        counter_norm = normalize_counter(sentence_list[idx])  
        features_dev[i].update(counter_norm)   
        
    counter_claim = normalize_counter(norm_query) 
    features_dev[i].update(counter_claim)
    evidence_dev.append(evidence_list)
    evidence_weight.append(list(set(weight_list)))
    print(i, "doc:",len(results), "IR:",len_ir, "-",source, "-","entity:", len_entity, "title:",len_title, "evidence:",len(evidence_list))    
#     print("")


In [31]:
dclaim_index = list(features_dev.keys())
dlabel_list = []
for i in dclaim_index:
    dlabel_list.append(dlabel[i])
    
print(len(features_dev))
print(len(dclaim_index))
print(len(dlabel_list))
print(len(evidence_dev))

5001
5001
5001
5001


In [32]:
feature_list_dev = []
for key in features_dev:
    feature_list_dev.append(features_dev[key])
    
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(smooth_idf=False,norm=None)

vectorizer = DictVectorizer()
training_data = vectorizer.fit_transform(training_set)
dev_data = vectorizer.transform(feature_list_dev)

training_matrix_tfidf = transformer.fit_transform(training_data)
dev_matrix_tfidf = transformer.fit_transform(dev_data)    
    

In [34]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(smooth_idf=False,norm=None)

vectorizer = DictVectorizer()
training_data = vectorizer.fit_transform(training_set)
dev_data = vectorizer.transform(feature_list_dev)

training_matrix_tfidf = transformer.fit_transform(training_data)
dev_matrix_tfidf = transformer.fit_transform(dev_data)

  idf = np.log(n_samples / df) + 1


In [35]:
clf3 = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(100,), random_state=1)

clf3.fit(training_data, training_label) 

predictions = clf3.predict(dev_matrix_tfidf)
#predictions
print(accuracy_score(dlabel_list,predictions))
print(classification_report(dlabel_list,predictions))

0.35692861427714456
                 precision    recall  f1-score   support

NOT ENOUGH INFO       0.33      0.67      0.44      1667
        REFUTES       0.45      0.19      0.27      1667
       SUPPORTS       0.39      0.21      0.27      1667

       accuracy                           0.36      5001
      macro avg       0.39      0.36      0.33      5001
   weighted avg       0.39      0.36      0.33      5001



In [36]:
dev_set_predictions = {}
for i in range(len(drowid)):
    dev_set_predictions[drowid[i]] = {"claim": dclaim[i], "label":predictions[i], "evidence":evidence_dev[i]}
    
with open('dev_set_predictions17.json', 'w') as f:
    json.dump(dev_set_predictions, f, indent = 4) 

In [39]:

dev_data2 = vectorizer2.transform(feature_list_dev)

training_matrix_tfidf2 = transformer.fit_transform(training_data2)
dev_matrix_tfidf2 = transformer.fit_transform(dev_data2)


clf_100 = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(100,), random_state=1)

clf_100.fit(training_matrix_tfidf2, training_label2) 

predictions2 = clf_100.predict(dev_matrix_tfidf2)
#predictions
print(accuracy_score(dlabel_list,predictions2))
print(classification_report(dlabel_list,predictions2))

  idf = np.log(n_samples / df) + 1


0.3639272145570886
                 precision    recall  f1-score   support

NOT ENOUGH INFO       0.34      0.67      0.45      1667
        REFUTES       0.42      0.18      0.25      1667
       SUPPORTS       0.42      0.24      0.31      1667

       accuracy                           0.36      5001
      macro avg       0.39      0.36      0.33      5001
   weighted avg       0.39      0.36      0.33      5001



In [40]:
dev_set_predictions2 = {}
for i in range(len(drowid)):
    dev_set_predictions2[drowid[i]] = {"claim": dclaim[i], "label":predictions2[i], "evidence":evidence_dev[i]}
    
with open('dev_set_predictions18.json', 'w') as f:
    json.dump(dev_set_predictions2, f, indent = 4) 

In [41]:

clf_200 = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(200,), random_state=1)

clf_200.fit(training_matrix_tfidf2, training_label2) 

predictions3 = clf_200.predict(dev_matrix_tfidf2)
#predictions
print(accuracy_score(dlabel_list,predictions3))
print(classification_report(dlabel_list,predictions3))

0.36112777444511096
                 precision    recall  f1-score   support

NOT ENOUGH INFO       0.33      0.57      0.42      1667
        REFUTES       0.40      0.23      0.29      1667
       SUPPORTS       0.40      0.28      0.33      1667

       accuracy                           0.36      5001
      macro avg       0.38      0.36      0.35      5001
   weighted avg       0.38      0.36      0.35      5001



In [42]:

clf_300 = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(300,), random_state=1)

clf_300.fit(training_matrix_tfidf2, training_label2) 

predictions4 = clf_300.predict(dev_matrix_tfidf2)
#predictions
print(accuracy_score(dlabel_list,predictions4))
print(classification_report(dlabel_list,predictions4))

0.3667266546690662
                 precision    recall  f1-score   support

NOT ENOUGH INFO       0.34      0.63      0.44      1667
        REFUTES       0.42      0.22      0.29      1667
       SUPPORTS       0.41      0.24      0.31      1667

       accuracy                           0.37      5001
      macro avg       0.39      0.37      0.35      5001
   weighted avg       0.39      0.37      0.35      5001



In [43]:
dev_set_predictions4 = {}
for i in range(len(drowid)):
    dev_set_predictions4[drowid[i]] = {"claim": dclaim[i], "label":predictions4[i], "evidence":evidence_dev[i]}
    
with open('dev_set_predictions20.json', 'w') as f:
    json.dump(dev_set_predictions4, f, indent = 4) 

In [44]:
dump(clf_300, 'MLPClassifier_lbfgs_300.joblib') 

['MLPClassifier_lbfgs_300.joblib']

In [None]:
features_dev = defaultdict(Counter)
evidence_dev = []
evidence_weight = []
features_sentence = []
for i in range(len(dclaim)):
#for i in range(0,20):    
    norm_query = replace_punctuation(dclaim[i].lower())  
    claim_entities = entity_capital(dclaim[i])
    claim_vector = nlp_vector(norm_query)
#     query = norm_query + ' ' + ' '.join(claim_entities).strip()
#     query = ' '.join(remove_stopwords(query.split())).strip()
    query_results, doc_title  = get_match2(norm_query,50)

    #print(doc_title)
    docid_chosen = []
    
    # get from IR first #
    for ir in range(0,2):
        docid_chosen.append(doc_title[ir])
        
    len_ir = len(docid_chosen)
    
    #print(docid_chosen)    
    
    # get from title and entities #
    
    title_similarity = []
    doc_title_title = []
    entity_similarity = []
    doc_title_entity = []
    for j in range(len(doc_title)):
        norm_doc_title = doc_title[j].replace("_"," ")
        title_entities = entity_capital(norm_doc_title)
        cos_sim_entity = similarity_list(claim_entities,title_entities) 
        cos_sim_token = similarity(norm_query,norm_doc_title.lower()) 
        if cos_sim_token > 0:
            title_similarity.append(cos_sim_token)
            doc_title_title.append(doc_title[j])
        if cos_sim_entity > 0:
            entity_similarity.append(cos_sim_entity)
            doc_title_entity.append(doc_title[j])
        
        #print(doc_title[j], title_entities, cos_sim_entity, cos_sim_token) 
    
    if len(entity_similarity) > 0:
        
        combined_entity = list(zip(entity_similarity,doc_title_entity))
        
        # rerank based on entities and capital words
    
        combined_entity = sorted(combined_entity, key=lambda x: x[0], reverse=True)   
#         print("")       
#         print(combined_entity)
        limit = min(len(entity_similarity),2)
        entity_filtered = combined_entity[:limit]
        entity_similarity, docid_entity = list(zip(*entity_filtered))   

        for ent in range(0,limit):
            if docid_entity[ent] not in docid_chosen:
                docid_chosen.append(docid_entity[ent])

        len_entity = len(docid_chosen)
        
    if len(title_similarity) > 0:    
        combined_title = list(zip(title_similarity,doc_title_title))

        # rerank based on title tokens

        combined_title = sorted(combined_title, key=lambda x: x[0], reverse=True)   
        
#         print("")      
#         print("sorted title")
#         print(combined_title)
        limit = min(len(title_similarity),2)
        title_filtered = combined_title[:limit]
        title_similarity, docid_title = list(zip(*title_filtered))

        #print(docid_title)

        for tl in range(0,limit):
            if docid_title[tl] not in docid_chosen:
                docid_chosen.append(docid_title[tl])   

        len_title = len(docid_chosen)
    
    results = []
#     print(dclaim[i],devidence[i])
#     print(devidence[i])
#     print(docid_chosen)
    
    for m in range(len(query_results)):
        if query_results[m]['found_doc'] in docid_chosen:             
            results.append(query_results[m])       
    #print(results)
    evidence_list = []
    sim_score = []
    sentence_list = []
    weight_list = []
    for k in range(len(results)):
        doc_id = results[k]['found_doc']
        weight = results[k]['weight']        
        text = results[k]['text'].decode("utf-8").lower()           
        textsplit = text.split('\\n')  
#         print(len(textsplit))
#         print(textsplit)
        if len(textsplit) > 0:
            for l in range(len(textsplit)-1):
                norm_split = replace_punctuation(textsplit[l]).strip()  
                sentence_vector = nlp_vector(norm_split)
                confidence = claim_vector.similarity(sentence_vector)
                sent_id = textsplit[l].split(maxsplit=2)[1]         
                evidence_doc_sent = [doc_id,int(sent_id)]
                evidence_list.append(evidence_doc_sent)
                sim_score.append(confidence)
                sentence_list.append(norm_split)  
                weight_list.append(weight)
                            
                    
#     print(len(evidence_list))
#     print("len sim score", len(sim_score))
    
    combine_sent = list(zip(sim_score, evidence_list, sentence_list, weight_list))
#    print(combine_sent)
    combine_sent = sorted(combine_sent, key=lambda x: x[0], reverse=True)
    filtered = list(filter(lambda elems: elems[0] >= 0.95, combine_sent))
    limit = min(len(filtered),5)
    if limit != 0:
        source = "threshold simscore 0.95"
        filtered = filtered[:limit]
        sim_score, evidence_list, sentence_list, weight_list = list(zip(*filtered))
    else:
        filtered2 = list(filter(lambda elems: elems[0] >= 0.9, combine_sent))
        limit = min(len(filtered2),5)
        if limit != 0:
            source = "threshold simscore 0.9"
            filtered2 = filtered2[:limit]
            sim_score, evidence_list, sentence_list, weight_list = list(zip(*filtered2))
        else:
            filtered3 = list(filter(lambda elems: elems[0] >= 0.85, combine_sent))
            limit = min(len(filtered3),5)
            if limit != 0:
                source = "threshold simscore 0.85"
                filtered3 = filtered3[:limit]
                sim_score, evidence_list, sentence_list, weight_list = list(zip(*filtered3))
            else:
                filtered4 = list(filter(lambda elems: elems[0] >= 0.8, combine_sent))
                limit = min(len(filtered4),5)
                if limit != 0:
                    source = "threshold simscore 0.8"
                    filtered4 = filtered4[:limit]
                    sim_score, evidence_list, sentence_list, weight_list = list(zip(*filtered4))
                else:
                    filtered5 = list(filter(lambda elems: elems[0] >= 0.75, combine_sent))
                    limit = min(len(filtered5),5)
                    if limit != 0:
                        source = "threshold simscore 0.75"
                        filtered5 = filtered5[:limit]
                        sim_score, evidence_list, sentence_list, weight_list = list(zip(*filtered5))
                    else:
                        filtered6 = list(filter(lambda elems: elems[0] >= 0.7, combine_sent))
                        limit = min(len(filtered6),5)
                        if limit != 0:
                            source = "threshold simscore 0.7"
                            filtered6 = filtered6[:limit]
                            sim_score, evidence_list, sentence_list, weight_list = list(zip(*filtered6))
                        else:
                            filtered7 = list(filter(lambda elems: elems[0] >= 0.6, combine_sent))
                            limit = min(len(filtered7),5)
                            if limit != 0:
                                source = "threshold simscore 0.6"
                                filtered7 = filtered7[:limit]
                                sim_score, evidence_list, sentence_list, weight_list = list(zip(*filtered7))

                            else:
                                source = "highest simscore"
                                combine_sent = combine_sent[:5]
                                sim_score, evidence_list, sentence_list, weight_list = list(zip(*combine_sent))

#     print(evidence_list)
    for idx in range(len(sentence_list)):
#         print(sim_score[idx], evidence_list[idx])
       
        counter_norm = normalize_counter(sentence_list[idx])  
        features_dev[i].update(counter_norm)   
        
    counter_claim = normalize_counter(norm_query) 
    features_dev[i].update(counter_claim)
    evidence_dev.append(evidence_list)
    evidence_weight.append(list(set(weight_list)))
    print(i, "doc:",len(results), "IR:",len_ir, "-",source, "-","entity:", len_entity, "title:",len_title, "evidence:",len(evidence_list))    
#     print("")


In [46]:
dclaim_index = list(features_dev.keys())
dlabel_list = []
for i in dclaim_index:
    dlabel_list.append(dlabel[i])
    
print(len(features_dev))
print(len(dclaim_index))
print(len(dlabel_list))
print(len(evidence_dev))

dev_data2 = vectorizer2.transform(feature_list_dev)
dev_matrix_tfidf2 = transformer.fit_transform(dev_data2)

predictions5 = clf_300.predict(dev_matrix_tfidf2)
#predictions
print(accuracy_score(dlabel_list,predictions5))
print(classification_report(dlabel_list,predictions5))

dev_set_predictions21 = {}
for i in range(len(drowid)):
    dev_set_predictions21[drowid[i]] = {"claim": dclaim[i], "label":predictions5[i], "evidence":evidence_dev[i]}
with open('dev_set_predictions21.json', 'w') as f:
    json.dump(dev_set_predictions21, f)

5001
5001
5001
5001
