In [1]:
import csv
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from score import report_score
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse import hstack as sparse_hstack
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.callbacks import EarlyStopping



# Column names of the stance CSV files.
FIELDNAMES = ["Headline", "Body ID", "Stance"]

# Stance classes; a label's position in this list is its integer class id.
Labels = [
    "agree",
    "disagree",
    "discuss",
    "unrelated",
]

def read_bodies(path):
    """Load an article-bodies CSV and index its rows by body id.

    Parameters
    ----------
    path : str or path-like
        CSV file containing at least a 'Body ID' column.

    Returns
    -------
    tuple of (pandas.DataFrame, dict)
        The loaded frame, and a dict mapping each ``int`` 'Body ID' to the
        index label of its row. When an id occurs more than once, the last
        row wins.
    """
    df = pd.read_csv(path)
    # head must be *called*: printing the bare attribute emits the
    # bound-method repr of the whole frame rather than a short preview.
    print(df.head())
    # Walk the id column directly; DataFrame.iterrows() would build a full
    # Series for every row just to read a single value.
    Dict = {int(body_id): index for index, body_id in zip(df.index, df['Body ID'])}
    return df,Dict

def read_stances(path):
    """Load a stances CSV and encode its 'Stance' column as integer class ids.

    Parameters
    ----------
    path : str or path-like
        CSV file containing at least a 'Stance' column.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with every 'Stance' value that appears in ``Labels``
        replaced by its position in that list. Values not found in ``Labels``
        are left unchanged.
    """
    df = pd.read_csv(path)
    di = {label: class_id for class_id, label in enumerate(Labels)}
    # Assign the mapped column back to the frame. Calling
    # replace(..., inplace=True) on the df['Stance'] selection mutates a
    # temporary object under pandas copy-on-write and leaves df untouched.
    df['Stance'] = df['Stance'].map(lambda stance: di.get(stance, stance))
    # head must be *called*; printing the bare attribute dumps the
    # bound-method repr of the whole frame.
    print(df.head())
    return df

# Locations of the FNC-1 CSV files, relative to the notebook's working directory.
trainBodiesPath = "../fnc-1-baseline-master/fnc-1/train_bodies.csv"
trainStancesPath = "../fnc-1-baseline-master/fnc-1/train_stances.csv"
testBodiesPath = "../fnc-1-baseline-master/fnc-1/competition_test_bodies.csv"
testStancesPath = "../fnc-1-baseline-master/fnc-1/competition_test_stances.csv"
# Training split: article bodies plus a {Body ID -> row index} lookup, and the
# stance rows with their labels already converted to integer class ids.
train_articles_df,train_dic_articleId_index = read_bodies(trainBodiesPath)
train_stances_df = read_stances(trainStancesPath)
# Double brackets keep the selection two-dimensional, i.e. one column per row.
train_labels = train_stances_df[['Stance']].values

# Test split, loaded the same way as the training split.
test_articles_df,test_dic_articleId_index = read_bodies(testBodiesPath)
test_stances_df = read_stances(testStancesPath)
test_labels = test_stances_df[['Stance']].values

<bound method NDFrame.head of       Body ID                                        articleBody
0           0  A small meteorite crashed into a wooded area i...
1           4  Last week we hinted at what was to come as Ebo...
2           5  (NEWSER) – Wonder how long a Quarter Pounder w...
3           6  Posting photos of a gun-toting child online, I...
4           7  At least 25 suspected Boko Haram insurgents we...
...       ...                                                ...
1678     2528  Intelligence agencies hunting for identity of ...
1679     2529  While Daleks "know no fear" and "must not fear...
1680     2530  More than 200 schoolgirls were kidnapped in Ap...
1681     2531  A Guantanamo Bay prisoner released last year a...
1682     2532  ANN ARBOR, Mich. – A pizza delivery man in Mic...

[1683 rows x 2 columns]>
<bound method NDFrame.head of                                                 Headline  Body ID  Stance
0      Police find mass graves with at least '15 bodi...    

In [2]:
# English stop words handed to both vectorizers below via `stop_words`, so
# these tokens never enter the 5000-term vocabularies.
stopWords = [
        "a", "about", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along",
        "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
        "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at", "back", "be",
        "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being",
        "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "co",
        "con", "could", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight",
        "either", "eleven", "else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
        "everything", "everywhere", "except", "few", "fifteen", "fifty", "fill", "find", "fire", "first", "five", "for",
        "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had",
        "has", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself",
        "him", "himself", "his", "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed", "interest",
        "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made",
        "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much",
        "must", "my", "myself", "name", "namely", "neither", "nevertheless", "next", "nine", "nobody", "now", "nowhere",
        "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours",
        "ourselves", "out", "over", "own", "part", "per", "perhaps", "please", "put", "rather", "re", "same", "see",
        "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some",
        "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take",
        "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby",
        "therefore", "therein", "thereupon", "these", "they", "thick", "thin", "third", "this", "those", "though",
        "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve",
        "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what",
        "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon",
        "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will",
        "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves"
        ]

# Pair every stance row with the text of the article body it refers to, so
# that position i of an article list lines up with position i of the
# matching headline list.
#
# The body texts are pulled out once as plain lists and indexed by the row
# position stored in the {Body ID -> row index} lookups. This replaces a
# DataFrame.iterrows() loop that built one Series per stance row and another
# per `.iloc[...]` body lookup.
train_bodies = train_articles_df['articleBody'].to_list()
train_article_list = [
    train_bodies[train_dic_articleId_index[int(body_id)]]
    for body_id in train_stances_df['Body ID']
]

test_bodies = test_articles_df['articleBody'].to_list()
test_article_list = [
    test_bodies[test_dic_articleId_index[int(body_id)]]
    for body_id in test_stances_df['Body ID']
]

train_headlines_list = train_stances_df['Headline'].to_list()
test_headlines_list = test_stances_df['Headline'].to_list()


# Settings shared by the count and TF-IDF vectorizers: unigrams only, custom
# stop-word list, vocabulary capped at 5000 terms.
vectorizer_options = dict(stop_words = stopWords, ngram_range = (1,1), max_features = 5000)

train_corpus = train_article_list + train_headlines_list
test_corpus = test_article_list + test_headlines_list

# Raw term counts, fitted on the training text only.
tf_vec = CountVectorizer(**vectorizer_options)
tf_vec.fit(train_corpus)

# L2-normalised TF-IDF.
# NOTE(review): this vocabulary/IDF is fitted on training AND test text, so
# test-set term statistics leak into the features. Confirm this is intended.
tfidf_vec = TfidfVectorizer(norm = 'l2', **vectorizer_options)
tfidf_vec.fit(train_corpus + test_corpus)

# Headline features.
train_hline_tf, test_hline_tf = (
    tf_vec.transform(headlines)
    for headlines in (train_headlines_list, test_headlines_list)
)
train_hline_tfidf, test_hline_tfidf = (
    tfidf_vec.transform(headlines)
    for headlines in (train_headlines_list, test_headlines_list)
)
for matrix in (train_hline_tf, test_hline_tf, train_hline_tfidf, test_hline_tfidf):
    print(matrix.shape)

# Article-body features, row-aligned with the headline matrices above.
train_article_list_tf, test_article_list_tf = (
    tf_vec.transform(articles)
    for articles in (train_article_list, test_article_list)
)
train_article_list_tfidf, test_article_list_tfidf = (
    tfidf_vec.transform(articles)
    for articles in (train_article_list, test_article_list)
)
for matrix in (train_article_list_tf, test_article_list_tf,
               train_article_list_tfidf, test_article_list_tfidf):
    print(matrix.shape)

In [4]:
def rowwise_cosine_similarity(left, right):
    """Cosine similarity between matching rows of two sparse matrices.

    Parameters
    ----------
    left, right : scipy sparse matrices with the same shape
        Row ``i`` of ``left`` is compared with row ``i`` of ``right``.

    Returns
    -------
    numpy.ndarray of shape (n_rows, 1)
        The cosine similarity of each row pair; 0.0 wherever either row is
        all zeros.
    """
    def _row_sums(matrix):
        # Sparse row sums as a dense (n_rows, 1) column.
        return np.asarray(matrix.sum(axis=1), dtype=float).reshape(-1, 1)

    dots = _row_sums(left.multiply(right))
    norms = np.sqrt(_row_sums(left.multiply(left))) * np.sqrt(_row_sums(right.multiply(right)))
    # Leave the similarity at 0 for empty rows instead of dividing by zero.
    return np.divide(dots, norms, out=np.zeros_like(dots), where=norms != 0)


# One vectorized sparse computation per split replaces a Python loop that
# densified every headline/body row and called sklearn once per pair.
cos_similarity_train = rowwise_cosine_similarity(train_hline_tfidf, train_article_list_tfidf)
# Dense view of the last training pair, printed as a quick sanity check.
A = train_hline_tfidf[-1].todense()
B = train_article_list_tfidf[-1].todense()
print(A)
print(B)
print(cos_similarity_train[:5])

cos_similarity_test = rowwise_cosine_similarity(test_hline_tfidf, test_article_list_tfidf)
# Keep A and B bound to the last test pair: the following cell prints their norms.
A = test_hline_tfidf[-1].todense()
B = test_article_list_tfidf[-1].todense()

print(cos_similarity_test[:5])

[[0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]]
[[0.        ]
 [0.47385964]
 [0.        ]
 [0.        ]
 [0.19934658]]
[[0.        ]
 [0.        ]
 [0.01111459]
 [0.        ]
 [0.        ]]


In [5]:
# Euclidean norms of the dense vectors A and B left bound by the previous cell.
A_mag, B_mag = (np.linalg.norm(vector) for vector in (A, B))
for magnitude in (A_mag, B_mag):
    print(magnitude)

1.0
1.0000000000000002


In [6]:
def build_feature_matrix(hline_tfidf, body_tfidf, cos_sim):
    """Concatenate headline TF-IDF, body TF-IDF and cosine similarity per row.

    Parameters
    ----------
    hline_tfidf, body_tfidf : scipy sparse matrices with the same row count
        TF-IDF features of the headlines and of their article bodies.
    cos_sim : numpy.ndarray of shape (n_rows, 1)
        Cosine similarity of each headline/body pair.

    Returns
    -------
    numpy.ndarray of shape (n_rows, n_hline_terms + n_body_terms + 1)
        Dense feature matrix laid out as [headline | body | similarity].
    """
    # Stack while still sparse and densify once. The width follows from the
    # inputs, so a vocabulary smaller than the 5000-term cap no longer breaks
    # a hard-coded 10001-column buffer.
    stacked = sparse_hstack([hline_tfidf, body_tfidf, csr_matrix(cos_sim)], format='csr')
    return stacked.toarray()


final_train_dense = build_feature_matrix(train_hline_tfidf, train_article_list_tfidf, cos_similarity_train)
print(final_train_dense.shape)


final_test_dense = build_feature_matrix(test_hline_tfidf, test_article_list_tfidf, cos_similarity_test)
print(final_test_dense.shape)

(49972, 10001)
(25413, 10001)


In [7]:
# Take the integer class ids straight from the stance frames so this cell
# gives the same result when re-run; encoding `train_labels` in place would
# one-hot encode the already encoded array on a second run.
train_labels_raw = train_stances_df[['Stance']].values
test_labels_raw = test_stances_df[['Stance']].values
print(train_labels_raw.shape)
print(test_labels_raw.shape)

print(train_labels_raw)
# Fix the category list so there is always one column per entry of Labels,
# in Labels order, even if a split happens to be missing a class.
# The default sparse output is densified with .toarray(), avoiding the
# `sparse=` keyword that newer scikit-learn releases renamed.
train_encoder = OneHotEncoder(categories=[list(range(len(Labels)))])
train_labels = train_encoder.fit_transform(train_labels_raw).toarray()
print(train_labels)


print(test_labels_raw)
# Reuse the fitted encoder: a second encoder fitted on the test labels could
# assign columns differently from the one used for training.
test_encoder = train_encoder
test_labels = test_encoder.transform(test_labels_raw).toarray()
print(test_labels)

(49972, 1)
(25413, 1)
[[3]
 [0]
 [3]
 ...
 [0]
 [2]
 [3]]
[[0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]
[[3]
 [3]
 [3]
 ...
 [1]
 [1]
 [0]]
[[0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 ...
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]]


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [8]:
# Single-hidden-layer classifier over the concatenated feature vector:
# Dense(100, relu) -> Dropout(0.6) -> BatchNormalization -> Dense(4, softmax).
n_features = final_train_dense.shape[1]
model = keras.Sequential([
    layers.Dense(100, input_dim=n_features, activation="relu"),
    layers.Dropout(0.6),
    layers.BatchNormalization(),
    layers.Dense(4, activation="softmax"),
])

# Categorical cross-entropy matches the one-hot label encoding.
opt = keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               1000200   
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
batch_normalization (BatchNo (None, 100)               400       
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 404       
Total params: 1,001,004
Trainable params: 1,000,804
Non-trainable params: 200
_________________________________________________________________


In [9]:
# Optional early stopping, currently disabled:
#ES_callback = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=15)
#model.fit(final_train_dense, train_labels, epochs=90, batch_size=500, validation_split = 0.1, callbacks=[ES_callback])

# Train for a fixed 90 epochs, holding out 10% of the training rows for validation.
fit_options = dict(epochs=90, batch_size=500, validation_split = 0.1)
model.fit(final_train_dense, train_labels, **fit_options)

# Score on the test split; evaluate() returns the loss followed by the
# compiled metric (accuracy).
test_metrics = model.evaluate(final_test_dense, test_labels, verbose=0)
loss, accuracy = test_metrics
print('Accuracy: %f' % (accuracy*100))

Epoch 1/90
Epoch 2/90
Epoch 3/90
Epoch 4/90
Epoch 5/90
Epoch 6/90
Epoch 7/90
Epoch 8/90
Epoch 9/90
Epoch 10/90
Epoch 11/90
Epoch 12/90
Epoch 13/90
Epoch 14/90
Epoch 15/90
Epoch 16/90
Epoch 17/90
Epoch 18/90
Epoch 19/90
Epoch 20/90
Epoch 21/90
Epoch 22/90
Epoch 23/90
Epoch 24/90
Epoch 25/90
Epoch 26/90
Epoch 27/90
Epoch 28/90
Epoch 29/90
Epoch 30/90
Epoch 31/90
Epoch 32/90
Epoch 33/90
Epoch 34/90
Epoch 35/90
Epoch 36/90
Epoch 37/90
Epoch 38/90
Epoch 39/90
Epoch 40/90
Epoch 41/90
Epoch 42/90
Epoch 43/90
Epoch 44/90
Epoch 45/90
Epoch 46/90
Epoch 47/90
Epoch 48/90
Epoch 49/90
Epoch 50/90
Epoch 51/90
Epoch 52/90
Epoch 53/90
Epoch 54/90
Epoch 55/90
Epoch 56/90
Epoch 57/90
Epoch 58/90


Epoch 59/90
Epoch 60/90
Epoch 61/90
Epoch 62/90
Epoch 63/90
Epoch 64/90
Epoch 65/90
Epoch 66/90
Epoch 67/90
Epoch 68/90
Epoch 69/90
Epoch 70/90
Epoch 71/90
Epoch 72/90
Epoch 73/90
Epoch 74/90
Epoch 75/90
Epoch 76/90
Epoch 77/90
Epoch 78/90
Epoch 79/90
Epoch 80/90
Epoch 81/90
Epoch 82/90
Epoch 83/90
Epoch 84/90
Epoch 85/90
Epoch 86/90
Epoch 87/90
Epoch 88/90
Epoch 89/90
Epoch 90/90
Accuracy: 87.026328


In [10]:
# Sequential.predict_classes() is deprecated and absent from newer TensorFlow
# releases; the argmax over the softmax outputs is the documented replacement
# for multi-class models.
ynew = np.argmax(model.predict(final_test_dense), axis=-1)
print(ynew[:10])
# Convert integer class ids back to stance names for the FNC scorer.
predicted = [Labels[int(a)] for a in ynew]
print(predicted[:10])
y = test_stances_df['Stance'].tolist()
actual = [Labels[int(a)] for a in y]
print(actual[:10])
report_score(actual,predicted)


target_names = ['agree', 'disagree', 'discuss', 'unrelated']
# Pin the label ids so each report row is tied to its name even when a class
# is absent from both the true and the predicted labels.
print(classification_report(y, ynew, labels=list(range(len(target_names))), target_names=target_names))


Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
[3 3 3 3 3 3 3 3 3 3]
['unrelated', 'unrelated', 'unrelated', 'unrelated', 'unrelated', 'unrelated', 'unrelated', 'unrelated', 'unrelated', 'unrelated']
['unrelated', 'unrelated', 'unrelated', 'unrelated', 'unrelated', 'unrelated', 'unrelated', 'unrelated', 'unrelated', 'unrelated']
-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    735    |     0     |    983    |    185    |
-------------------------------------------------------------
| disagree  |    250    |     1     |    312    |    134    |
----------

In [11]:
# Disabled: writes the test stances with the predicted labels in place of the
# true 'Stance' column to a submission CSV.
# NOTE(review): if re-enabled, `df.drop('Stance', 1)` passes the axis
# positionally, which recent pandas rejects; use `df.drop(columns='Stance')`.
# df = pd.read_csv(testStancesPath)
# df = df.drop('Stance', 1)
# predicted_df = pd.DataFrame({'Stance':predicted})
# df_col = predicted_df["Stance"]
# test_data = df.join(df_col)
# test_data.to_csv('answer_9358_correct_tfidf.csv', index=False, encoding='utf-8') # From pandas library
# print(test_data.head())

In [12]:
# Disabled: saves the trained model to disk.
# NOTE(review): the target is an absolute path on one developer's machine;
# switch to a configurable relative path before re-enabling.
# model.save('/Users/rajbir/Downloads/Personal_MSCI641/Project FNC/fnc_stance_detection/Test_Implementation_1/')