In [None]:
import pandas as pd
import numpy as np
import math
import csv
import string
import operator
from sklearn import svm
from sklearn.svm import SVC
from sklearn import datasets
from nltk.corpus import stopwords
import nltk
from collections import Counter
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import classification_report
from mlxtend.plotting import plot_decision_regions
from matplotlib.pyplot import figure
from collections import defaultdict

In [None]:
# Global variables shared by the cells below (most are placeholders that later cells rebind)
inverted_index = {}       # word -> {document id: occurrence count}, built by inverted_index_creation
weight_dict = {}          # document index -> {word: tf-idf weight}, built by calc_tfidf
train_label_count = {}    # label -> number of occurrences in the training labels
word_idf_scores = {}      # word -> idf score, computed with calculate_idf
threshold = 10            # words present in fewer than this many documents are dropped from the inverted index
doc_count = 0             # number of training documents; set after the train/dev split
train_doc_ids = []        # placeholder; later rebound to the "pmid" column of the training frame
whole_body = []           # all tokens of the training texts in one flat list
testing_labels = ["Treatment", "Diagnosis", "Prevention", "Mechanism", "Transmission", "Epidemic Forecasting", "Case Report"]  # display names; used sorted as column/target names in the evaluation cells

In [None]:
# Reading the training documents and the English stop-word list
df = pd.read_csv('BC7-LitCovid-Train.csv')
# The corpus reader is reached through `nltk.corpus` rather than the imported name
# `stopwords`, because this assignment rebinds `stopwords`; that keeps the cell
# re-runnable. A set gives constant-time membership tests in the filtering lambdas.
stopwords = set(nltk.corpus.stopwords.words('english'))

In [None]:
# Preprocessing the documents: lower-case, strip punctuation and digits, drop stop words
df["label"] = df['label'].str.lower()
df["title"] = df['title'].str.lower()
df["abstract"] = df['abstract'].str.lower()

# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently turn these substitutions into no-ops.
# Raw strings avoid invalid-escape warnings for \w, \s and \d.
df["title"] = df['title'].str.replace(r'[^\w\s]', '', regex=True)
df["abstract"] = df['abstract'].str.replace(r'[^\w\s]', '', regex=True)

df["label"] = df['label'].str.replace(r'\d+', '', regex=True)
df['title'] = df['title'].str.replace(r'\d+', '', regex=True)
df['abstract'] = df['abstract'].str.replace(r'\d+', '', regex=True)

# Remove stop words token by token and re-join with single spaces
df["title"] = df['title'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))
df["abstract"] = df['abstract'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))

In [None]:
# Train/dev splitting (80/20). random_state is fixed so the split, and every
# figure and score derived from it, is reproducible after a kernel restart.
X_train, X_dev, y_train, y_dev = train_test_split(df[['pmid', 'title', 'abstract']], df[['pmid', 'label']],  test_size=0.2, random_state=0)

In [None]:
# Keeping document counts of both splits
# doc_count is read by calculate_idf as the corpus size
doc_count = len(X_train.index)
dev_doc_count = len(X_dev.index)

In [None]:
# Sort the label frames by index, then split each ';'-separated label string into a list
y_train = y_train.sort_index()
y_dev = y_dev.sort_index()

y_train['label'] = y_train['label'].str.split(';')
y_dev['label'] = y_dev['label'].str.split(';')

In [None]:
# Calculating number of labels: label -> occurrences in the training split,
# ordered from most to least frequent (ties keep first-seen order).
# The dictionary is rebuilt from scratch, so re-running the cell does not double the counts.
train_label_count = dict(
    Counter(item for label in y_train['label'] for item in label).most_common()
)

In [None]:
# Bar chart of label occurrences in the training data
# (bars follow the order of train_label_count)
labels = list(train_label_count.keys())
occurrences = list(train_label_count.values())

plt.bar(range(len(train_label_count)), occurrences, tick_label=labels)

plt.title("# of occurrences of labels in the training data")
plt.xlabel("Labels")
plt.ylabel("Occurrences")

# Enlarge the current figure so the tick labels have room
fig = plt.gcf()
fig.set_size_inches(12.5, 7.5)

plt.show()

In [None]:
# Sort the feature frames by index (the label frames are sorted the same way)
X_train = X_train.sort_index()
X_dev = X_dev.sort_index()

In [None]:
# Keeping document IDs (the "pmid" column) of both splits
# train_doc_ids is looked up by row index inside inverted_index_creation
train_doc_ids = X_train["pmid"]
dev_doc_ids = X_dev["pmid"]

In [None]:
# Combining title and abstract into one text column, separated by a space
X_train['title/abstract'] = X_train['title'] + " " + X_train['abstract']
X_dev['title/abstract'] = X_dev['title'] + " " + X_dev['abstract']

In [None]:
# Tokenizing: replace each combined text with its list of whitespace-separated words
# NOTE: this overwrites the column in place, so the cell cannot be run twice in a row
X_train['title/abstract'] = [item.split() for item in X_train['title/abstract']]
X_dev['title/abstract'] = [item.split() for item in X_dev['title/abstract']]

In [None]:
# Getting the whole words: flatten all training token lists into one list.
# The list is rebuilt rather than extended in place, so re-running the cell
# does not duplicate the corpus.
whole_body = [token for tokens in X_train['title/abstract'] for token in tokens]

In [None]:
# Getting the (up to) 10 most common words and their counts.
# One Counter pass replaces ten rounds of recounting and list filtering;
# most_common orders by count descending with ties in first-seen order,
# which is the same order the repeated most_common(1) approach produces.
top_words = Counter(whole_body).most_common(10)
common_words = [word for word, _ in top_words]
common_words_occ = [count for _, count in top_words]

In [None]:
# Bar chart of the most common training words and their occurrence counts
plt.bar(range(len(common_words)), common_words_occ, tick_label=common_words)

plt.title("10 Most Common Words")
plt.xlabel("Words")
plt.ylabel("Occurrence")

# Enlarge the current figure so the tick labels have room
fig = plt.gcf()
fig.set_size_inches(12.5, 7.5)

plt.show()

In [None]:
# Counting the occurrences of words (the corpus is counted once and both views share it)
vocab_counts = Counter(whole_body)
vocab_keys = vocab_counts.keys()
vocab_values = vocab_counts.values()

In [None]:
# Builds the inverted index: for every word, the documents containing it and
# how many times it occurs in each of them.
def inverted_index_creation(array, doc_ids=None, min_doc_freq=None):
    """Create a word -> {document id: occurrence count} dictionary.

    Parameters
    ----------
    array : mapping-like with ``keys()`` and ``[]`` lookup (e.g. a Series or dict)
        Maps a row index to the list of tokens of that document.
    doc_ids : mapping-like, optional
        Maps the same row index to the document id stored in the result.
        Defaults to the notebook-level ``train_doc_ids``.
    min_doc_freq : int, optional
        Words occurring in fewer than this many documents are discarded.
        Defaults to the notebook-level ``threshold``.

    Returns
    -------
    dict
        Words in first-seen order; each value is a dict of per-document counts.
    """
    # Fall back to the notebook globals only when the caller did not pass values,
    # so existing single-argument calls behave exactly as before.
    if doc_ids is None:
        doc_ids = train_doc_ids
    if min_doc_freq is None:
        min_doc_freq = threshold

    postings = {}
    for idx in array.keys():
        doc_id = doc_ids[idx]
        for word in array[idx]:
            per_doc = postings.setdefault(word, {})
            per_doc[doc_id] = per_doc.get(doc_id, 0) + 1

    # Keep only words that appear in enough distinct documents
    return {word: per_doc for word, per_doc in postings.items()
            if len(per_doc) >= min_doc_freq}

In [None]:
def calculate_tf(query_word, query):
    """Return the log-scaled term frequency of query_word in query.

    query is the document's token sequence; the result is 1 + log10(count)
    when the word occurs at least once and 0 otherwise.
    """
    # Count once instead of testing membership and then counting again
    occurrences = query.count(query_word)
    if occurrences:
        return 1 + np.log10(occurrences)
    return 0

In [None]:
# Inverse document frequency of a word, based on the global inverted index
def calculate_idf(query_word):
    """Return log10(doc_count / document frequency) of query_word, or 0 if it is not indexed."""
    if query_word not in inverted_index:
        return 0
    postings = inverted_index[query_word]
    return np.log10(np.divide(doc_count, len(postings)))

In [None]:
# Computes the tf-idf weight of every vocabulary word for every document.
def calc_tfidf(bodies, vocabulary=None, idf_scores=None):
    """Return {document index: {word: tf-idf weight}} for all documents in bodies.

    Parameters
    ----------
    bodies : mapping-like with ``keys()`` and ``[]`` lookup (e.g. a Series or dict)
        Maps a document index to its list of tokens.
    vocabulary : iterable of str, optional
        Words to weight, in output order. Defaults to the keys of the
        notebook-level ``inverted_index``.
    idf_scores : mapping, optional
        word -> idf score. Defaults to the notebook-level ``word_idf_scores``.

    Returns
    -------
    dict
        Every document gets an entry for every vocabulary word; words absent
        from the document get weight 0.
    """
    # Fall back to the notebook globals only when the caller did not pass values,
    # so existing single-argument calls behave exactly as before.
    if vocabulary is None:
        vocabulary = inverted_index
    if idf_scores is None:
        idf_scores = word_idf_scores
    vocabulary = list(vocabulary)

    weights = {}
    for idx in bodies.keys():
        # Count the document once instead of rescanning it for every vocabulary word
        counts = Counter(bodies[idx])
        doc_weights = {}
        for word in vocabulary:
            occurrences = counts[word]
            # Same weighting as calculate_tf: 1 + log10(count), or 0 when absent
            tf = 1 + np.log10(occurrences) if occurrences else 0
            doc_weights[word] = np.multiply(idf_scores[word], tf)
        weights[idx] = doc_weights
    return weights

In [None]:
# Creating the inverted index dictionary from the tokenized training texts
inverted_index = inverted_index_creation(X_train['title/abstract'])

In [None]:
# Creating an inverted index DataFrame: one row per word (orient='index'), one column per document id
inverted_index_df = pd.DataFrame.from_dict(inverted_index, orient='index')

In [None]:
# Keeping the idf score of every word in the inverted index (read later by calc_tfidf)
word_idf_scores = {word: calculate_idf(word) for word in inverted_index}

In [None]:
# Calculating tf-idf weights of the training documents
weight_dict = calc_tfidf(X_train['title/abstract'])

In [None]:
# Keeping the training tf-idf weights in a DataFrame: one row per document, one column per word
weight_df = pd.DataFrame.from_dict(weight_dict, orient='index')

In [None]:
# Calculating tf-idf weights of the development documents
# (uses the vocabulary and idf scores derived from the training split)
weight_dict_dev = calc_tfidf(X_dev['title/abstract'])

In [None]:
# Keeping the development tf-idf weights in a DataFrame: one row per document, one column per word
weight_df_dev = pd.DataFrame.from_dict(weight_dict_dev, orient='index')

In [None]:
# Replace NaN values of the training DataFrame with 0 and sort rows by index
# (y_train is sorted by index as well)
weight_df = weight_df.fillna(0)
weight_df = weight_df.sort_index()

In [None]:
# Replace NaN values of the development DataFrame with 0 and sort rows by index
# (y_dev is sorted by index as well)
weight_df_dev = weight_df_dev.fillna(0)
weight_df_dev = weight_df_dev.sort_index()

In [None]:
# Creating a one-vs-rest classifier around a linear SVM
# (fixed random_state, at most 10000 iterations)
classifier = OneVsRestClassifier(LinearSVC(random_state=0, max_iter=10000))

In [None]:
# Fit the label binarizer on the training label lists and encode them as a 0/1 indicator matrix
mlb = MultiLabelBinarizer()
multilabel_y = mlb.fit_transform(y_train['label'])

In [None]:
# Fitting the classifier on the training tf-idf matrix and binarized labels
model = classifier.fit(weight_df, multilabel_y)

In [None]:
# Predicting the labels of the development documents with the fitted model
pred = model.predict(weight_df_dev)

In [None]:
# Convert the predicted indicator rows back to tuples of label names
pred_inv = mlb.inverse_transform(pred)

In [None]:
# Encode the dev labels with the binarizer already fitted on the training labels.
# transform (not fit_transform) keeps the same columns and column order as `pred`;
# refitting on the dev labels could change them whenever the label sets differ.
actual = mlb.transform(y_dev['label'])
actual_inv = mlb.inverse_transform(actual)

In [None]:
# Wrap the true dev indicator matrix in a DataFrame with the sorted display labels as column names
# NOTE(review): assumes the binarizer's class order matches sorted(testing_labels) — confirm
actual_result_df = pd.DataFrame(actual, columns = sorted(testing_labels))

In [None]:
# Wrap the predicted dev indicator matrix in a DataFrame with the sorted display labels as column names
# NOTE(review): assumes the binarizer's class order matches sorted(testing_labels) — confirm
pred_result_df = pd.DataFrame(pred, columns = sorted(testing_labels))

In [None]:
# Write the predicted label matrix to a comma-separated file in the working directory
pred_result_df.to_csv('pred_result_df.csv', sep =',')

In [None]:
# Write the true label matrix to a comma-separated file in the working directory
actual_result_df.to_csv('actual_result_df.csv', sep =',')

In [None]:
# Evaluate the dev predictions: confusion matrices and a classification report
# whose rows are named with the sorted display labels
cm = multilabel_confusion_matrix(actual, pred)
print(cm)
print(classification_report(actual, pred, target_names = sorted(testing_labels)))

In [None]:
def co_occurrence(pred_labels, label_set):
    """Count how often each pair of labels is assigned to the same document.

    Parameters
    ----------
    pred_labels : iterable of sequences of str
        One sequence of label names per document.
    label_set : iterable of str
        All labels that make up the rows/columns of the matrix.

    Returns
    -------
    list of list of int
        Symmetric matrix as nested lists; rows and columns follow the sorted
        order of label_set. Cell (i, j) is the number of label sequences that
        contain both label i and label j.

    NOTE(review): labels missing from label_set are not filtered out here —
    confirm callers never pass them.
    """
    pair_counts = defaultdict(int)
    for labels in pred_labels:
        # Visit every unordered pair of positions within one label sequence
        for i, first in enumerate(labels):
            for second in labels[i + 1:]:
                pair_counts[tuple(sorted((first, second)))] += 1

    vocab = sorted(set(label_set))
    # int64 instead of int16: pair counts above 32767 do not fit in 16 bits
    matrix = pd.DataFrame(data=np.zeros((len(vocab), len(vocab)), dtype=np.int64),
                          index=vocab,
                          columns=vocab)
    # Mirror every count so the matrix is symmetric
    for (first, second), count in pair_counts.items():
        matrix.at[first, second] = count
        matrix.at[second, first] = count
    return matrix.values.tolist()

In [None]:
# Lower-case label names, passed to co_occurrence as the label set
all_labels = ["treatment", "diagnosis", "prevention", "mechanism", "transmission", "epidemic forecasting", "case report"]

In [None]:
# Label co-occurrence counts among the predicted dev label tuples
co_occurrence_matrix = co_occurrence(pred_inv, all_labels)

In [None]:
# Chord(co_occurrence_matrix, all_labels).show()

# Combining train and dev data

In [None]:
# Merge the dev split back into the training data for the final model.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
X_train = pd.concat([X_train, X_dev])
y_train = pd.concat([y_train, y_dev])
# calculate_idf reads doc_count as the corpus size, so it must follow the enlarged training set
doc_count = len(X_train.index)

In [None]:
# Sort the combined features and labels by index
X_train = X_train.sort_index()
y_train = y_train.sort_index()

In [None]:
# Refresh the document ids read by inverted_index_creation so they cover the combined data
train_doc_ids = X_train["pmid"]

In [None]:
# Rebuilding the inverted index dictionary from the combined train + dev texts
inverted_index = inverted_index_creation(X_train['title/abstract'])

In [None]:
# Rebuilding the inverted index DataFrame: one row per word (orient='index'), one column per document id
inverted_index_df = pd.DataFrame.from_dict(inverted_index, orient='index')

In [None]:
# Recomputing the idf score of every word in the rebuilt inverted index
word_idf_scores = {word: calculate_idf(word) for word in inverted_index}

In [None]:
# Calculating tf-idf weights of the combined training documents
weight_dict = calc_tfidf(X_train['title/abstract'])

In [None]:
# Keeping the combined tf-idf weights in a DataFrame: one row per document, one column per word
weight_df = pd.DataFrame.from_dict(weight_dict, orient='index')

In [None]:
# Replace NaN values of the training DataFrame with 0 and sort rows by index
# (y_train is sorted by index as well)
weight_df = weight_df.fillna(0)
weight_df = weight_df.sort_index()

In [None]:
# Creating a fresh one-vs-rest linear SVM for the final model
# (fixed random_state; iteration cap raised to 100000 compared with the earlier dev-stage classifier)
classifier = OneVsRestClassifier(LinearSVC(random_state=0, max_iter=100000))

In [None]:
# Fit a new label binarizer on the combined label lists and encode them as a 0/1 indicator matrix
mlb = MultiLabelBinarizer()
multilabel_y = mlb.fit_transform(y_train['label'])

In [None]:
# Fitting the final classifier on the combined tf-idf matrix and binarized labels
model = classifier.fit(weight_df, multilabel_y)

# Testing the model


In [None]:
# Reading the held-out documents (the stop-word collection loaded earlier is reused)
df = pd.read_csv('BC7-LitCovid-Dev.csv')

# Preprocessing the documents: lower-case, strip punctuation and digits, drop stop words
df["label"] = df['label'].str.lower()
df["title"] = df['title'].str.lower()
df["abstract"] = df['abstract'].str.lower()

# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently turn these substitutions into no-ops.
# Raw strings avoid invalid-escape warnings for \w, \s and \d.
df["title"] = df['title'].str.replace(r'[^\w\s]', '', regex=True)
df["abstract"] = df['abstract'].str.replace(r'[^\w\s]', '', regex=True)

df["label"] = df['label'].str.replace(r'\d+', '', regex=True)
df['title'] = df['title'].str.replace(r'\d+', '', regex=True)
df['abstract'] = df['abstract'].str.replace(r'\d+', '', regex=True)

df["title"] = df['title'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))
df["abstract"] = df['abstract'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))

# Building the test features (token lists of title + abstract) and the test labels
X_test = pd.DataFrame()
X_test['title/abstract'] = df['title'] + " " + df['abstract']
X_test['title/abstract'] = [item.split() for item in X_test['title/abstract']]
y_test = df[['pmid', 'label']]
y_test = y_test.sort_index()
y_test['label'] = y_test['label'].str.split(';')

# Calculating number of labels: label -> occurrences in the test data,
# ordered from most to least frequent (ties keep first-seen order)
test_label_count = dict(
    Counter(item for label in y_test['label'] for item in label).most_common()
)

In [None]:
# Bar chart of label occurrences in the test data
# (bars follow the order of test_label_count)
labels = list(test_label_count.keys())
occurrences = list(test_label_count.values())

plt.bar(range(len(test_label_count)), occurrences, tick_label=labels)

plt.title("# of occurrences of labels in the test data")
plt.xlabel("Labels")
plt.ylabel("Occurrences")

# Enlarge the current figure so the tick labels have room
fig = plt.gcf()
fig.set_size_inches(12.5, 7.5)

plt.show()

In [None]:
# Flatten all test token lists into one list of words
whole_body = [token for tokens in X_test['title/abstract'] for token in tokens]

In [None]:
# Getting the (up to) 10 most common test words and their counts.
# One Counter pass replaces ten rounds of recounting and list filtering;
# most_common orders by count descending with ties in first-seen order,
# which is the same order the repeated most_common(1) approach produces.
top_words = Counter(whole_body).most_common(10)
common_words = [word for word, _ in top_words]
common_words_occ = [count for _, count in top_words]

In [None]:
# Bar chart of the most common test words and their occurrence counts
plt.bar(range(len(common_words)), common_words_occ, tick_label=common_words)

plt.title("10 Most Common Words")
plt.xlabel("Words")
# Axis label spelling fixed to match the training-data chart
plt.ylabel("Occurrence")

# Enlarge the current figure so the tick labels have room
fig = plt.gcf()
fig.set_size_inches(12.5, 7.5)

plt.show()

In [None]:
# Counting the occurrences of test words (the corpus is counted once and both views share it)
vocab_counts = Counter(whole_body)
vocab_keys = vocab_counts.keys()
vocab_values = vocab_counts.values()

In [None]:
# Calculating tf-idf weights of the test documents
# (uses the current inverted index and idf scores)
weight_dict_test = calc_tfidf(X_test['title/abstract'])

In [None]:
# Keeping the test tf-idf weights in a DataFrame: one row per document, one column per word
weight_df_test = pd.DataFrame.from_dict(weight_dict_test, orient='index')

In [None]:
# Replace NaN values of the test DataFrame with 0 and sort rows by index
# (y_test is sorted by index as well)
weight_df_test = weight_df_test.fillna(0)
weight_df_test = weight_df_test.sort_index()

In [None]:
# Predicting the labels of the test documents with the final model
pred = model.predict(weight_df_test)

In [None]:
# Encode the test labels with the binarizer already fitted on the training labels.
# transform (not fit_transform) keeps the same columns and column order as `pred`;
# refitting on the test labels could change them whenever the label sets differ.
actual = mlb.transform(y_test['label'])
actual_inv = mlb.inverse_transform(actual)

In [None]:
# Evaluate the test predictions: confusion matrices and a classification report
# whose rows are named with the sorted display labels
cm = multilabel_confusion_matrix(actual, pred)
print(cm)
print(classification_report(actual, pred, target_names = sorted(testing_labels)))