In [None]:
# Mount Google Drive so the dataset CSVs under /content/drive are readable (Colab only).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import sklearn.metrics as metrics
import matplotlib.pyplot as plt

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report, precision_recall_curve

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
from sklearn.manifold import TSNE

In [None]:
from tqdm import tqdm
# Registers the `progress_apply` methods used on Series/DataFrames further down.
tqdm.pandas()

In [None]:
import ast

In [None]:
from timeit import default_timer as timer
from datetime import timedelta

In [None]:
# Load the paper tables for the train and dev splits.
train_papers_path = '/content/drive/MyDrive/master_thesis/dataset_data/train/train_papers.csv'
dev_papers_path = '/content/drive/MyDrive/master_thesis/dataset_data/dev/dev_papers.csv'
train_papers = pd.read_csv(train_papers_path)
dev_papers = pd.read_csv(dev_papers_path)

In [None]:
# Load the case (challenge description) tables for the train and dev splits.
train_cases_path = '/content/drive/MyDrive/master_thesis/dataset_data/train/train_cases.csv'
# NOTE(review): the dev cases are read from the `train/` folder, whereas dev_papers.csv
# is read from `dev/` — confirm this location is intentional and not a copy-paste slip.
dev_cases_path = '/content/drive/MyDrive/master_thesis/dataset_data/train/dev_cases.csv'
train_cases_data = pd.read_csv(train_cases_path)
dev_cases_data = pd.read_csv(dev_cases_path)

In [None]:
def _join_stemmed_tokens(tokens):
  """Turn a stemmed token list into a single space-separated string.

  The frames were just loaded with ``pd.read_csv``, so a column that held token
  lists arrives as text such as "['foo', 'bar']" (the test section parses the
  same kind of column with ``ast.literal_eval``). Joining that text directly
  would put a space between every character, so it is parsed back into a list
  first. Values that are already lists are joined unchanged.
  """
  if isinstance(tokens, str):
    tokens = ast.literal_eval(tokens)
  return ' '.join(tokens)


# Build the plain-text columns that are fed to the TF-IDF vectorizer.
train_cases_data['ChallengeDescription_stemmed_joined'] = train_cases_data['ChallengeDescription_stemmed'].apply(_join_stemmed_tokens)
dev_cases_data['ChallengeDescription_stemmed_joined'] = dev_cases_data['ChallengeDescription_stemmed'].apply(_join_stemmed_tokens)
train_papers['abstract_stemmed_joined'] = train_papers['abstract_stemmed'].apply(_join_stemmed_tokens)
dev_papers['abstract_stemmed_joined'] = dev_papers['abstract_stemmed'].apply(_join_stemmed_tokens)

In [None]:
# Fitting corpus: every train case description followed by every train paper abstract.
train_case_texts = train_cases_data['ChallengeDescription_stemmed_joined'].tolist()
train_abstract_texts = train_papers['abstract_stemmed_joined'].tolist()
X_corpus = train_case_texts + train_abstract_texts

In [None]:
# Number of documents (case descriptions + abstracts) the vectorizer will be fitted on.
len(X_corpus)

In [None]:
# TF-IDF vectorizer with library defaults; shared as a global by the helper functions below.
vectorizer = TfidfVectorizer()

In [None]:
# Fit the TF-IDF vocabulary/IDF weights on the train corpus and report the wall-clock time.
start = timer()
vectorizer.fit(X_corpus)
end = timer()
print(f"total training time: {timedelta(seconds=end-start)}")

In [None]:
def vectorize_paper(stemmed_abstract):
  """Transform one joined abstract string with the global ``vectorizer``.

  The text is wrapped in a one-element list because ``transform`` is called
  on a collection of documents; the transform result is returned as is.
  """
  single_document = [stemmed_abstract]
  return vectorizer.transform(single_document)

In [None]:
def get_description(case_id, df=None):
  """Return the joined, stemmed challenge description of a case.

  Args:
    case_id: value to match against the ``CaseID`` column.
    df: cases DataFrame to search. Defaults to ``train_cases_data`` — the
      previously referenced ``cases_data`` is not defined anywhere in this
      notebook, and the sibling lookups read from the train tables.

  Returns:
    The ``ChallengeDescription_stemmed_joined`` value of the first matching row.

  Raises:
    IndexError: if no row has the given ``CaseID``.
  """
  if df is None:
    df = train_cases_data
  return df[df['CaseID'] == case_id].iloc[0]['ChallengeDescription_stemmed_joined']

In [None]:
def get_case_papers(case_id):
  """Return id, type and joined abstract of every train paper attached to ``case_id``."""
  wanted_columns = ['paper_id', 'type', 'abstract_stemmed_joined']
  belongs_to_case = train_papers['case_id'] == case_id
  return train_papers[belongs_to_case][wanted_columns]

In [None]:
def get_paper(paper_id):
  """Return the joined stemmed abstract of the first train paper with ``paper_id``."""
  matching_papers = train_papers[train_papers['paper_id'] == paper_id]
  first_match = matching_papers.iloc[0]
  return first_match['abstract_stemmed_joined']

In [None]:
def check_embedding(case_id, paper_id, vecorizer):
  """Cosine similarity between a case description and a paper abstract.

  Args:
    case_id: case whose description is looked up via ``get_description``.
    paper_id: paper whose abstract is looked up via ``get_paper``.
    vecorizer: fitted vectorizer exposing ``transform``. The misspelt parameter
      name is kept so existing keyword callers keep working.

  Returns:
    The flattened output of ``cosine_similarity`` for the two embeddings.
  """
  description = get_description(case_id)
  paper_abstract = get_paper(paper_id)

  # Use the vectorizer that was passed in; previously the argument was ignored
  # and the module-level `vectorizer` was used regardless of what the caller gave.
  description_embedding = vecorizer.transform([description])
  abstract_embedding = vecorizer.transform([paper_abstract])
  return cosine_similarity(description_embedding, abstract_embedding).flatten()

In [None]:
def get_case_embedding(df, case_id):
  """Return the ``embedding`` value of the first row of ``df`` whose ``CaseID`` equals ``case_id``."""
  rows_for_case = df.loc[df['CaseID'] == case_id]
  first_row = rows_for_case.iloc[0]
  return first_row['embedding']

In [None]:
def get_similarity_with_case_descriptions(df, embedding, case_id):
  """Cosine similarity between ``embedding`` and the stored embedding of case ``case_id``.

  The case embedding is fetched from ``df`` with ``get_case_embedding``; the
  first element of the flattened similarity matrix is returned.
  """
  similarity_matrix = cosine_similarity(get_case_embedding(df, case_id), embedding)
  return similarity_matrix.flatten()[0]

In [None]:
def plot_cm(target_y, pred_y, labels):
  """Compute the confusion matrix for the given labels and draw it.

  ``labels`` is passed both to ``confusion_matrix`` and as the display labels.
  """
  matrix = confusion_matrix(target_y, pred_y, labels=labels)
  ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=labels).plot()
  plt.show()

In [None]:
def convert_y_to_label(y):
  """Map each truthy item of ``y`` to 'associated_paper' and each falsy item to 'irrelevant_paper'."""
  return ['associated_paper' if flag else 'irrelevant_paper' for flag in y]

# Train set eval

In [None]:
# Embed every train case description with the fitted vectorizer (one transform call per row).
train_case_texts = train_cases_data['ChallengeDescription_stemmed_joined']
train_cases_data['embedding'] = train_case_texts.progress_apply(lambda text: vectorizer.transform([text]))

In [None]:
# Embed every train abstract. The paper-vs-case similarity column is computed once,
# in the next cell; it used to be computed here as well, which doubled a slow row-wise pass.
train_papers['embedding'] = train_papers['abstract_stemmed_joined'].progress_apply(lambda x: vectorizer.transform([x]))

In [None]:
# Cosine similarity between each train paper and the description of the case it belongs to.
train_papers['case_embedding_similarity'] = train_papers.progress_apply(
  lambda paper: get_similarity_with_case_descriptions(train_cases_data, paper['embedding'], paper['case_id']),
  axis=1,
)

In [None]:
# Pick the similarity threshold that maximises F1 on the train set.
precision, recall, thresholds = precision_recall_curve(train_papers['type'] == 'associated_paper', train_papers['case_embedding_similarity'])
# precision/recall carry one more entry than thresholds (the final precision=1, recall=0
# point has no threshold), so drop it to keep the argmax a valid index into `thresholds`.
pr_sum = precision[:-1] + recall[:-1]
# Where precision + recall == 0 the F-score is set to 0 instead of NaN (0/0);
# np.argmax would otherwise return the position of the first NaN.
fscore = np.divide(2 * precision[:-1] * recall[:-1], pr_sum, out=np.zeros_like(pr_sum, dtype=float), where=pr_sum > 0)
# locate the index of the largest f score
ix = np.argmax(fscore)
optimal_threshold = thresholds[ix]
print('Best Threshold=%f, F-Score=%.3f' % (thresholds[ix], fscore[ix]))

plt.plot(recall, precision)
plt.scatter(recall[ix], precision[ix], marker='o', color='black', label='Best')
# axis labels
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-recall curve (train)')
plt.legend()
# show the plot
plt.show()

In [None]:
# Binarise the similarities with the tuned threshold and report train-set metrics.
# Every paper whose type is not 'associated_paper' ends up labelled 'irrelevant_paper'.
class_labels = ['associated_paper', 'irrelevant_paper']
train_papers['pred_is_associated'] = train_papers['case_embedding_similarity'] >= optimal_threshold
train_is_associated = train_papers['type'] == 'associated_paper'
y_target = convert_y_to_label(train_is_associated)
y_pred = convert_y_to_label(train_papers['pred_is_associated'])
plot_cm(y_target, y_pred, class_labels)
print(classification_report(y_target, y_pred, digits=4))

# Dev set eval

In [None]:
# Embed every dev case description with the vectorizer fitted on the train corpus.
dev_case_texts = dev_cases_data['ChallengeDescription_stemmed_joined']
dev_cases_data['embedding'] = dev_case_texts.progress_apply(lambda text: vectorizer.transform([text]))

In [None]:
# Embed every dev abstract; vectorize_paper wraps the same vectorizer.transform([text]) call.
dev_abstract_texts = dev_papers['abstract_stemmed_joined']
dev_papers['embedding'] = dev_abstract_texts.progress_apply(vectorize_paper)

In [None]:
# Cosine similarity between each dev paper and the description of the case it belongs to.
dev_papers['case_embedding_similarity'] = dev_papers.progress_apply(
  lambda paper: get_similarity_with_case_descriptions(dev_cases_data, paper['embedding'], paper['case_id']),
  axis=1,
)

In [None]:
# Summary statistics of the similarity scores over all dev papers.
dev_papers['case_embedding_similarity'].describe()

In [None]:
# Similarity statistics restricted to papers typed 'associated_paper'.
dev_papers.loc[dev_papers['type'] == 'associated_paper', 'case_embedding_similarity'].describe()

In [None]:
# Similarity statistics restricted to papers typed 'semirelevant_papers'.
# NOTE(review): this label is plural while 'associated_paper' is singular elsewhere —
# confirm it matches the values in the `type` column, otherwise the selection is empty.
dev_papers.loc[dev_papers['type'] == 'semirelevant_papers', 'case_embedding_similarity'].describe()

In [None]:
# Similarity statistics restricted to papers typed 'irrelevant_papers'.
# NOTE(review): this label is plural while 'associated_paper' is singular elsewhere —
# confirm it matches the values in the `type` column, otherwise the selection is empty.
dev_papers.loc[dev_papers['type'] == 'irrelevant_papers', 'case_embedding_similarity'].describe()

In [None]:
from yellowbrick.text import TSNEVisualizer

In [None]:
# t-SNE projection of the TF-IDF vectors of the papers of one dev case (case_id 64),
# coloured by paper type.
case = dev_papers[dev_papers['case_id'] == 64]
X_case = vectorizer.transform(case['abstract_stemmed_joined'].tolist())
y_case = case['type'].tolist()
# Fix the seed so the projection is the same on every run of the notebook.
case_tsne = TSNEVisualizer(random_state=42)
case_tsne.fit(X_case, y_case)
case_tsne.show()

In [None]:
# Pick the similarity threshold that maximises F1 on the dev set.
# This overwrites `optimal_threshold`, so later cells use the dev-tuned value.
precision, recall, thresholds = precision_recall_curve(dev_papers['type'] == 'associated_paper', dev_papers['case_embedding_similarity'])
# precision/recall carry one more entry than thresholds (the final precision=1, recall=0
# point has no threshold), so drop it to keep the argmax a valid index into `thresholds`.
pr_sum = precision[:-1] + recall[:-1]
# Where precision + recall == 0 the F-score is set to 0 instead of NaN (0/0);
# np.argmax would otherwise return the position of the first NaN.
fscore = np.divide(2 * precision[:-1] * recall[:-1], pr_sum, out=np.zeros_like(pr_sum, dtype=float), where=pr_sum > 0)
# locate the index of the largest f score
ix = np.argmax(fscore)
optimal_threshold = thresholds[ix]
print('Best Threshold=%f, F-Score=%.3f' % (thresholds[ix], fscore[ix]))

plt.plot(recall, precision)
plt.scatter(recall[ix], precision[ix], marker='o', color='black', label='Best')
# axis labels
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-recall curve (dev)')
plt.legend()
# show the plot
plt.show()

In [None]:
# A dev paper is predicted as associated when its similarity reaches the current threshold.
dev_papers['pred_is_associated'] = dev_papers['case_embedding_similarity'] >= optimal_threshold

In [None]:
# String labels for the dev ground truth and predictions; any type other than
# 'associated_paper' is reported as 'irrelevant_paper'.
dev_is_associated = dev_papers['type'] == 'associated_paper'
y_target = convert_y_to_label(dev_is_associated)
y_pred = convert_y_to_label(dev_papers['pred_is_associated'])

In [None]:
# Confusion matrix for the dev predictions.
dev_class_labels = ['associated_paper', 'irrelevant_paper']
plot_cm(y_target, y_pred, dev_class_labels)

In [None]:
# Per-class precision/recall/F1 on the dev set, printed with four decimals.
dev_report = classification_report(y_target, y_pred, digits=4)
print(dev_report)

# Test set

In [None]:
# Load the test split.
test_papers = pd.read_csv('/content/drive/MyDrive/master_thesis/dataset_data/test/test_papers.csv')
test_cases_data = pd.read_csv('/content/drive/MyDrive/master_thesis/dataset_data/test/test_cases.csv')
# The stemmed token lists are stored in the CSV as text; parse them back into lists.
test_cases_data['ChallengeDescription_stemmed'] = test_cases_data['ChallengeDescription_stemmed'].progress_apply(ast.literal_eval)
# Parse the abstract tokens the same way before joining: joining the raw CSV string
# would put a space between every character instead of between tokens.
test_papers['abstract_stemmed_joined'] = test_papers['abstract_stemmed'].apply(
  lambda x: ' '.join(ast.literal_eval(x) if isinstance(x, str) else x)
)

In [None]:
# Join each test case's token list into one space-separated string.
test_case_tokens = test_cases_data['ChallengeDescription_stemmed']
test_cases_data['ChallengeDescription_stemmed_joined'] = test_case_tokens.apply(lambda tokens: ' '.join(tokens))

In [None]:
# Embed every test case description with the vectorizer fitted on the train corpus.
test_case_texts = test_cases_data['ChallengeDescription_stemmed_joined']
test_cases_data['embedding'] = test_case_texts.progress_apply(lambda text: vectorizer.transform([text]))

In [None]:
# Embed every test abstract; vectorize_paper wraps the same vectorizer.transform([text]) call.
test_abstract_texts = test_papers['abstract_stemmed_joined']
test_papers['embedding'] = test_abstract_texts.progress_apply(vectorize_paper)

In [None]:
# Attach to each test paper the embedding of the case it belongs to.
test_papers['case_embedding'] = test_papers.progress_apply(
  lambda paper: get_case_embedding(test_cases_data, paper['case_id']),
  axis=1,
)

In [None]:
# Cosine similarity between each test paper and its case; the first entry of the result matrix is kept.
test_papers['case_embedding_similarity'] = test_papers.progress_apply(
  lambda paper: cosine_similarity(paper['case_embedding'], paper['embedding'])[0, 0],
  axis=1,
)

In [None]:
# Apply the current `optimal_threshold` (last assigned in the dev threshold cell) to the test similarities.
test_papers['pred_is_associated'] = test_papers['case_embedding_similarity'] >= optimal_threshold

In [None]:
# Test-set confusion matrix and classification report; any type other than
# 'associated_paper' is reported as 'irrelevant_paper'.
test_class_labels = ['associated_paper', 'irrelevant_paper']
test_is_associated = test_papers['type'] == 'associated_paper'
y_test_target = convert_y_to_label(test_is_associated)
y_test_pred = convert_y_to_label(test_papers['pred_is_associated'])
plot_cm(y_test_target, y_test_pred, test_class_labels)
print(classification_report(y_test_target, y_test_pred, digits=4))