Step 3. Suggest a script that uses the TF-IDF vectorizer of gensim to calculate the similarity between the topic title and the abstract. Calculate the mean and standard deviation over all abstracts associated with the same topic. Provide the result in a table. Note: the implementation below uses scikit-learn's `TfidfVectorizer` and `cosine_similarity` rather than gensim.

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import xlsxwriter

In [2]:
# Path to the preprocessed Excel workbook of the journal database.
# The cells below open it with pd.read_excel, one sheet per topic key.

db_file = "./journal-database/database100_preprocessed.xlsx"

In [3]:
# Maps each topic key to its full topic title.
# The key is used as the Excel sheet name when loading data; the title is the
# text that every abstract / document title is compared against, so its
# spelling directly affects the similarity scores.
topics = {
    "mk": "Musculoskeletal Radiology",
    "ct": "Computed Tomography",
    "br": "Breast Imaging",
    "gu": "Genitourinary Radiology",  # was misspelled "Geritourinary", which can never match real text
    "us": "Ultrasound",
    "ch": "Chest Radiology",
    "ir": "Interventional Radiology",
}

In [4]:
# Step 3: TF-IDF cosine similarity between each topic title and every abstract
# stored on that topic's sheet, followed by the per-topic mean and standard deviation.
vectorizer = TfidfVectorizer(stop_words='english') # initialize vectorizer
similarity_values_abst = {}

for topic, topic_title in topics.items():
    sheet = pd.read_excel(db_file, sheet_name=topic) # open correct sheet
    # Empty Excel cells are read as NaN, which the vectorizer cannot tokenize.
    # Skip them so one missing abstract does not abort the whole run; the
    # length of each score list is therefore the number of abstracts actually used.
    abstracts = sheet["abstract"].dropna().astype(str)

    scores = []
    for abstract in abstracts:
        # The vectorizer is re-fitted on every (topic title, abstract) pair, so the
        # vocabulary and idf weights come from these two documents only.
        vectors = vectorizer.fit_transform([topic_title, abstract])
        # Only the title-vs-abstract similarity is needed, not the full 2x2 matrix.
        scores.append(cosine_similarity(vectors[0], vectors[1])[0, 0])
    similarity_values_abst[topic] = scores

mean_and_sd_abst = {}

# calculate mean and sd for each topic
for topic, scores in similarity_values_abst.items():
    # np.std uses ddof=0 by default, i.e. the population standard deviation
    mean_and_sd_abst[topic + " (mean, sd)"] = (np.mean(scores), np.std(scores))


Step 4. Repeat step 3 using the title of the document instead of the abstract.

In [5]:
# Step 4: TF-IDF cosine similarity between each topic title and every document
# title stored on that topic's sheet, followed by the per-topic mean and standard deviation.
vectorizer = TfidfVectorizer(stop_words='english') # initialize vectorizer
similarity_values_title = {}

for topic, topic_title in topics.items():
    sheet = pd.read_excel(db_file, sheet_name=topic) # open correct sheet
    # Empty Excel cells are read as NaN, which the vectorizer cannot tokenize.
    # Skip them so one missing title does not abort the whole run; the
    # length of each score list is therefore the number of titles actually used.
    doc_titles = sheet["title"].dropna().astype(str)

    scores = []
    for doc_title in doc_titles:
        # The vectorizer is re-fitted on every (topic title, document title) pair, so
        # the vocabulary and idf weights come from these two documents only.
        vectors = vectorizer.fit_transform([topic_title, doc_title])
        # Only the topic-vs-document similarity is needed, not the full 2x2 matrix.
        scores.append(cosine_similarity(vectors[0], vectors[1])[0, 0])
    similarity_values_title[topic] = scores

mean_and_sd_title = {}

# calculate mean and sd for each topic
for topic, scores in similarity_values_title.items():
    # np.std uses ddof=0 by default, i.e. the population standard deviation
    mean_and_sd_title[topic + " (mean, sd)"] = (np.mean(scores), np.std(scores))


Create tables of the results from step 3 and step 4.

In [6]:
# Similarity values of topic title and abstract to Excel table
#
# Layout: one header row, then one row per topic with the columns
# label | mean | standard deviation | sample size.

# The context manager closes (and thereby saves) the workbook when the block exits.
with xlsxwriter.Workbook("similarity_table_abst.xlsx") as workbook:
    worksheet = workbook.add_worksheet("table")
    worksheet.write(0, 1, 'Mean of similarity')
    worksheet.write(0, 2, 'Standard deviation (SD) of similarity')
    worksheet.write(0, 3, 'Sample size')

    for row, (topic, topic_title) in enumerate(topics.items(), start=1):
        # Named row_label rather than `str` so the builtin is not shadowed
        # for the rest of the kernel session.
        row_label = topic_title + " " + "(" + topic + ")"
        worksheet.write(row, 0, row_label)

        # Look the statistics up by key so each row is tied to its own topic
        # instead of relying on two dicts sharing the same iteration order.
        mean, sd = mean_and_sd_abst[topic + " (mean, sd)"]
        worksheet.write_number(row, 1, mean)
        worksheet.write_number(row, 2, sd)

        # Sample size = number of similarity values the statistics were computed
        # from; this avoids re-reading every Excel sheet just to count rows.
        worksheet.write_number(row, 3, len(similarity_values_abst[topic]))

In [7]:
# Similarity values of topic title and document title to Excel table
#
# Layout: one header row, then one row per topic with the columns
# label | mean | standard deviation | sample size.

# The context manager closes (and thereby saves) the workbook when the block exits.
with xlsxwriter.Workbook("similarity_table_title.xlsx") as workbook:
    worksheet = workbook.add_worksheet("table")
    worksheet.write(0, 1, 'Mean of similarity')
    worksheet.write(0, 2, 'Standard deviation (SD) of similarity')
    worksheet.write(0, 3, 'Sample size')

    for row, (topic, topic_title) in enumerate(topics.items(), start=1):
        # Named row_label rather than `str` so the builtin is not shadowed
        # for the rest of the kernel session.
        row_label = topic_title + " " + "(" + topic + ")"
        worksheet.write(row, 0, row_label)

        # Look the statistics up by key so each row is tied to its own topic
        # instead of relying on two dicts sharing the same iteration order.
        mean, sd = mean_and_sd_title[topic + " (mean, sd)"]
        worksheet.write_number(row, 1, mean)
        worksheet.write_number(row, 2, sd)

        # Sample size = number of similarity values the statistics were computed
        # from; this avoids re-reading every Excel sheet just to count rows.
        worksheet.write_number(row, 3, len(similarity_values_title[topic]))