In [None]:
import pandas as pd
import numpy as np
np.random.seed(1) #setting random seed for reproducibility


In [None]:
# Upload items.csv from the local machine into the Colab runtime.
# NOTE: google.colab is only importable when the notebook runs on Colab.
from google.colab import files
uploaded = files.upload()

Saving items.csv to items.csv


In [None]:
# Upload evaluation.csv from the local machine into the Colab runtime.
# NOTE: google.colab is only importable when the notebook runs on Colab.
from google.colab import files
uploaded = files.upload()

Saving evaluation.csv to evaluation.csv


In [None]:
# Load the item catalogue (pipe-separated) and the evaluation set (default separator).
# itemID is cast to str in both frames so IDs from the two files compare equal.
item_df = pd.read_csv("items.csv", sep='|').astype({'itemID': str})
eval_df = pd.read_csv('evaluation.csv').astype({'itemID': str})

In [None]:
# Cast the topic codes to str but keep missing topics missing.
# A plain astype(str) turns NaN into the literal string "nan" (when strings are
# stored as object dtype), and .isna() checks on this column then never match,
# so rows without a main topic could neither be found nor dropped.
main_topic_missing = item_df['main topic'].isna()
item_df['main topic'] = item_df['main topic'].astype(str).mask(main_topic_missing)

In [None]:
## Per the authors' note, item 62676 (Beatrice Harrison) is the row carrying the
## malformed main topic "[5PGM,YXE]". It is dropped because it is not part of the evaluation data.
# TODO (suggested by Frederic): keep the row instead, since it could still be recommended
# for another book, and set its main topic to "YXE".
has_malformed_topic = item_df['main topic'] == "[5PGM,YXE]"
item_df = item_df[~has_malformed_topic]

In [None]:
# Distinct item IDs and distinct main-topic codes found in the catalogue.
item_list = [*item_df['itemID'].unique()]
topic_list = [*item_df['main topic'].unique()]

In [None]:
## Items that appear in the evaluation data but have no main topic.
## Proposal: replace the missing main topic with a value taken from the subtopics. What do you think?
item_df[(item_df['itemID'].isin(eval_df.itemID)) & (item_df['main topic'].isna())]

Unnamed: 0,itemID,title,author,publisher,main topic,subtopics
182,6798,Kernstaub,Marie Graßhoff,Drachenmond Verlag,,[FM]
8959,23417,Die Herren der Unterwelt: Schwarzes Versprechen,Gena Showalter,MIRA Taschenbuch,,"[FM,FMR]"
15435,58820,"Star Trek, The Next Generation - Kalte Berechn...",David Mack,Cross Cult,,[FLS]
15448,69755,Q-World 2,Charles H. Barnes,Books on Demand,,[5AQ]


In [None]:
## Hard-coded main topics for the evaluation items that come without one.
hard_coded_main_topics = {
    "6798": "FM",
    "23417": "FMR",
    "58820": "FLS",
    "69755": "5AQ",
}
for patched_item_id, patched_topic in hard_coded_main_topics.items():
    item_df.loc[item_df['itemID'] == patched_item_id, 'main topic'] = patched_topic

In [None]:
# Keep only the items that have a main topic.
item_df = item_df[item_df["main topic"].notna()]

In [None]:
# Use itemID as the index of both the evaluation and the item dataframe.
# NOTE: this cell cannot be re-run on its own, because itemID is no longer a column afterwards.
eval_df, item_df = eval_df.set_index("itemID"), item_df.set_index("itemID")

# Jee

In [None]:
## Pyramid similarity: a distance is accumulated position by position, with earlier
## positions weighted more heavily, and then converted into a similarity score.
def get_pyramid_similarity(str1, str2):
    '''
    Arguments:
    str1: string
    str2: string
    Returns:
    similarity: "Pyramid" similarity between str1 and str2, i.e. 1/(dist+1) rounded
    to two decimals. Scores of 0.33 or lower are returned as 0.
    '''
    shared_len = min(len(str1), len(str2))
    longest_len = max(len(str1), len(str2))

    dist = 0
    for pos in range(shared_len):
        mismatch_here = str1[pos] != str2[pos]
        # The previous position is only inspected once some penalty has been added.
        mismatch_before = dist != 0 and str1[pos-1] != str2[pos-1]
        if mismatch_here or mismatch_before:
            dist += 2/(2**pos)

    # Positions that exist in only one of the strings add a smaller penalty each.
    for pos in range(shared_len, longest_len):
        dist += 1/(2**pos)

    similarity = round(1/(dist+1),2)
    return similarity if similarity > 0.33 else 0

# Frederic

In [None]:
def get_substring_similarity(str1, str2):
  '''
  Count the positions at which str1 and str2 hold the same character.
  Only the positions present in both strings are compared.

  Arguments:
    str1: string
    str2: string
  Returns:
    similarity: Substring similarity between str1 and str2 (number of matching positions)
  '''
  return sum(1 for char1, char2 in zip(str1, str2) if char1 == char2)

In [None]:
def get_topic_similarity_matrix(list_of_main_topics, method = 0):
  '''
  Build a lookup table with the pairwise similarity of all distinct main topics.

  Arguments:
    list_of_main_topics: list of main topic codes (strings); duplicates are ignored.
    method : If method == 0, which is default value, we will get substring similarity. Else, we will get pyramid similarity.
  
  Returns:
    Similarity_Matrix: Dictionary of Dictionaries holding a triangular matrix (to save space).
    Every pair of distinct topics is stored exactly once, either as similarity_matrix["X"]["Y"]
    or as similarity_matrix["Y"]["X"], so callers have to check both directions.
    E.g. with substring similarity the pair "ABC"/"AB" is stored with the value 2.
    similarity_matrix["X"]["X"] always exists and is computed with the selected method, so the
    diagonal is on the same scale as all other entries (len("X") for substring similarity,
    1.0 for pyramid similarity).
  '''
  if method == 0:
    similarity_function = get_substring_similarity
  else:
    similarity_function = get_pyramid_similarity

  # dict.fromkeys removes duplicates and keeps first-appearance order, so the layout of the
  # triangular matrix is the same on every run (plain set order can change between runs).
  remaining_topics = list(dict.fromkeys(list_of_main_topics))

  topic_similarity_matrix = {}

  while remaining_topics:
    base_topic = remaining_topics.pop()
    similarities = {base_topic: similarity_function(base_topic, base_topic)}

    # Only topics that have not been a base topic yet are compared; the pairs with earlier
    # base topics are already stored in those topics' dictionaries.
    for comparison_topic in remaining_topics:
      similarities[comparison_topic] = similarity_function(base_topic, comparison_topic)

    topic_similarity_matrix[base_topic] = similarities

  return topic_similarity_matrix


# Main


In [None]:
def get_recommendations(eval_df, items_df, topic_similarity_method=0):
  '''
  Recommend 5 items for every evaluation item, based on the similarity of their main topics.

  Arguments:
    eval_df: Dataframe with evaluation items, where the itemID is the index of the dataframe
    items_df: Dataframe with all the items, where the itemID is the index of the dataframe.
      It must contain a "main topic" column.
    topic_similarity_method: If 0, which is default value, we will recommend with substring
      similarity. In other cases, we will recommend with pyramid similarity.

  Returns:
    recommendations_df: Dataframe indexed by the evaluation itemIDs with the columns
    rec_1 ... rec_5. For an evaluation item that is part of items_df the recommendations are
    the items with the most similar main topics (never the item itself); ties within one
    similarity level are broken at random. For an evaluation item that is not part of
    items_df, 5 items are drawn at random.

  Raises:
    ValueError: if fewer than 5 items (other than the evaluation item itself) are available.
  '''
  num_recommendations = 5

  list_of_evalIDs = eval_df.index.to_list()
  list_of_itemIDs = items_df.index.to_list()
  list_of_main_topics = items_df["main topic"].to_list()

  # Lookup tables built once, so the per-item work does not rescan the whole catalogue
  main_topic_by_itemID = dict(zip(list_of_itemIDs, list_of_main_topics))
  item_positions_by_topic = {}
  for position, main_topic in enumerate(list_of_main_topics):
    item_positions_by_topic.setdefault(main_topic, []).append(position)

  #Get topic-similarity matrix to save computing time
  similarity_matrix = get_topic_similarity_matrix(list_of_main_topics, method=topic_similarity_method)

  def lookup_similarity(topic_a, topic_b):
    # The similarity matrix is triangular: a pair may be stored under either of its two topics
    similarities_of_a = similarity_matrix[topic_a]
    if topic_b in similarities_of_a:
      return similarities_of_a[topic_b]
    return similarity_matrix[topic_b][topic_a]

  def candidate_tiers(eval_itemID):
    # Yield lists of candidate itemIDs for eval_itemID, most similar main topics first
    if eval_itemID not in main_topic_by_itemID:
      # Unknown item: there is no topic to compare, so every item is an equally good candidate
      yield list_of_itemIDs
      return

    eval_item_main_topic = main_topic_by_itemID[eval_itemID]

    # Group the main topics by their similarity to the main topic of eval_itemID
    topics_by_similarity = {}
    for item_main_topic in item_positions_by_topic:
      similarity = lookup_similarity(eval_item_main_topic, item_main_topic)
      topics_by_similarity.setdefault(similarity, []).append(item_main_topic)

    for similarity in sorted(topics_by_similarity, reverse=True):
      # Sorting the positions keeps the candidates in the order of items_df
      tier_positions = sorted(
        position
        for main_topic in topics_by_similarity[similarity]
        for position in item_positions_by_topic[main_topic]
      )
      yield [list_of_itemIDs[position] for position in tier_positions if list_of_itemIDs[position] != eval_itemID]

  recommendations_data = []

  #Main loop: the recommendations are collected from scratch for each eval_item in list_of_evalIDs
  for eval_itemID in list_of_evalIDs:
    list_of_recommendations = []

    for list_of_candidateIDs in candidate_tiers(eval_itemID):
      num_missing_values = num_recommendations - len(list_of_recommendations)
      if num_missing_values == 0:
        break

      if len(list_of_candidateIDs) <= num_missing_values:
        #The whole similarity level fits: keep all of its items
        list_of_recommendations += list_of_candidateIDs
      else:
        #More equally similar items than free slots: draw the remaining ones at random
        list_of_recommendations += np.random.choice(np.array(list_of_candidateIDs), size=num_missing_values, replace=False).tolist()

    # Once every similarity level is used up all other items have been considered, so a
    # random fill could only repeat items; report the shortage instead.
    if len(list_of_recommendations) < num_recommendations:
      raise ValueError(
        "Need at least %d candidate items for evaluation item %r, found only %d."
        % (num_recommendations, eval_itemID, len(list_of_recommendations))
      )

    recommendations_data.append(list_of_recommendations)

  recommendations_df = pd.DataFrame(recommendations_data, index=list_of_evalIDs, columns=["rec_1","rec_2","rec_3","rec_4","rec_5"])

  return recommendations_df
  

# Application

In [None]:
# Any value other than 0 selects pyramid similarity. The keyword has to be
# topic_similarity_method: get_recommendations has no parameter called "method",
# so passing method=1 raises a TypeError.
recommendations_df = get_recommendations(eval_df, item_df, topic_similarity_method=1)

In [None]:
# Save the recommendations to recommendations2.csv in the working directory.
recommendations_df.to_csv("recommendations2.csv")

In [None]:
# Download the generated CSV from the Colab runtime to the local machine.
# NOTE: google.colab is only importable when the notebook runs on Colab.
from google.colab import files
files.download("recommendations2.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>