In [1]:
import warnings
import pandas as pd
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [2]:
warnings.filterwarnings('ignore')

In [3]:
%%HTML
<style type="text/css">
/* Solid 1px black border and black text on every rendered DataFrame cell. */
table.dataframe td,
table.dataframe th {
    border: 1px solid black !important;
    color: black !important;
}
</style>

In [4]:
# Load the project records from the JSON file into a DataFrame
df = pd.read_json(path_or_buf='projects.json')

In [6]:
# Replace missing (null) tags with an empty list so every row holds a list
df.loc[df['tags'].isna(), ['tags']] = (
    df.loc[df['tags'].isna(), 'tags']
    .apply(lambda _missing: [])
)

In [7]:
# Replace missing (null) themes with an empty list so every row holds a list
df.loc[df['themes'].isna(), ['themes']] = (
    df.loc[df['themes'].isna(), 'themes']
    .apply(lambda _missing: [])
)

In [8]:
# Concatenate each row's theme list with its tag list (element-wise `+`)
df['themetag'] = df.themes + df.tags


In [10]:
# Changing themetag to a string
# Series.map applies the join to each row's list. Series.agg is an aggregation
# API: calling it with an element-wise lambda relies on a deprecated fallback,
# and if any single element raises it silently retries on the whole Series,
# writing one joined string into every row instead of reporting the error.
df['themetag'] = df['themetag'].map(lambda items: ';'.join(map(str, items)))

In [11]:
# create binary indicators for each theme/tag
# source: https://datascience.stackexchange.com/questions/14847/multiple-categorical-values-for-a-single-feature-how-to-convert-them-to-binary-u
# NOTE(review): themetag is produced by ';'.join earlier in the notebook, so the
# '(no themetag listed)' sentinel below looks like it can never match - confirm.
# Long format: one entry per (portfolioId, position) holding a single theme/tag.
df_stack = (
    df[df['themetag'] != '(no themetag listed)']
    .set_index('portfolioId')
    .themetag
    .str.split(';', expand = True)
    .stack()
)
# One-hot encode every theme/tag, then collapse back to one row per portfolioId.
df_explode = (
    pd.get_dummies(df_stack, prefix = 'g')
    .groupby(level = 0)
    .sum()
    .reset_index()
)
del df_stack

In [12]:
# vectors of theme/tags
# Pick the indicator columns by name (everything except the id column and any
# vector column left by a previous run) instead of by position, so re-running
# this cell does not fold the old 'themetag_vector' lists into the new vectors.
df_explode['themetag_vector'] = (
    df_explode
    .drop(columns = ['portfolioId', 'themetag_vector'], errors = 'ignore')
    .to_numpy()
    .tolist()
)

In [14]:
# Attach each project's themetag vector to the main frame; the left join keeps
# every row of df even when df_explode has no matching portfolioId.
df = pd.merge(
    df,
    df_explode[['portfolioId', 'themetag_vector']],
    how = 'left',
    on = 'portfolioId',
)

In [15]:
# Deleting rows with no themes or tags
# themetag is built with ';'.join over the combined theme/tag list, so a project
# with neither has the empty string '' (not ';'). ';' is still excluded so rows
# removed by the previous filter stay removed. The explicit copy makes df an
# independent frame, so later column assignments do not write to a slice.
df = df[~df.themetag.isin(['', ';'])].copy()

In [16]:
# Split each ';'-joined themetag string back into a list of themes/tags
df['themetaglist'] = df['themetag'].apply(lambda joined: joined.split(';'))

In [17]:
# compute Jaccard Index to get 5 most similar projects
def get_similar_projects(target_project, df, n=5):
  """Return the ``portfolioId`` values of the projects most similar to a target.

  Similarity is the Jaccard index (size of the intersection divided by the
  size of the union) between each project's ``themetaglist`` and the target
  project's ``themetaglist``. A header line naming the target is printed.

  Parameters
  ----------
  target_project
      ``portfolioId`` of the project to compare against.
  df : pandas.DataFrame
      Frame with a ``portfolioId`` column and a ``themetaglist`` column whose
      cells are iterables of themes/tags. It is not modified.
  n : int, default 5
      Maximum number of recommendations to return.

  Returns
  -------
  pandas.Series
      ``portfolioId`` of up to ``n`` projects, most similar first, carrying
      the row labels of ``df``. The target project is never included.

  Raises
  ------
  IndexError
      If no row of ``df`` has ``portfolioId == target_project``.
  """
  target_rows = df.loc[df.portfolioId == target_project, 'themetaglist']
  if target_rows.empty:
    raise IndexError(f'portfolioId {target_project!r} not found in df')
  target_tags = set(target_rows.values[0])

  def jaccard(tags):
    tags = set(tags)
    union = tags | target_tags
    # Two empty tag sets share nothing; score 0 instead of dividing by zero.
    return len(tags & target_tags) / len(union) if union else 0.0

  # Exclude the target by id rather than by position: another project with an
  # identical tag set also scores 1.0 and may sort ahead of the target.
  # .copy() gives an independent frame so adding a column is not a write to a slice.
  candidates = df.loc[df.portfolioId != target_project, ['portfolioId', 'themetaglist']].copy()
  candidates['jaccard_sim'] = candidates.themetaglist.map(jaccard)
  print(f'Projects most similar to {target_project} based on themetag:')
  # A stable sort keeps tied projects in their original row order.
  recommended = candidates.sort_values(by = 'jaccard_sim', ascending = False, kind = 'mergesort').head(n)
  return recommended['portfolioId']
  

In [18]:
# compute Jaccard Index to get least similar project (Inspire Me)
def get_diff_projects(target_project, df, n=1):
  """Return the ``portfolioId`` of the project(s) least similar to a target.

  Similarity is the Jaccard index (size of the intersection divided by the
  size of the union) between each project's ``themetaglist`` and the target
  project's ``themetaglist``. A header line naming the target is printed.

  Parameters
  ----------
  target_project
      ``portfolioId`` of the project to compare against.
  df : pandas.DataFrame
      Frame with a ``portfolioId`` column and a ``themetaglist`` column whose
      cells are iterables of themes/tags. It is not modified.
  n : int, default 1
      Maximum number of projects to return.

  Returns
  -------
  pandas.Series
      ``portfolioId`` of up to ``n`` projects, least similar first, carrying
      the row labels of ``df``. The target project is never included.

  Raises
  ------
  IndexError
      If no row of ``df`` has ``portfolioId == target_project``.
  """
  target_rows = df.loc[df.portfolioId == target_project, 'themetaglist']
  if target_rows.empty:
    raise IndexError(f'portfolioId {target_project!r} not found in df')
  target_tags = set(target_rows.values[0])

  def jaccard(tags):
    tags = set(tags)
    union = tags | target_tags
    # Two empty tag sets share nothing; score 0 instead of dividing by zero.
    return len(tags & target_tags) / len(union) if union else 0.0

  # Leave the target out so it can never be returned as its own opposite.
  # .copy() gives an independent frame so adding a column is not a write to a slice.
  candidates = df.loc[df.portfolioId != target_project, ['portfolioId', 'themetaglist']].copy()
  candidates['jaccard_sim'] = candidates.themetaglist.map(jaccard)
  print(f'Projects least similar to {target_project} based on themetag:')
  # A stable sort keeps tied projects (e.g. many with similarity 0) in row order.
  recommended = candidates.sort_values(by = 'jaccard_sim', ascending = True, kind = 'mergesort').head(n)
  return recommended['portfolioId']