# Snapshot Jaccard Similarity

To understand the impact of changes, you can compare the Jaccard Similarity of snapshots.

Please copy this example and customize it for your own purposes!

### Imports

In [2]:
import pandas as pd
import io
from js import fetch
from tqdm import tqdm
tqdm.pandas()

### Jaccard Subroutines

In [4]:
## Calculation of Jaccard Similarity of List 1 and 2

def jaccard_similarity(list1, list2):
    print(list1, list2)
    if list1 == list2: 
        print('the lists are same')
        return float(1.0)
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(set(list1)) + len(set(list2))) - intersection
    return float(intersection) / union

In [6]:
## Construction of a comparable list from Snapshot blob

def construct_comparable_list_from_snapshot_blob(snapshot):
    for data in snapshot:
        record = data.split("\n")
        #print(record)
        df = pd.DataFrame(record)
        df[['query','docid','rating']] = df[0].str.split(',',expand=True)
        ratings_df= df[['query','docid','rating']]
        
        # Drop first row as its just column names
        ratings_mod_df = ratings_df.drop(index=ratings_df.index[0])
        
        # Remove '?' if using ispy else the next step can be ignored
        ratings_mod_df['docid'] = ratings_mod_df['docid'].str.split('?').str.get(0)
        #print(ratings_mod_df.head(10))
        
        return ratings_mod_df

In [7]:
## Subroutine for calculating Jaccard Similarity between 2 Snapshots

def jaccard_similarity(A, B):
    # Compute Jaccard Similarity
    nominator = set(A).intersection(set(B))
    denominator = set(A).union(set(B))
    Jacc_similarity = len(nominator)/len(denominator)
    #print(Jacc_similarity) 
    return (Jacc_similarity) 

### Pull data directly from Quepid's snapshot repository to calculate Jaccard Similarity

In [8]:
# Retrieve from Quepid API from Case id - 6403 and Snapshot id - 2400
rating_snapshot_1 = []
res = await fetch(f'/api/export/ratings/6403.csv?file_format=basic_snapshot&snapshot_id=2400')
rating_snapshot_1.append(await res.text())
#print(rating_snapshot_1)

# Retrieve from Quepid API from Case id - 6403 and Snapshot id - 2401
rating_snapshot_2 = []
res = await fetch(f'/api/export/ratings/6403.csv?file_format=basic_snapshot&snapshot_id=2401')
rating_snapshot_2.append(await res.text())
#print(rating_snapshot_2)

### Read and transform data in a dataframe

In [9]:
df1 = construct_comparable_list_from_snapshot_blob(rating_snapshot_1)
df2 = construct_comparable_list_from_snapshot_blob(rating_snapshot_2)
df1 = df1.groupby('query')['docid'].apply(list).reset_index(name="results")
df2 = df2.groupby('query')['docid'].apply(list).reset_index(name="results")

df_jaccard = df1[['query']].copy()
df_jaccard['baseline_results'] = df1['results']
df_jaccard['comparison_results'] = df2['results']
df_jaccard['baseline_count'] = df_jaccard.progress_apply(lambda row: len(row.baseline_results), axis = 1)
df_jaccard['comparison_count'] = df_jaccard.progress_apply(lambda row: len(row.comparison_results), axis = 1)

   query                                              docid rating
1    fmr  https://www.hudexchange.info/programs/home/hom...      1
2    fmr  https://files.hudexchange.info/resources/docum...      3
3    fmr  https://www.hudexchange.info/resource/4694/pay...      3
4    fmr  https://files.hudexchange.info/resources/docum...      2
5    fmr  https://www.hudexchange.info/faqs/programs/811...      3
6    fmr  https://www.huduser.gov/portal/datasets/fmr/sm...      3
7    fmr  https://www.hudexchange.info/homelessness-assi...      3
8    fmr  https://www.hudexchange.info/faqs/reporting-sy...      1
9    fmr  https://www.hudexchange.info/faqs/programs/con...      2
10   fmr  https://www.hudexchange.info/programs/public-h...       
   query                                              docid rating
1    fmr  https://files.hudexchange.info/resources/docum...      2
2    fmr  https://files.hudexchange.info/resources/docum...      3
3    fmr  https://www.hudexchange.info/programs/home/hom...   

can't start new thread
  t = cls(total=total, **tqdm_kwargs)
100%|██████████| 102/102 [00:00<00:00, 33999.76it/s]
100%|██████████| 102/102 [00:00<00:00, 33999.76it/s]


### Add column with jaccard similarity

In [10]:
df_jaccard['jaccard_similarity'] = df_jaccard.progress_apply(lambda row:jaccard_similarity(row.baseline_results, row.comparison_results), axis = 1)

100%|██████████| 102/102 [00:00<00:00, 16999.21it/s]


In [11]:
df_jaccard.head(10)

Unnamed: 0,query,baseline_results,comparison_results,baseline_count,comparison_count,jaccard_similarity
0,,[None],[None],1,1,1.0
1,2 cfr 200,[https://www.hudexchange.info/resource/5621/fa...,[https://www.hudexchange.info/resource/5621/fa...,10,10,1.0
2,9902,[https://www.hudexchange.info/programs/housing...,[https://www.hudexchange.info/programs/housing...,10,10,1.0
3,aaq,[https://www.hudexchange.info/trainings/course...,[https://www.hudexchange.info/trainings/course...,10,10,1.0
4,affh,[https://www.hudexchange.info/trainings/course...,[https://www.hudexchange.info/trainings/course...,10,10,0.538462
5,ahar,[https://www.hudexchange.info/homelessness-ass...,[https://www.hudexchange.info/resource/1227/in...,10,20,0.75
6,annual action plan,[https://www.hudexchange.info/programs/consoli...,[https://www.hudexchange.info/programs/consoli...,10,20,1.0
7,application,[https://www.hudexchange.info/programs/housing...,[https://www.hudexchange.info/faqs/programs/yo...,10,10,1.0
8,apply,[https://www.hudexchange.info/faqs/programs/ne...,[https://www.hudexchange.info/faqs/programs/ho...,10,10,0.538462
9,apr,[https://www.hudexchange.info/programs/e-snaps...,[https://www.hudexchange.info/programs/e-snaps...,10,10,1.0


### Export data as CSV for reporting and sharing purpose

In [13]:
df_jaccard.to_csv('jaccard_similarity_results.csv', encoding='utf-8', index=False)