# Snapshot Jaccard Similarity

To understand the impact of changes, you can compare the Jaccard Similarity of snapshots.

Please copy this example and customize it for your own purposes!

### Imports

In [1]:
import pandas as pd
import io
from js import fetch
from tqdm import tqdm
tqdm.pandas()

### Jaccard Subroutines

In [2]:
## Calculation of Jaccard Similarity of List 1 and 2

def jaccard_similarity(list1, list2):
    print(list1, list2)
    if list1 == list2: 
        print('the lists are same')
        return float(1.0)
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(set(list1)) + len(set(list2))) - intersection
    return float(intersection) / union

In [3]:
## Construction of a comparable list from Snapshot blob

def construct_comparable_list_from_snapshot_blob(snapshot):
    for data in snapshot:
        record = data.split("\n")
        #print(record)
        df = pd.DataFrame(record)
        df[['query','docid','rating']] = df[0].str.split(',',expand=True)
        ratings_df= df[['query','docid','rating']]
        
        # Drop first row as its just column names
        ratings_mod_df = ratings_df.drop(index=ratings_df.index[0])
        
        # Remove '?' if using ispy else the next step can be ignored
        ratings_mod_df['docid'] = ratings_mod_df['docid'].str.split('?').str.get(0)
        #print(ratings_mod_df.head(10))
        
        return ratings_mod_df

In [4]:
## Subroutine for calculating Jaccard Similarity between 2 Snapshots

def jaccard_similarity(A, B):
    # Compute Jaccard Similarity
    nominator = set(A).intersection(set(B))
    denominator = set(A).union(set(B))
    Jacc_similarity = len(nominator)/len(denominator)
    #print(Jacc_similarity) 
    return (Jacc_similarity) 

### Pull data directly from Quepid's snapshot repository to calculate Jaccard Similarity

In [5]:
# Retrieve from Quepid API from Case id - 6508 and Snapshot id - 2411
rating_snapshot_1 = []
res = await fetch(f'/api/export/ratings/6508.csv?file_format=basic_snapshot&snapshot_id=2411')
rating_snapshot_1.append(await res.text())
#print(rating_snapshot_1)

# Retrieve from Quepid API from Case id - 6508 and Snapshot id - 2412
rating_snapshot_2 = []
res = await fetch(f'/api/export/ratings/6508.csv?file_format=basic_snapshot&snapshot_id=2412')
rating_snapshot_2.append(await res.text())
#print(rating_snapshot_2)

### Read and transform data in a dataframe

In [6]:
df1 = construct_comparable_list_from_snapshot_blob(rating_snapshot_1)
df2 = construct_comparable_list_from_snapshot_blob(rating_snapshot_2)
df1 = df1.groupby('query')['docid'].apply(list).reset_index(name="results")
df2 = df2.groupby('query')['docid'].apply(list).reset_index(name="results")

df_jaccard = df1[['query']].copy()
df_jaccard['baseline_results'] = df1['results']
df_jaccard['comparison_results'] = df2['results']
df_jaccard['baseline_count'] = df_jaccard.progress_apply(lambda row: len(row.baseline_results), axis = 1)
df_jaccard['comparison_count'] = df_jaccard.progress_apply(lambda row: len(row.comparison_results), axis = 1)

can't start new thread
  t = cls(total=total, **tqdm_kwargs)
100%|██████████| 66/66 [00:00<00:00, 22001.59it/s]
100%|██████████| 66/66 [00:00<00:00, 21999.85it/s]


### Add column with jaccard similarity

In [7]:
df_jaccard['jaccard_similarity'] = df_jaccard.progress_apply(lambda row:jaccard_similarity(row.baseline_results, row.comparison_results), axis = 1)

100%|██████████| 66/66 [00:00<00:00, 13200.33it/s]


In [8]:
df_jaccard.head(10)

Unnamed: 0,query,baseline_results,comparison_results,baseline_count,comparison_count,jaccard_similarity
0,,[None],[None],1,1,1.0
1,300,"[17927, 1271, 11064, 10439, 2179, 9889, 10153,...","[17927, 1271, 11064, 10439, 2179, 9889, 10153,...",10,10,1.0
2,300 rise of an empire,"[53182, 98, 1891, 554152, 522627, 11, 11064, 9...",[53182],10,1,0.1
3,a lego movie,"[137106, 324849, 280217, 251471, 274862, 50499...","[137106, 324849, 280217, 251471, 274862, 50499...",10,10,1.0
4,annie,"[1700, 1162, 25209, 627, 703, 103, 688, 248, 8...","[1700, 1162, 25209, 627, 703, 103, 688, 248, 8...",10,10,1.0
5,battlestar galactica,[148980],[148980],1,1,1.0
6,black swan,"[44214, 128578, 5693, 84332, 411019, 117, 2732...","[44214, 128578, 5693, 84332, 411019, 117, 2732...",10,10,1.0
7,christmas vacation,"[5825, 80278, 771, 9479, 41521, 10437, 11153, ...","[5825, 80278, 771, 9479, 41521, 10437, 11153, ...",10,10,1.0
8,cocoon,"[15389, 11285]","[15389, 11285]",2,2,1.0
9,contact,"[14278, 679, 73, 82702, 199, 137113, 68730, 68...","[14278, 679, 73, 82702, 199, 137113, 68730, 68...",10,10,1.0


### Export data as CSV for reporting and sharing purpose

In [9]:
df_jaccard.to_csv('jaccard_similarity_results.csv', encoding='utf-8', index=False)

In [12]:
df_jaccard['jaccard_similarity'].mean()

0.7875583269968297