# Snapshot Jaccard Similarity

To understand the impact of changes, you can compare the Jaccard Similarity of snapshots.

Please copy this example and customize it for your own purposes!

### Imports

In [1]:
import pandas as pd
import io
from js import fetch

## Define the Data You Want

In [2]:
CASE_ID = 6   # Your Case
SNAPSHOT_IDS = [1,2]   # Your Snapshots.  Use the Compare Snapshot function in Quepid to see what the specific ID's are of your snapshots.

### Jaccard Subroutines

In [3]:
## Calculation of Jaccard Similarity of List 1 and 2

def jaccard_similarity(list1, list2):
    print(list1, list2)
    if list1 == list2: 
        print('the lists are same')
        return float(1.0)
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(set(list1)) + len(set(list2))) - intersection
    return float(intersection) / union

In [4]:
## Construction of a comparable list from Snapshot blob

def construct_comparable_list_from_snapshot_blob(snapshot):
    for data in snapshot:
        record = data.split("\n")
        #print(record)
        df = pd.DataFrame(record)
        df[['query','docid','rating']] = df[0].str.split(',',expand=True)
        ratings_df= df[['query','docid','rating']]
        
        # Drop first row as its just column names
        ratings_mod_df = ratings_df.drop(index=ratings_df.index[0])
        
        # Remove '?' if using ispy else the next step can be ignored
        ratings_mod_df['docid'] = ratings_mod_df['docid'].str.split('?').str.get(0)
        #print(ratings_mod_df.head(10))
        
        return ratings_mod_df

In [5]:
## Subroutine for calculating Jaccard Similarity between 2 Snapshots

def jaccard_similarity(A, B):
    # Compute Jaccard Similarity
    nominator = set(A).intersection(set(B))
    denominator = set(A).union(set(B))
    Jacc_similarity = len(nominator)/len(denominator)
    #print(Jacc_similarity) 
    return (Jacc_similarity) 

### Pull data directly from Quepid's snapshot repository to calculate Jaccard Similarity

In [6]:
# Retrieve from Quepid API from Case id - 6 and Snapshot id - 1
rating_snapshot_1 = []
res = await fetch(f'/api/export/ratings/{CASE_ID}.csv?file_format=basic_snapshot&snapshot_id={SNAPSHOT_IDS[0]}')
rating_snapshot_1.append(await res.text())
#print(rating_snapshot_1)

# Retrieve from Quepid API from Case id - 6 and Snapshot id - 2
rating_snapshot_2 = []
res = await fetch(f'/api/export/ratings/{CASE_ID}.csv?file_format=basic_snapshot&snapshot_id={SNAPSHOT_IDS[1]}')
rating_snapshot_2.append(await res.text())
#print(rating_snapshot_2)

### Read and transform data in a dataframe

In [7]:
df1 = construct_comparable_list_from_snapshot_blob(rating_snapshot_1)
df2 = construct_comparable_list_from_snapshot_blob(rating_snapshot_2)
df1 = df1.groupby('query')['docid'].apply(list).reset_index(name="results")
df2 = df2.groupby('query')['docid'].apply(list).reset_index(name="results")

df_jaccard = df1[['query']].copy()
df_jaccard['baseline_results'] = df1['results']
df_jaccard['comparison_results'] = df2['results']
df_jaccard['baseline_count'] = df_jaccard.apply(lambda row: len(row.baseline_results), axis = 1)
df_jaccard['comparison_count'] = df_jaccard.apply(lambda row: len(row.comparison_results), axis = 1)

### Add column with jaccard similarity

In [8]:
df_jaccard['jaccard_similarity'] = df_jaccard.apply(lambda row:jaccard_similarity(row.baseline_results, row.comparison_results), axis = 1)

In [9]:
df_jaccard.head(10)

Unnamed: 0,query,baseline_results,comparison_results,baseline_count,comparison_count,jaccard_similarity
0,,[None],[None],1,1,1.0
1,movie about a boxer who climbs,"[45317, 826, 46838, 683716, 769, 570731, 680, ...","[45317, 826, 46838, 683716, 769, 570731, 680, ...",10,10,1.0
2,star trek,"[193, 199, 188927, 200, 13475, 152, 201, 154, ...","[13363, 193, 199, 154, 152, 174, 157, 168, 188...",10,10,0.666667
3,star wars,"[11, 12180, 181808, 330459, 348350, 140607, 18...","[12180, 322506, 85, 1895, 18046, 11, 330459, 1...",10,10,0.538462


### Export data as CSV for reporting and sharing purpose

In [10]:
df_jaccard.to_csv('jaccard_similarity_results.csv', encoding='utf-8', index=False)

In [11]:
df_jaccard['jaccard_similarity'].mean()

0.8012820512820512