# Online Learning

### mock user feedback

In [15]:
import yaml

relationship_size = 200
entry_size = 20

user_feedback = {
    int(mock_id): {
        int(contract_id) : int(like)
        for contract_id, like in zip(np.random.randint(0, 100000, size=relationship_size),
                                     np.random.randint(-2, 3, size=relationship_size))
    }
    for mock_id in np.random.randint(0, 100000, size=entry_size)
}
with open("user_likes.yaml", "w") as f:
    yaml.dump(user_feedback, f)
    
with open("user_likes.yaml", "r") as f:
    user_feedback = yaml.safe_load(f)


### online learning

In [6]:
import yaml
import numpy as np
from annoy import AnnoyIndex
from typing import List, Dict


ANN = AnnoyIndex(150, 'dot')
ANN.load("srcc/test_10.ann")
    
def embeddings_cosine_similarity(input_embedding, res: List[int]) -> Dict:
    return {_res: cosine_similarity(input_embedding, ANN.get_item_vector(_res)) for _res in res}

def double_sigmoid(x): 
    return 2/(1 + np.exp(-x)) 

def cosine_similarity(a, b):
    return np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))

def single_access(input_embedding_id):
    try:
        like_history = user_feedback[input_embedding_id]
        return like_history
    except KeyError:
        return dict()

def rerank(cosine_similarities, likes_history):
    out = dict()
    for contract_id, cosine in cosine_similarities.items():
        try:
            reweight = cosine * double_sigmoid(likes_history[contract_id])
        except KeyError:
            reweight = cosine
        out[contract_id] = reweight
    return dict(sorted(out.items(), key=lambda tup: tup[1], reverse=True))


def ranker(input_embedding_id, input_embedding, res):
    cosine_similarities = embeddings_cosine_similarity(input_embedding, res)
    likes_history = single_access(input_embedding_id)
    return rerank(cosine_similarities, likes_history)



### this is the input we would have
- we don't have the embedding id a priori, for now we can simulate but we need a new strategy
- look at hashing solution below

In [None]:
K = 5
input_embedding = ANN.get_item_vector(input_embedding_id) # just simulating
res = ANN.get_nns_by_vector(input_embedding, K*4, search_k=20, include_distances=False)
ranker(input_embedding_id, input_embedding, res)

### let's manually alter the user feedback to simulate some likes

In [140]:
user_feedback[input_embedding_id].update({217568: 2, 191777: -2, 117530: 1})
ranker(input_embedding_id, input_embedding, res)

{217568: 1.3353272736020532,
 117530: 1.1319901804170318,
 56088: 0.9300999588103345,
 171634: 0.9055805760779316,
 155996: 0.8495007652574565,
 233832: 0.8313957189805125,
 69904: 0.8291999536072038,
 157285: 0.8256972007906522,
 192425: 0.818605772219394,
 175429: 0.8085693358296288,
 68347: 0.793539159098976,
 190502: 0.7914935205661225,
 239469: 0.7898787608151207,
 209273: 0.7846621402499501,
 89438: 0.7747676084711644,
 154353: 0.7641889763127058,
 134617: 0.7602365126794943,
 70325: 0.7298757246328769,
 112583: 0.6817744812208998,
 191777: 0.20311133470071033}

### ok, re-ranking is working

In [141]:
user_feedback[input_embedding_id].update({217568: 1, 191777: -1})
ranker(input_embedding_id, input_embedding, res)

{117530: 1.1319901804170318,
 217568: 1.1083170948824492,
 56088: 0.9300999588103345,
 171634: 0.9055805760779316,
 155996: 0.8495007652574565,
 233832: 0.8313957189805125,
 69904: 0.8291999536072038,
 157285: 0.8256972007906522,
 192425: 0.818605772219394,
 175429: 0.8085693358296288,
 68347: 0.793539159098976,
 190502: 0.7914935205661225,
 239469: 0.7898787608151207,
 209273: 0.7846621402499501,
 89438: 0.7747676084711644,
 154353: 0.7641889763127058,
 134617: 0.7602365126794943,
 70325: 0.7298757246328769,
 112583: 0.6817744812208998,
 191777: 0.45825261767182535}

- it's very strong, 1 like will push it to the top of the list easily
- I think this is good since we are going to have so little feedback given the task

### how do we know for a given embedding if it's in the index?
- annoy library doesn't return the id of itself in nearest neighbour search
- we might have to figure out a hashing strategy and put in the DB

In [145]:
hash(sum(input_embedding))

1883894426938435466

### build out "DB"

In [147]:
!pwd

/Users/pablo/Documents/Coding/company_challenges/OpenZeppelin


In [148]:
import os
import pandas as pd

base_dir = "src/embeddings/"

batches = list()
for _file in os.listdir(base_dir):
    if os.path.basename(_file).split(".")[-1] == "parquet":
        batches.append(pd.read_parquet(os.path.join(base_dir, _file)))
        
df = pd.concat(batches).reset_index(drop=True)
df.head()

Unnamed: 0,contract_path,embedding
0,/Users/pablo/Documents/Coding/company_challeng...,"[-918.1727203140035, -1907.009695449844, -863...."
1,/Users/pablo/Documents/Coding/company_challeng...,"[-2.729362176731229, -14.615228615701199, 0.98..."
2,/Users/pablo/Documents/Coding/company_challeng...,"[1093.0715518859215, -1386.3616010397673, -175..."
3,/Users/pablo/Documents/Coding/company_challeng...,"[2894.6187667399645, -362.2629002183676, -1859..."
4,/Users/pablo/Documents/Coding/company_challeng...,"[-310.338902737014, -774.37499769032, -900.542..."


In [166]:
df["hash"] = df["embedding"].apply(lambda arr: hash(arr.sum()))
n_unique = len(df["hash"].unique())
print(n_unique, df.shape[0])

394515 540000


- too many collisions. let's think of another way

In [174]:
%%time

def silly_hash(arr):
    return hash("".join([str(_fl) for _fl in arr]))
df["hash"] = df["embedding"].apply(silly_hash)
n_unique = len(df["hash"].unique())
print(n_unique, df.shape[0])

394667 540000
CPU times: user 54.8 s, sys: 42.7 ms, total: 54.8 s
Wall time: 54.9 s


- that didn't work either, lets try hashlib

In [185]:
import hashlib

def array_to_hashlib(arr):
    emb_str = "".join([str(_fl) for _fl in arr])
    return hashlib.sha256(bytes(emb_str, encoding='utf-8')).hexdigest()

array_to_hashlib(arr)

'67e7896987357c2bdba4d6be4c784c6592e1f9ff828344b6f0e177f2570a3ed8'

In [187]:
df["hashlib"] = df["embedding"].apply(array_to_hashlib)

In [186]:
%%timeit
array_to_hashlib(arr)

104 µs ± 1.48 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [188]:
df["hashlib"].value_counts()

c3baa45aff6319ebe72db3252563268c01ecddd4e6b61249e0d95120a24d6cd2    647
b8cb9fe66d7910f7167fb975c79e442e3f4179898c00a3e5d274cfbb67eb8a1a    546
902a402f93ddcb1f4c815170e6046fbff48179c09865a1c40c11c3cc213a5efb    520
f243a44c5802408ff7f7c6575d3e872c6bff6cd027c4a1e01772d8757c77a979    451
1932b7959bdf57c7b81a1fa09c8815f16e256632dabcf0048239abdabfda3bcf    421
                                                                   ... 
fe0e3676db4e7cd9bd915c0f6a8f36acb152284f19437378802c914132ccb3c7      1
d348eeb79ee1d1df7183d59bd62923ac1ee3534a36fdb3645222303d4a4c26b5      1
58b9b2c3031a41fdd446278b152d8f9fb94de7416dc3c00c884bedbca99b0e35      1
c1b5bc997ae528ecc7e0f1bac4e94be72da4de75337c2998b2cf3c7cedc71da5      1
5b14a3fa93116e69adda7c70802286a0f3d50b5e3f4ec5c223cc6d10f0f6c78a      1
Name: hashlib, Length: 394667, dtype: int64

- still collisions!

In [195]:
samples = df.query("hashlib=='c3baa45aff6319ebe72db3252563268c01ecddd4e6b61249e0d95120a24d6cd2'")["contract_path"].values[:2]
samples

array(['/Users/pablo/Documents/Coding/company_challenges/OpenZeppelin/smart-contract-sanctuary-ethereum/contracts/rinkeby/78/787be48981D71E99F9cD1050A16ED39de62Af74C_InitializableImmutableAdminUpgradeabilityProxy.sol',
       '/Users/pablo/Documents/Coding/company_challenges/OpenZeppelin/smart-contract-sanctuary-ethereum/contracts/rinkeby/8b/8bc3Bc29F0Fdd778148A2C5EB21Af2f8d1C81b43_InitializableImmutableAdminUpgradeabilityProxy.sol'],
      dtype=object)

In [196]:
def read_contract(contract_path : str) -> str:
    with open(contract_path, "r") as f:
        return f.read()

In [198]:
read_contract(samples[1]) == read_contract(samples[0])

True

- there's tons of duplicated smart contracts, however ANN is not returning itself or any duplicates.
- yeah in the index/db train pipeline we are going to want to de-duplicate
- looks like around 25% of contracts in the repo are exact replicas

### want to keep contract_path for downloading + embedding hash

In [200]:
tmp_contract_path = df.loc[3, "contract_path"]


In [219]:
import requests

raw_github_base = "https://raw.githubusercontent.com/tintinweb/smart-contract-sanctuary-ethereum/master/{}"
sanctuary_base = "/Users/pablo/Documents/Coding/company_challenges/OpenZeppelin/smart-contract-sanctuary-ethereum/"

def get_contract_name(full_path_contract):
    return full_path_contract.replace(sanctuary_base, "")

def get_contract_text(path_contract):
    requests.get(raw_github_base.format(contract_name)).text

In [3]:
import os
import pandas as pd
import hashlib

sanctuary_base = "/Users/pablo/Documents/Coding/company_challenges/OpenZeppelin/smart-contract-sanctuary-ethereum/"

def array_to_hashlib(arr):
    emb_str = "".join([str(_fl) for _fl in arr])
    return hashlib.sha256(bytes(emb_str, encoding='utf-8')).hexdigest()


def get_contract_name(full_path_contract):
    return full_path_contract.replace(sanctuary_base, "")


base_dir = "src/embeddings/"

batches = list()
for _file in os.listdir(base_dir):
    if os.path.basename(_file).split(".")[-1] == "parquet":
        batches.append(pd.read_parquet(os.path.join(base_dir, _file)))
        
df = (
    pd.concat(batches)
    .reset_index(drop=True)
)
df["hashlib"] = df["embedding"].apply(array_to_hashlib)
df["contract_path"] = df["contract_path"].apply(get_contract_name)
df = (
    df[["hashlib", "contract_path", "embedding"]]
    .drop_duplicates(subset=["hashlib"])
    .rename_axis("contract_id")
)
df

Unnamed: 0_level_0,hashlib,contract_path,embedding
contract_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,67e7896987357c2bdba4d6be4c784c6592e1f9ff828344...,contracts/rinkeby/47/4769e7f852b820f2482615dec...,"[-918.1727203140035, -1907.009695449844, -863...."
1,2ba3e496bca5d8bfc7fbdfbba616d578825fc7f38cde8e...,contracts/rinkeby/47/477b39f60683e2B908282838F...,"[-2.729362176731229, -14.615228615701199, 0.98..."
2,bbe3b93072d4f8cfb449115c2b7f9b7b031ea55ab9c152...,contracts/rinkeby/47/47f7543246bb52981f7a074ed...,"[1093.0715518859215, -1386.3616010397673, -175..."
3,b80173634d8b23421595161eec82a7259543506be67b20...,contracts/rinkeby/47/47d2c426bd65312653650a8c0...,"[2894.6187667399645, -362.2629002183676, -1859..."
4,8726cf05958a6f76e3d6a94895ea9cf112b35c11f2eccf...,contracts/rinkeby/47/478ce89f8dcdf3e79b4010df7...,"[-310.338902737014, -774.37499769032, -900.542..."
...,...,...,...
579995,29262987aa41c3cad037eb1d549f92259ea23aa915845e...,contracts/ropsten/e5/e5c09742fa5dc8e6c842eb358...,"[443.84467166662216, -446.2305064126849, -26.8..."
579996,b1b6a8d6a9b9492656c487d81e65f6d9bbe55e4c74ba71...,contracts/ropsten/e5/e5283bee782643d40a13b4146...,"[575.1903692749329, -897.6308063026518, -980.0..."
579997,15661d23d4c56acfd08a9e7e9c55f1d39861c210933e0f...,contracts/ropsten/e5/E579370ac67F15E3c99b7aC14...,"[1820.0796929262578, 316.1446752306074, -1378...."
579998,b8e7a64cb702cbcc5e57d0fea8f8d63bdd4af462500e63...,contracts/ropsten/e5/E581C6C80ec7C855967B1F573...,"[334.43862503487617, -1307.8066804911941, -251..."


In [10]:
embhash_to_id = (
    df
    .reset_index()
    [["hashlib", "contract_id"]]
    .set_index("hashlib")
    ["contract_id"]
    .to_dict()
)

In [53]:
def get_input_embedding_id(input_embedding):
    hashed_embedding = array_to_hashlib(input_embedding)
    try:
        return embhash_to_id[hashed_embedding]
    except KeyError:
        return -1


def ranker(input_embedding, res):
    cosine_similarities = embeddings_cosine_similarity(input_embedding, res)
    
    input_embedding_id = get_input_embedding_id(input_embedding)
    
    likes_history = single_access(input_embedding_id)
    return rerank(cosine_similarities, likes_history)


K = 5
input_embedding = np.array(ANN.get_item_vector(22)) # just simulating
res = ANN.get_nns_by_vector(input_embedding, K*4, search_k=20, include_distances=False)
ranker(input_embedding, res)




{171634: 0.9745001895117084,
 56088: 0.9681444812693083,
 191777: 0.9619913273942239,
 233832: 0.952983225278823,
 155996: 0.9523178044431038,
 69904: 0.9522339662657148,
 157285: 0.9484288954175107,
 175429: 0.9460155491971705,
 68347: 0.9423112713734029,
 190502: 0.9345894857976565,
 209273: 0.9280944289357173,
 210466: 0.9276763279659529,
 239469: 0.92531509994335,
 89438: 0.9229566217267036,
 134617: 0.9223624365202353,
 117530: 0.9203674254208029,
 217568: 0.9121757965156764,
 154353: 0.9055211354727617,
 70325: 0.8499681898575423,
 112583: 0.8245985926037704}

In [55]:

ranked_NN = ranker(input_embedding, res)
