## Data Preparation

In [14]:
import os 
from pprint import pprint
import numpy as np
import pandas as pd


def read_csv(filepath, sep='|||',names=None):
    cnt = 0
    res_dict = {}
    if names is not None:
        for name in names:
            res_dict[name] = []
    with open(filepath, 'r') as fr:
        line = fr.readline()
        while line:
            # print(cnt, line)
            line = line.strip().split(sep)
            cnt += 1
            if names is not None:
                if len(line) != len(names):
                    raise Exception("Line {} does not have the right format".format(cnt))
                for idx in range(len(line)):
                    res_dict[names[idx]].append(line[idx])
            else:
                pass
            line = fr.readline()
    res_df = pd.DataFrame.from_dict(res_dict)
    return res_dict, res_df


columns = ['repoID', 'link', 'description', 'context', 'link_start', 'link_end', 'text_start', 'text_end', 'anno_repoID', 'link_type']
context_anno_csv = './res/matched_annotated_link_withID_131223.csv'
context_anno, context_anno_df = read_csv(context_anno_csv, sep='<|>', names=columns)
context_anno_df = context_anno_df.astype({'repoID':int, 'anno_repoID':int})
context_anno_df.sort_values(by=['repoID'],inplace=True)
pprint(context_anno_df.tail())

      repoID                                               link description  \
1434   32006                         https://pandas.pydata.org/      Pandas   
1435   32035                       http://cocodataset.org/#home     MS COCO   
1436   32060                https://www.cityscapes-dataset.com/  Cityscapes   
1437   32060                https://www.cityscapes-dataset.com/  Cityscapes   
1438   32088  https://github.com/twitter/meta-learning-lstm/...        here   

                                                context link_start link_end  \
1434  : Different machine learning frameworks have d...        486      512   
1435  Datasets: MS COCO http://cocodataset.org/#home...         18       46   
1436  Segmentation performance measured in IoU/mIoU ...         64       99   
1437  | Dataset (with Link) | Content | Resolution (...        119      154   
1438  Mini-Imagenet as described here https://github...         32      107   

     text_start text_end  anno_repoID             

In [15]:
print(len(context_anno_df))
print(len(context_anno_df.drop_duplicates()))
new_context_anno_df = context_anno_df.groupby(['repoID','context']).agg({
    'link': lambda x: list(x),
    'link_type':lambda x: list(x),
    'link_start': lambda x:list(x),
    'link_end': lambda x:list(x)
}).reset_index()
print(len(new_context_anno_df))
pprint(new_context_anno_df.head())
pprint(new_context_anno_df[['repoID','context','link','link_type']].iloc[30:37])
new_context_anno_df['context'].iloc[30] == new_context_anno_df['context'].iloc[31]
print(new_context_anno_df['context'].iloc[32], '\n', new_context_anno_df['context'].iloc[37])

1439
1439
1256
   repoID                                            context  \
0      67  pandas http://pandas.pydata.org/ : library pro...   
1      70  Validatable http://www.rubydoc.info/github/hea...   
2     155  Though you can install all the requirements yo...   
3     155  and  above have the now-standard scikit-learn ...   
4     189  A tutorial presentation http://simongog.github...   

                                                link   link_type link_start  \
0                        [http://pandas.pydata.org/]  [software]        [7]   
1  [http://www.rubydoc.info/github/heartcombo/dev...  [software]       [12]   
2           [http://conda.pydata.org/miniconda.html]  [software]      [212]   
3  [http://scikit-learn.org/stable/tutorial/stati...     [other]       [72]   
4  [http://simongog.github.io/assets/data/sdsl-sl...     [other]       [24]   

  link_end  
0     [32]  
1     [91]  
2    [250]  
3    [156]  
4     [82]  
    repoID                                     

## Prompting with Llama 2 on local GPUs

In [17]:
import requests
import time
import urllib 


class Answer:
    def __init__(self, answer, elapse):
        self.answer = answer
        self.elapse = elapse

def generate(prompt):
    response = requests.post('http://localhost:8000',json={'prompt':prompt,
                             'temperature':0.01, 'max_length':1024,
                            'context-type':'application/json'})
    if response.status_code != 200:
        return f"Error code {response.status_code}. Message {response.content}"
    else:
        return urllib.parse.unquote(response.text)

def run_prompt(prompt):
    start_time = time.time()
    answer = generate(prompt)
    end_time = time.time()
    elapse = round(end_time - start_time)
    return Answer(answer, elapse)

def display_answer(answer:Answer):
    print(f"Time to generate: {answer.elapse} seconds")
    print(answer.answer)

In [18]:
display_answer(run_prompt("What is Llama 2?"))

Time to generate: 13 seconds
What is Llama 2?

Llama 2 is a new, more powerful version of the Llama language model. It was developed by the same team that created Llama 1, and it builds upon the advancements made in the original model. Llama 2 is designed to be more accurate and efficient than its predecessor, and it has a number of new features that make it even more useful for a wide range of applications. Some of the key improvements in Llama 2 include:

* Improved accuracy: Llama 2 is more accurate than Llama 1 in a number of tasks, including language translation, text summarization, and question answering.
* Increased efficiency: Llama 2 is faster and more efficient than Llama 1, making it better suited for large-scale applications.
* New features: Llama 2 includes a number of new features that were not present in Llama 1, such as improved handling of out-of-vocabulary words and better support for multi-modal input.

Overall, Llama 2 represents a significant advancement in the fie

In [135]:
base_prompt = "<s>[INST]\n<<SYS>>\n{system_prompt}\n<</SYS>>\n\n{user_prompt}[/INST]"

fewshot_def_examples_inst ="""You act as a human annotator. First read the instructions and given examples, then annotate the last given input accordingly.
    Annotate the URLs in the input and classify the URLs with the following labels: 
    1. DatasetDirectLink - the URL is for downloading dataset files
    2. DatasetLandingPage - the URL is an introduction or a landing page for some dataset entity
    3. Software - when the URL is for some software entity
    4. Other - the URL does not fall into the above cases

    # formatting
    Input: text containing one or more URLs.
    Output: for each URL span, first output the URL span, then output one of the four above labels. Do not provide explanations.
    
    Examples:
    # Example 1:
    Input: Validatable http://www.rubydoc.info/github/heartcombo/devise/main/Devise/Models/Validatable : provides validations of email and password. It's optional and can be customized, so you're able to define your own validations.
    Output: [{"URL": "http://www.rubydoc.info/github/heartcombo/devise/main/Devise/Models/Validatable", "label":"Software"}]
    
    # Example 2:
    Input: Next we suggest you look at the comprehensive tutorial http://simongog.github.io/assets/data/sdsl-slides/tutorial  which describes all major features of the library or look at some of the provided examples examples .
    Output: [{"URL":"http://simongog.github.io/assets/data/sdsl-slides/tutorial", "label":"Other"}]
    
    # Example 3:
    Input: Laboratory for Web Algorithms http://law.di.unimi.it/datasets.php
    Output: [{"URL": "http://law.di.unimi.it/datasets.php", "label":"dataset_landing_page"}]

    # Example 4:
    Input: Gowalla https://snap.stanford.edu/data/loc-gowalla.html : the pre-processed data that we used in the paper can be downloaded here http://dawenl.github.io/data/gowalla_pro.zip .
    Output: [{"URL": "https://snap.stanford.edu/data/loc-gowalla.html", "label":"dataset_landing_page"},
    {"URL": "http://dawenl.github.io/data/gowalla_pro.zip", "label": "dataset_direct_link"}]
    """
text = "pandas http://pandas.pydata.org/ : library providing high-performance, easy-to-use data structures and data analysis tools"
_prompt = base_prompt.format(system_prompt=fewshot_def_examples_inst,user_prompt=text)
# display_answer(run_prompt(_prompt))
res = run_prompt("""<s>[INST]<<sys>>You act as a human annotator. First read the instructions and given examples, then only annotate the last given input accordingly without extra words.<</sys>>
    
    Annotate the URLs in the input and classify the URLs with the following labels: 
    1. DatasetDirectLink - the URL is for downloading dataset files
    2. DatasetLandingPage - the URL is an introduction or a landing page for some dataset entity
    3. Software - when the URL is for some software entity
    4. Other - the URL does not fall into the above cases
    
    ### formatting
    Input: text containing one or more URLs.
    Output: for each URL span, first output the URL span, then output one of the four above labels.
    
    ### Examples:
    ### Example 1:
    Input: Gowalla https://snap.stanford.edu/data/loc-gowalla.html : the pre-processed data that we used in the paper can be downloaded here http://dawenl.github.io/data/gowalla_pro.zip .
    Output: [{'URL': "https://snap.stanford.edu/data/loc-gowalla.html', 'label':'dataset_landing_page'},
    {'URL': 'http://dawenl.github.io/data/gowalla_pro.zip', 'label': 'dataset_direct_link'}]

    ### Example 2:
    Input: Next we suggest you look at the comprehensive tutorial http://simongog.github.io/assets/data/sdsl-slides/tutorial  which describes all major features of the library or look at some of the provided examples examples .
    Output: [{"URL":"http://simongog.github.io/assets/data/sdsl-slides/tutorial", "label":"Other"}]

    ### Example 3:
    Input: Laboratory for Web Algorithms http://law.di.unimi.it/datasets.php
    Output: [{"URL": "http://law.di.unimi.it/datasets.php", "label":"dataset_landing_page"}]

    ### Example 4:
    Input: Validatable http://www.rubydoc.info/github/heartcombo/devise/main/Devise/Models/Validatable : provides validations of email and password. It's optional and can be customized, so you're able to define your own validations.
    Output: [{"URL": "http://www.rubydoc.info/github/heartcombo/devise/main/Devise/Models/Validatable", "label":"Software"}]
    [/INST]
    [INST]
    ### to annotate
    Input:""" + text+"\n[/INST]")
display_answer(res)

Time to generate: 3 seconds
<s>[INST]<<sys>>You act as a human annotator. First read the instructions and given examples, then only annotate the last given input accordingly without extra words.<</sys>>
    
    Annotate the URLs in the input and classify the URLs with the following labels: 
    1. DatasetDirectLink - the URL is for downloading dataset files
    2. DatasetLandingPage - the URL is an introduction or a landing page for some dataset entity
    3. Software - when the URL is for some software entity
    4. Other - the URL does not fall into the above cases
    
    ### formatting
    Input: text containing one or more URLs.
    Output: for each URL span, first output the URL span, then output one of the four above labels.
    
    ### Examples:
    ### Example 1:
    Input: Gowalla https://snap.stanford.edu/data/loc-gowalla.html : the pre-processed data that we used in the paper can be downloaded here http://dawenl.github.io/data/gowalla_pro.zip .
    Output: [{'URL': "http

In [55]:
# display_answer(run_prompt("""<s>[INST]<<SYS>>\nYou are an annotator with expertise in ML models and datasets. \nMLModel refers to a string span that represents a named entity of a machine learning model. For neural network based machine learning models, such a tring span should correspond to an executable resource of the model in the context.  “ResNet-50” corresponds to a trained executble resource and is therefore annotated as MLModel. A MLModel usually is based on some machine learning (ML) architecture, and can be applied to some ML tasks.\nDataset refers to a named string span corresponding to an explicit dataset object in the text (e.g., “Social Bias Inference Corpus”, “SBIC”, “SQuAD”).\n\nWhat are Datasets and MLModels explicitly mentioned in the article, what is the span i.e. start and end index in the article?.\ncalculate the start and end index so i can retrieve the entity mention like article[start index:end index]\nAlso, make sure to not include mentions that are not explicitly menrioned in the article.[/INST]\n\n\n\n<</SYS>>\n\n</s><s>[INST]  \nWhat are Datasets and MLModels explicitly mentioned in the article, what is the span i.e. start and end index in the article?\nOutput answer in JSON using the following format: {"name": <>, "type": <Dataset|MLModel>,"start index": <>, "end index": <>, "explanation": <>}\nArticle: The chosen data was taken from Solcast (2021) database. The rainfall data was obtained from the Croatian Meteorological and Hydrological Service (DHMZ). Different combinations of cumulative rainfall values, such as previous 24 hours, previous 48 hours, etc., were also considered since in a number of previous studies influence of rainfall, especially storm events, were investigated (He et al., 2019; Weiskerger and Phanikumar, 2020). It was observed that a very small number of measurements have any rainfall from the previous several days, thus cumulative sums from 4 − 7 and 7 − 14 days are considered as a possible indication of soil saturation, which can happen if a larger amount of rain is present during a longer period of time. If soil is saturated, new rain can influence the activation of underground sources in the sea, which can increase the amount of E. coli.\n[/INST]"""))

In [54]:
# display_answer(run_prompt("""<s>[INST]<<SYS>>\nYou are an annotator with expertise in ML models and datasets. \nMLModel refers to a string span that represents a named entity of a machine learning model. For neural network based machine learning models, such a tring span should correspond to an executable resource of the model in the context.  “ResNet-50” corresponds to a trained executble resource and is therefore annotated as MLModel. A MLModel usually is based on some machine learning (ML) architecture, and can be applied to some ML tasks.\nDataset refers to a named string span corresponding to an explicit dataset object in the text (e.g., “Social Bias Inference Corpus”, “SBIC”, “SQuAD”).\n\nWhat are Datasets and MLModels explicitly mentioned in the article, what is the span i.e. start and end index in the article?.\ncalculate the start and end index so i can retrieve the entity mention like article[start index:end index]\nAlso, make sure to not include mentions that are not explicitly menrioned in the article.[/INST]\n\n\n\n<</SYS>>\n\n</s><s>[INST]  \nWhat are Datasets and MLModels explicitly mentioned in the article, what is the span i.e. start and end index in the article?\nOutput answer in JSON using the following format: {"name": <>, "type": <Dataset|MLModel>,"start index": <>, "end index": <>, "explanation": <>}\nArticle: In order to make the map temporally consistent with respect to the locations and not just object boundaries, we use a smoothness / decay term based on the image optical flow:\n[/INST]"""))

In [None]:
# text = "pandas http://pandas.pydata.org/ : library providing high-performance, easy-to-use data structures and data analysis tools"
ans = []
for i in range(30,len(new_context_anno_df)):
    text = new_context_anno_df['context'].iloc[i]
    repoID = new_context_anno_df['repoID'].iloc[i]
    print(new_context_anno_df['link'].iloc[i],new_context_anno_df['link_type'].iloc[i])
    url_type = [{'URL': url, 'gold_label':link_type} for url, link_type in zip(new_context_anno_df['link'].iloc[i],new_context_anno_df['link_type'].iloc[i])]
    print(text)

    _prompt = """<s>[INST]<<sys>>You act as a human annotator. First read the instructions and given examples, then only annotate the last given input accordingly without extra words.<</sys>>
    
    Annotate the URLs in the input and classify the URLs with the following labels: 
    1. DatasetDirectLink - the URL is for downloading dataset files
    2. DatasetLandingPage - the URL is an introduction or a landing page for some dataset entity
    3. Software - when the URL is for some software entity
    4. Other - the URL does not fall into the above cases
    
    # formatting
    Input: text containing one or more URLs.
    Output: for each URL span, first output the URL span, then output one of the four above labels.
    
    # Examples:
    # Example 1:
    Input: Gowalla https://snap.stanford.edu/data/loc-gowalla.html : the pre-processed data that we used in the paper can be downloaded here http://dawenl.github.io/data/gowalla_pro.zip .
    Output: [{'URL': "https://snap.stanford.edu/data/loc-gowalla.html', 'label':'dataset_landing_page'},
    {'URL': 'http://dawenl.github.io/data/gowalla_pro.zip', 'label': 'dataset_direct_link'}]

    # Example 2:
    Input: Next we suggest you look at the comprehensive tutorial http://simongog.github.io/assets/data/sdsl-slides/tutorial  which describes all major features of the library or look at some of the provided examples examples .
    Output: [{"URL":"http://simongog.github.io/assets/data/sdsl-slides/tutorial", "label":"Other"}]

    # Example 3:
    Input: Laboratory for Web Algorithms http://law.di.unimi.it/datasets.php
    Output: [{"URL": "http://law.di.unimi.it/datasets.php", "label":"dataset_landing_page"}]

    # Example 4:
    Input: Validatable http://www.rubydoc.info/github/heartcombo/devise/main/Devise/Models/Validatable : provides validations of email and password. It's optional and can be customized, so you're able to define your own validations.
    Output: [{"URL": "http://www.rubydoc.info/github/heartcombo/devise/main/Devise/Models/Validatable", "label":"Software"}]
    [/INST]
    [INST]
    # to annotate
    Input:""" + text+"\n[/INST]"
    res = run_prompt(_prompt)
    display_answer(res)
    ans.append({'text':text, 'answer': res.answer[len(_prompt):],'repoID':repoID, 'URL_gold_label':url_type})

['https://github.com/SIMEXP/glm_connectome/blob/master/real_data/SCHIZO_pipeline_basc_MSTEPS.m'] ['dataset_landing_page']
The Boostrap Analysis of Stable Clusters https://github.com/SIMEXP/glm_connectome/blob/master/real_data/SCHIZO_pipeline_basc_MSTEPS.m  pipeline, with scales selected by MSTEPS.
Time to generate: 3 seconds
<s>[INST]<<sys>>You act as a human annotator. First read the instructions and given examples, then only annotate the last given input accordingly without extra words.<</sys>>
    
    Annotate the URLs in the input and classify the URLs with the following labels: 
    1. DatasetDirectLink - the URL is for downloading dataset files
    2. DatasetLandingPage - the URL is an introduction or a landing page for some dataset entity
    3. Software - when the URL is for some software entity
    4. Other - the URL does not fall into the above cases
    
    # formatting
    Input: text containing one or more URLs.
    Output: for each URL span, first output the URL span, t

In [153]:
def parse_answer(output):
    pass

print(len(ans))
size_of_ans = [len(e['answer']) for e in ans]
print(size_of_ans)
# pprint(ans[:])
short_ans = [e for e in ans if len(e['answer'])<1000]
print(len(short_ans))
pprint(short_ans[-1:])
#print(new_context_anno_df['context'].iloc[442])


import json
#try:
#    with open('./res/prompting_res/llama_2_static_fewshot_prompt_output_2.json', 'w') as fw:
#        json.dump(ans, fw, indent=4,default=int)
# except TypeError as e:
#     print(e)


1226
[128, 178, 127, 188, 127, 188, 128, 190, 189, 177, 178, 174, 174, 175, 101, 127, 145, 120, 139, 309, 163, 113, 107, 123, 141, 90, 105, 186, 123, 299, 189, 122, 250, 193, 208, 146, 273, 187, 146, 268, 135, 130, 131, 125, 149, 145, 110, 384, 146, 338, 431, 295, 199, 112, 133, 59245, 93, 137, 134, 118, 229, 104, 384, 413, 18717, 124, 116, 241, 124, 142, 121, 207, 146, 136, 133, 130, 233, 111, 173, 118, 116, 140, 202, 235, 121, 165358, 127, 111, 124, 171, 147, 129, 208, 349, 241, 142, 140, 458, 143, 314, 49095, 135, 29434, 104, 155, 111, 118, 126, 113, 121, 123, 77, 290, 114, 101, 173, 172, 244, 197, 227, 197, 100, 141, 49691, 419074, 87, 138, 135, 132, 463, 133, 88, 173, 162, 127, 163, 108, 140, 147, 162, 261, 201, 184, 131, 124, 203, 237, 150, 160, 109, 186, 158, 43044, 113, 134, 243, 850, 371, 509, 512, 132, 95, 163, 233, 229, 122, 254, 88, 420, 256, 150, 143, 135, 326, 216, 311, 215, 84, 130, 141, 175, 104, 98, 273, 402, 544, 131, 126, 229, 135, 164, 214, 263, 405, 128, 116, 353, 

In [154]:
with open('./res/prompting_res/fewshot_instruction_examples_for_static_output_2.txt', 'w') as fw:
    fw.write("""<s>[INST]<<sys>>You act as a human annotator. First read the instructions and given examples, then only annotate the last given input accordingly without extra words.<</sys>>
    
    Annotate the URLs in the input and classify the URLs with the following labels: 
    1. DatasetDirectLink - the URL is for downloading dataset files
    2. DatasetLandingPage - the URL is an introduction or a landing page for some dataset entity
    3. Software - when the URL is for some software entity
    4. Other - the URL does not fall into the above cases
    
    # formatting
    Input: text containing one or more URLs.
    Output: for each URL span, first output the URL span, then output one of the four above labels.
    
    # Examples:
    # Example 1:
    Input: Gowalla https://snap.stanford.edu/data/loc-gowalla.html : the pre-processed data that we used in the paper can be downloaded here http://dawenl.github.io/data/gowalla_pro.zip .
    Output: [{'URL': "https://snap.stanford.edu/data/loc-gowalla.html', 'label':'dataset_landing_page'},
    {'URL': 'http://dawenl.github.io/data/gowalla_pro.zip', 'label': 'dataset_direct_link'}]

    # Example 2:
    Input: Next we suggest you look at the comprehensive tutorial http://simongog.github.io/assets/data/sdsl-slides/tutorial  which describes all major features of the library or look at some of the provided examples examples .
    Output: [{"URL":"http://simongog.github.io/assets/data/sdsl-slides/tutorial", "label":"Other"}]

    # Example 3:
    Input: Laboratory for Web Algorithms http://law.di.unimi.it/datasets.php
    Output: [{"URL": "http://law.di.unimi.it/datasets.php", "label":"dataset_landing_page"}]

    # Example 4:
    Input: Validatable http://www.rubydoc.info/github/heartcombo/devise/main/Devise/Models/Validatable : provides validations of email and password. It's optional and can be customized, so you're able to define your own validations.
    Output: [{"URL": "http://www.rubydoc.info/github/heartcombo/devise/main/Devise/Models/Validatable", "label":"Software"}]
    [/INST]
    [INST]
    # to annotate
    Input:{TEXT}[/INST]""")

In [30]:
import json
from pprint import pprint

with open('./res/prompting_res/llama_2_static_fewshot_prompt_short_output_2.json', 'r') as fr:
    llm_output = json.load(fr)

print(len(llm_output))
pprint(llm_output[-5:])

1191
[{'URL_gold_label': [{'URL': 'http://cocodataset.org/#download',
                      'gold_label': 'dataset_landing_page'}],
  'answer': "  Sure! Here's the annotation for the input:\n"
            '\n'
            "[{'URL': 'http://cocodataset.org/#download', 'label': "
            "'dataset_direct_link'}]",
  'repoID': 31962,
  'text': 'Download the images (2014 Train, 2014 Val, 2017 Test) from here '
          'http://cocodataset.org/#download'},
 {'URL_gold_label': [{'URL': 'https://pandas.pydata.org/',
                      'gold_label': 'software'}],
  'answer': '  Sure, here are the annotated URLs:\n'
            '\n'
            '* PyTorch: <https://pytorch.org/> (label: Software)\n'
            '* TensorFlow: <https://tensorflow.org/> (label: Software)\n'
            '* Hugging Face Transformers: <https://huggingface.co/> (label: '
            'Software)\n'
            '* PyTorch Lightning: <https://pytorchlightning.ai/> (label: '
            'Software)\n'
            '

### Evaluation

#### Message Understanding Conference (MUC)
MUC introduced detailed metrics in an evaluation considering different categories of errors, these metrics can be defined as in terms of comparing the response of a system against the golden annotation:
* Correct (COR): both are the same
* Incorrect (INC): the output of a system and the golden annotation don't match
* Partial (PAR): system and the golden annotation are somewhat "similar" but not the same
* Missing (MIS): a golden annotation is not captured by a system
* Spurious (SPU): system produces a response which doesn't exist in the golden annotation

#### International Workshop on Semantic Evaluation
The SemEval'13 introduced four different ways to measure precision/recall/f1-score based on the metrics defined by MUC.
* Strict: exact boundary surface string match and entity type
* Exact: exact boundary match over the surface string, regardless of the type
* Partial: partial boundary match over the surface string, regardless of the type
* Type: some overlap between the system tagged entity and the gold annotation is required

Table Source: https://www.davidsbatista.net/blog/2018/05/09/Named_Entity_Evaluation/
| Scenario | Golden Standard| | System Prediction | | |Evaluation Schema |||
|--- |------ |----|----| --|--|--|--|--|
| | Entity Type|Surface String |Entity Type |Surface String| Type|Partial|Exact| Strict |
|III|brand|TIKOSN| | |MIS|MIS|MIS|MIS| 
|II| | | brand | healthy | SPU|SPU|SPU|SPU|
|V|drug |warfarin|drug | of warfarin| COR|PAR|INC|INC|
|IV|drug|proranolol| brand|propranolol|INC|COR|COR|INC|
|I|drug|phenytoin|drug|phenytoin| COR|COR|COR|COR|
|I|Drug|theophylline|drug|theophylline|COR|COR|COR|COR|
|VI|group|contraceptives|drug|oral contraceptives|INC|PAR|INC|INC|



In [31]:
# match a URL string pair, return the match score in [0, 1]
# 1 represents exact match, 0 represents no overlapping match
from difflib import SequenceMatcher

def longest_common_substring(s1: str, s2: str) -> str:
    """Computes the longest common substring of s1 and s2"""
    seq_matcher = SequenceMatcher(isjunk=None, a=s1, b=s2)
    match = seq_matcher.find_longest_match(0, len(s1), 0, len(s2))
    # print('Point: 9')
    if match.size:
        return s1[match.a : match.a + match.size]
    else:
        return ""

def partial_match(s1 : str, s2 : str) -> float:
    """Computes the longest common substring percentage of s1 and s2 on s2"""
    # assert min(len(s1), len(s2)) > 0, "One of the given string is empty"
    # print(longest_common_substring(s1, s2))
    return len(longest_common_substring(s1, s2))/len(s2)

s1 = "https://download.visinf.tu-darmstadt.de/data/"
s2 = "https://download.visinf.tu-darmstadt.de/data/from_games/"
partial_match(s1, s2)

0.8035714285714286

In [32]:
print(len(llm_output))
print(llm_output[:1])
llm_output = [e for e in llm_output if int(e['repoID']) < 7100]
predict_gold_pairs = []

for i in range(0,len(llm_output)):
    test_str = llm_output[i]['answer']
    URL_label_gold = llm_output[i]['URL_gold_label']
    parsed_ans = parse_answer(test_str)
    if not isinstance(parsed_ans, str):
        # parsed_outputs.append({'predict': parsed_ans, 'URL_gold_label':llm_output[i]['URL_gold_label']})
        predict_gold_pairs.append((parsed_ans, URL_label_gold))
        pass
print(len(predict_gold_pairs))
pprint(predict_gold_pairs[:1])

1191
[{'text': 'The Boostrap Analysis of Stable Clusters https://github.com/SIMEXP/glm_connectome/blob/master/real_data/SCHIZO_pipeline_basc_MSTEPS.m  pipeline, with scales selected by MSTEPS.', 'answer': '  [{"URL": "https://github.com/SIMEXP/glm_connectome/blob/master/real_data/SCHIZO_pipeline_basc_MSTEPS.m", "label": "Software"}]', 'repoID': 559, 'URL_gold_label': [{'URL': 'https://github.com/SIMEXP/glm_connectome/blob/master/real_data/SCHIZO_pipeline_basc_MSTEPS.m', 'gold_label': 'dataset_landing_page'}]}]
Expecting value: line 1 column 1 (char 0)
Expecting ',' delimiter: line 1 column 72 (char 71)
Expecting ',' delimiter: line 1 column 65 (char 64)
Unterminated string starting at: line 1 column 10 (char 9)
Expecting value: line 1 column 1 (char 0)
503
[([{'URL': 'https://github.com/SIMEXP/glm_connectome/blob/master/real_data/SCHIZO_pipeline_basc_MSTEPS.m',
    'label': 'Software'}],
  [{'URL': 'https://github.com/SIMEXP/glm_connectome/blob/master/real_data/SCHIZO_pipeline_basc_MST

In [11]:
# evaluation for one pair of annotation of predict and gold standard according to SemEval definition
class anno_eval:
    def __init__(self, predict_gold_pairs):
        self.pred_gold_pairs = predict_gold_pairs
        # print('Point: 1')
        self.mapped_pairs = self._anno_mapping()
        # print('Point: 2')
        self.cor, self.inc, self.mis, self.par, self.spr = [{'strict':0, 'exact':0, 'partial':0, 'type':0} for i in range(5)]
        pass

    def _anno_mapping(self):
        res = []
        # print('Point: 3')
        for anno_pair in self.pred_gold_pairs:
            # print('Point: 4')
            pred, gold = anno_pair[0], anno_pair[1]
            if isinstance(pred, list):
                # match the URL strings: prioritize exact match, then partial 
                # each URL can only be matched once
                matched_pred = set()
                # print('Point: 5')
                for idx_gold, URL_label_gold in enumerate(gold):
                    # print('Point: 6')
                    URL_gold = URL_label_gold['URL']
                    candid_pred = list(set(range(len(pred))) - matched_pred)
                    candid_res = []
                    if len(candid_pred) == 0:
                        empty_URL_label = {"URL": "", "label":""}
                        res.append((empty_URL_label, URL_label_gold))
                        break
                    if len(candid_pred) == 1:
                        selected_idx_pred = candid_pred[0]
                        matched_pred.add(selected_idx_pred)
                        res.append((pred[selected_idx_pred], URL_label_gold))
                        break
                    for idx_pred in candid_pred:
                        # print('Point: 7')
                        URL_label_pred = pred[idx_pred]
                        URL_pred = URL_label_pred['URL']
                        if URL_pred == URL_gold:
                            candid_res.append((idx_pred,1))
                            break
                        else:
                            # print('Point: 8')
                            partial_score = partial_match(URL_pred, URL_gold)
                            candid_res.append((idx_pred, partial_score))
                    if candid_res[-1][1] == 1:
                        selected_idx_pred = candid_res[-1][0]
                    else:
                        candid_res = sorted(candid_res, key= lambda x: x[1])
                        selected_idx_pred = candid_res[-1][0]
                    matched_pred.add(selected_idx_pred)
                    res.append((pred[selected_idx_pred], URL_label_gold))
                candid_pred = list(set(range(len(pred))) - matched_pred)
                if len(candid_pred) > 0:
                    empty_URL_label = {"URL": "", "label":""}
                    for idx_pred in candid_pred:
                        res.append((pred[idx_pred], empty_URL_label))
            elif isinstance(pred, dict):
                if len(gold) == 1:
                    res.append((pred, gold[0]))
                elif len(gold) == 0:
                    empty_URL_label = {"URL": "", "label":""}
                    res.append((pred, empty_URL_label))
                else:
                    for idx_gold, URL_label_gold in enumerate(gold):
                        URL_gold = URL_label_gold['URL']
                        URL_pred = URL_label_pred['URL']
                        candid_res = []
                        if URL_pred == URL_gold:
                            candid_res.append((idx_gold,1))
                            break
                        else: 
                            partial_score = partial_match(URL_pred, URL_gold)
                            candid_res.append((idx_gold, partial_score))
                    if candid_res[-1][1] == 1:
                        selected_idx_gold = candid_res[-1][0]
                    else:
                        candid_res = sorted(candid_res, key= lambda x: x[1])
                        selected_idx_gold = candid_res[-1][0]
                    res.append((pred, gold[selected_idx_pred]))
        return res
        
    def cal_muc_types(self):
        assert len(self.mapped_pairs) > 0
        self.cor, self.inc, self.mis, self.par, self.spr = [{'strict':0, 'exact':0, 'partial':0, 'type':0} for i in range(5)]
        for pred_gold in self.mapped_pairs:
            pred, gold = pred_gold[0], pred_gold[1]
            # print('gold: ', gold)
            pred_URL, pred_label = pred['URL'], pred['label']
            try:
                gold_URL, gold_label = gold['URL'], gold['gold_label']
            except Exception as e:
                gold_URL, gold_label = gold['URL'], gold['label']
                # print(gold)
            if pred_URL == '':
                self.mis = {k: v+1 for k,v in self.mis.items()}
                continue
            elif gold_URL == '':
                self.spr = {k: v+1 for k,v in self.spr.items()}
                continue
            else:
                if pred_label == gold_label:
                    self.cor['type'] += 1
                    if pred_URL == gold_URL:
                        self.cor['strict'] += 1
                        self.cor['exact'] += 1
                        self.cor['partial'] += 1
                    else:
                        self.inc['strict'] += 1
                        self.inc['exact'] += 1
                        if pred_URL in gold_URL:
                            self.par['partial'] += 1
                        else:
                            self.inc['partial'] += 1
                else:
                    self.inc['type'] += 1
                    self.inc['strict'] += 1
                    if pred_URL == gold_URL:
                        self.cor['exact'] += 1
                        self.cor['partial'] += 1
                    else:
                        self.inc['exact'] += 1
                        if pred_URL in gold_URL:
                            self.par['partial'] += 1
                        else:
                            self.inc['partial'] += 1
        pass
        
    def precision(self, mode='strict'):
        if mode == 'strict' or mode == 'exact':
            return self.cor[mode]/(self.cor[mode]+self.inc[mode]+self.par[mode]+self.mis[mode])
        else: 
            return (self.cor[mode]+ 0.5*self.par[mode])/(self.cor[mode]+self.inc[mode]+self.par[mode]+self.mis[mode])
        pass

    def recall(self, mode='strict'):
        if mode == 'strict' or mode == 'exact':
            return self.cor[mode]/(self.cor[mode]+self.inc[mode]+self.par[mode]+self.spr[mode])
        else: 
            return (self.cor[mode]+ 0.5*self.par[mode])/(self.cor[mode]+self.inc[mode]+self.par[mode]+self.spr[mode])
        pass

    def f1_score(self, mode='strict'):
        pass

In [12]:
eval = anno_eval(predict_gold_pairs)
# print(len(eval.pred_gold_pairs))
print(len(eval.mapped_pairs))
# pprint(eval.pred_gold_pairs)
# eval.mapped_pairs
eval.cal_muc_types()
pprint(eval.mis)
pprint(eval.cor)
pprint(eval.inc)
pprint(eval.par)
pprint(eval.spr)
print(eval.precision(mode='strict'), eval.recall(mode='strict'))
print(eval.precision(mode='exact'), eval.recall(mode='exact'))
print(eval.precision(mode='partial'), eval.recall(mode='partial'))
print(eval.precision(mode='type'), eval.recall(mode='type'))
# 'ab' in 'abc'

541
{'exact': 0, 'partial': 0, 'strict': 0, 'type': 0}
{'exact': 433, 'partial': 433, 'strict': 171, 'type': 184}
{'exact': 75, 'partial': 67, 'strict': 337, 'type': 324}
{'exact': 0, 'partial': 8, 'strict': 0, 'type': 0}
{'exact': 33, 'partial': 33, 'strict': 33, 'type': 33}
0.33661417322834647 0.31608133086876156
0.8523622047244095 0.800369685767098
0.860236220472441 0.8077634011090573
0.36220472440944884 0.34011090573012936


In [8]:
test_str = llm_output[-1]['answer']


import traceback


def parse_answer(ans_str):
    ans_str = ' '.join(ans_str.split())
    # print(ans_str)
    if 'input:' in ans_str:
        ans_str = ans_str.strip().split('input:')[-1]
    elif 'the input text:' in ans_str:
        ans_str = ans_str.strip().split('the input text:')[-1]
    elif 'the annotated URLs:' in ans_str:
        ans_str = ans_str.strip().split('the annotated URLs:')[-1]
    elif 'the input URL:' in ans_str:
        ans_str = ans_str.strip().split('the input URL:')[-1]
    elif 'the input source:' in ans_str:
        ans_str = ans_str.strip().split('the input source:')[-1]
    elif 'the input URLs:' in ans_str:
        ans_str = ans_str.strip().split('the input URLs:')[-1]
    elif 'the given URL:' in ans_str:
        ans_str = ans_str.strip().split('the given URL:')[-1]
    elif 'the given URLs:' in ans_str:
        ans_str = ans_str.strip().split('the given URLs:')[-1]
    elif 'the URLs you provided:' in ans_str:
        ans_str = ans_str.strip().split('the URLs you provided:')[-1]
    elif 'Output:' in ans_str:
        ans_str = ans_str.strip().split('Output:')[-1]
    elif 'output:' in ans_str:
        ans_str = ans_str.strip().split('output:')[-1]
    #ans_str = [split for split in ans_str if split != '']
    if 'Output:' in ans_str:
        ans_str = ans_str.strip().split('Output:')[-1] 
     
    if '[{' in ans_str:
        ans_str = ans_str.split('}]')[0] + '}]'
        if '*' in ans_str:
            ans_str = ans_str.split('*')[-1]
        ans_str = ans_str.strip()
        try:
            ans_str = ans_str.replace("\'", "\"")
                            
            ans_str = json.loads(ans_str)
        except Exception as e:
            # print(ans_str)
            # traceback.print_exc()
            print(e)
    # if '*' in ans_str: 
        # print(ans_str)
    return ans_str

    
parsed_outputs = []    
for i in range(0,len(llm_output)):
    test_str = llm_output[i]['answer']
    parsed_ans = parse_answer(test_str)
    if not isinstance(parsed_ans, str):
        parsed_outputs .append({'predict': parsed_ans, 'URL_gold_label':llm_output[i]['URL_gold_label']})
        # print(llm_output[i]['repoID'], 'Input:\n', llm_output[i]['text'])
        # print('Output:\n', parsed_ans)
        # print('\n')
print(len(parsed_outputs))
# pprint(parsed_outputs)
## 258 output markdown list format with *
## 928 contains a dict format, out of which 789 can be converted to json string 
## 15 


Expecting value: line 1 column 1 (char 0)
Expecting ',' delimiter: line 1 column 72 (char 71)
Expecting ',' delimiter: line 1 column 65 (char 64)
Unterminated string starting at: line 1 column 10 (char 9)
Expecting value: line 1 column 1 (char 0)
503


In [212]:
print(type(llm_output[5]['URL_gold_label']))
print(llm_output[5]['URL_gold_label'])

<class 'list'>
[{'URL': 'https://github.com/SIMEXP/glm_connectome/blob/master/real_data/MOTOR_pipeline_MSPC_regular_grid.m', 'gold_label': 'dataset_landing_page'}]


### Prompting for classification

In [None]:
ans_1 = []
for i in range(30, len(new_context_anno_df)):
    text = new_context_anno_df['context'].iloc[i]
    repoID = new_context_anno_df['repoID'].iloc[i]
    # print(new_context_anno_df['link'].iloc[i],new_context_anno_df['link_type'].iloc[i])
    url_type = [{'URL': url, 'gold_label':link_type} for url, link_type in zip(new_context_anno_df['link'].iloc[i],new_context_anno_df['link_type'].iloc[i])]
    # print(text)
    total_input = '{"context":"' +text + '",' + '"target_URLs": [' + ','.join([e['URL'] for e in url_type]) + ']}'  

    _prompt = """<s>[INST]<<sys>>You act as a human annotator. First read the instructions and given examples, then only annotate the last given input accordingly without extra words.<</sys>>
    
    Annotate the URLs given in the input with its context and classify the URLs with the following labels: 
    1. DatasetDirectLink - the URL is for downloading dataset files
    2. DatasetLandingPage - the URL is an introduction or a landing page for some dataset entity
    3. Software - when the URL is for some software entity
    4. Other - the URL does not fall into the above cases
    
    # formatting
    Input: target URL(s) with context 
    Output: for each URL span, first output the URL span, then output one of the four above labels.
    
    # Examples:
    # Example 1:
    Input: {"context": "Gowalla https://snap.stanford.edu/data/loc-gowalla.html : the pre-processed data that we used in the paper can be downloaded here http://dawenl.github.io/data/gowalla_pro.zip .","target_URLs": ['https://snap.stanford.edu/data/loc-gowalla.html', 'http://dawenl.github.io/data/gowalla_pro.zip']}
    Output: [{"URL": "https://snap.stanford.edu/data/loc-gowalla.html", "label":"dataset_landing_page"},
    {"URL": "http://dawenl.github.io/data/gowalla_pro.zip", "label": "dataset_direct_link"}]

    # Example 2:
    Input: {"context": "Next we suggest you look at the comprehensive tutorial http://simongog.github.io/assets/data/sdsl-slides/tutorial  which describes all major features of the library or look at some of the provided examples examples .","target_URLs": ["http://simongog.github.io/assets/data/sdsl-slides/tutorial"]}
    Output: [{"URL":"http://simongog.github.io/assets/data/sdsl-slides/tutorial", "label":"Other"}]

    # Example 3:
    Input: {"context": "Laboratory for Web Algorithms http://law.di.unimi.it/datasets.php", "target_URLs":"http://law.di.unimi.it/datasets.php"}
    Output: [{"URL": "http://law.di.unimi.it/datasets.php", "label":"dataset_landing_page"}]

    # Example 4:
    Input: {"context": "Validatable http://www.rubydoc.info/github/heartcombo/devise/main/Devise/Models/Validatable : provides validations of email and password. It's optional and can be customized, so you're able to define your own validations.", "target_URLs":"http://www.rubydoc.info/github/heartcombo/devise/main/Devise/Models/Validatable"}
    Output: [{"URL": "http://www.rubydoc.info/github/heartcombo/devise/main/Devise/Models/Validatable", "label":"Software"}]
    [/INST]
    [INST]
    # to annotate
    Input:""" + total_input+"\n[/INST]"
    res = run_prompt(_prompt)
    print(res.answer[len(_prompt):])
    ans_1.append({'text':text, 'answer': res.answer[len(_prompt):],'repoID':repoID, 'URL_gold_label':url_type})

  [{"URL": "https://github.com/SIMEXP/glm_connectome/blob/master/real_data/SCHIZO_pipeline_basc_MSTEPS.m", "label": "Software"}]
  [{"URL": "https://github.com/SIMEXP/glm_connectome/blob/master/real_data/SCHIZO_pipeline_basc_regular_grid.m", "label": "Software"}]
  [{"URL": "https://github.com/SIMEXP/glm_connectome/blob/master/real_data/BLIND_pipeline_MSPC_MSTEPS.m", "label": "dataset_direct_link"}]
  [{"URL": "https://github.com/SIMEXP/glm_connectome/blob/master/real_data/BLIND_pipeline_MSPC_regular_grid.m", "label": "dataset_direct_link"}]
  [{"URL": "https://github.com/SIMEXP/glm_connectome/blob/master/real_data/MOTOR_pipeline_MSPC_MSTEPS.m", "label": "dataset_direct_link"}]
  [{"URL": "https://github.com/SIMEXP/glm_connectome/blob/master/real_data/MOTOR_pipeline_MSPC_regular_grid.m", "label": "Software"}]
  [{"URL": "https://github.com/SIMEXP/glm_connectome/blob/master/real_data/SCHIZO_pipeline_MSPC_MSTEPS.m", "label": "dataset_direct_link"}]
  [{"URL": "https://github.com/SIMEXP/g

In [29]:
# pprint(ans_1[:2])
print(len(ans_1))
size_of_ans = [len(e['answer']) for e in ans_1]
print(size_of_ans)
# pprint(ans[:])
short_ans = [e for e in ans_1 if len(e['answer'])<1000]
print(len(short_ans))
pprint(short_ans[-1:])

with open('./res/prompting_res/llama_2_static_fewshot_prompt_with_target_output_1.json', 'w') as fw:
    json.dump(ans_1, fw, indent=4,default=int)

with open('./res/prompting_res/llama_2_static_fewshot_prompt_with_target_short_output_1.json', 'w') as fw:
    json.dump(short_ans, fw, indent=4,default=int)

# with open('./res/prompting_res/llama_2_static_fewshot_prompt_output_2.json', 'w') as fw:
#    json.dump(llm_output, fw, indent=4,default=int)

1141
[128, 134, 138, 144, 138, 133, 139, 134, 137, 137, 138, 130, 130, 131, 100, 93, 95, 97, 110, 92, 113, 113, 107, 106, 93, 109, 70, 135, 107, 209, 92, 91, 96, 142, 152, 102, 115, 136, 98, 169, 91, 92, 81, 75, 105, 95, 81, 77, 102, 113, 76, 96, 92, 88, 77, 58837, 93, 112, 93, 118, 185, 104, 8, 96, 19575, 97, 116, 1, 92, 97, 77, 163, 115, 92, 74, 83, 68, 108, 145, 118, 118, 140, 171, 72, 82, 122, 108, 88, 85, 132, 126, 196, 169, 86, 94, 139, 104, 98, 227, 94, 128, 83, 20, 61, 107, 131, 148, 180, 231, 102, 102, 0, 80, 226, 157, 80, 68, 104, 121, 78, 117, 140, 20, 113, 104, 99, 109, 206, 96, 181, 139, 97, 78, 123, 110, 101, 86, 86, 89, 201, 100, 134, 116, 137, 187, 94, 79, 129, 95, 139, 142, 139, 126, 86, 94, 83, 119, 108, 108, 123, 75, 124, 388, 112, 112, 230, 154, 82, 143, 71, 135, 135, 83, 98, 1, 88, 114, 110, 91, 15, 108, 141, 62, 119, 145, 126, 124, 93, 121, 128, 139, 120, 85, 75, 172, 104, 115, 61, 87, 78, 78, 69, 128, 106, 78, 78, 86, 102, 194, 106, 116, 101, 85, 101, 173, 98, 11