# Load data

In [None]:
import pandas as pd
from google.colab import files
from google.colab import drive
import csv
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# NEW: for dupls_and_non_dupls_100k.csv
!pip install gdown
!gdown 1vUcOEfKh8EkFqWDa7Uq3bZFdeC1IgHuD

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Downloading...
From: https://drive.google.com/uc?id=1vUcOEfKh8EkFqWDa7Uq3bZFdeC1IgHuD
To: /content/dupls_and_non_dupls_100k.csv
100% 244M/244M [00:02<00:00, 94.2MB/s]


In [None]:
### IMPORT SET OF DUPLS FROM CSV
# Builds the triplet DataFrame `df` from two CSV files hosted in the project's
# GitHub repository.

# NOTE(review): despite the `non_dupls` prefix, this URL points at
# `duplicate_posts_50000_with_names.csv` — presumably the duplicate pairs.
non_dupls_file_name = "https://raw.githubusercontent.com/natalievolk/StackOverflowDuplicateDetection/main/data/csv/duplicate_posts_50000_with_names.csv"
random_posts_file_name = "https://raw.githubusercontent.com/natalievolk/StackOverflowDuplicateDetection/triplets/data/RandomPosts50k.csv"

# on_bad_lines='skip' silently drops rows pandas cannot parse instead of raising
df_non_dupls = pd.read_csv(non_dupls_file_name, on_bad_lines='skip')
df_randoms = pd.read_csv(random_posts_file_name, on_bad_lines='skip')

# combine dfs into triplets
  # 1: anchor
  # 2: positive (duplicate)
  # 3: negative (random other post)
# axis=1 places the two frames side by side, matching rows by index label.
# NOTE(review): if the two frames end up with different row counts (e.g. after
# skipped lines), the shorter side is padded with NaN — confirm the lengths match.
df = pd.concat([df_non_dupls, df_randoms], axis=1)

In [None]:
print(df.head(1))

     Id1                                             Title1  \
0  18418  Elegant way to remove items from sequence in P...   

                                               Body1      Id2  \
0  <p>When I am writing code in Python, I often n...  1207406   

                                             Title2  \
0  How to remove items from a list while iterating?   

                                               Body2  \
0  <p>I'm iterating over a list of tuples in Pyth...   

                                 Tags1                Tags2      Id3  \
0  <python><optimization><set><series>  <python><iteration>  2291455   

              Title3                                              Body3  \
0  Fuzzy match in C#  <p>Does C# has its own library for Fuzzy match...   

        Tags3  
0  <c#><.net>  


In [None]:
### for 100k file
# Read the raw CSV with the csv module so every cell is kept as a string; the
# resulting frame has positional (integer) column labels and still contains the
# file's first row as data.
# newline='' is what the csv module documentation asks for: without it, line
# breaks embedded inside quoted fields (e.g. multi-line post bodies) are
# translated by the file object before the csv reader sees them.
with open('/content/drive/My Drive/dupls_and_non_dupls_100k.csv', 'r', newline='') as csvfile:
    data_reader = csv.reader(csvfile)

    # materialise all rows while the file is still open
    data = list(data_reader)

df = pd.DataFrame(data)

In [None]:
df.shape

(100001, 10)

In [None]:
# Give the positional columns 1..9 their real names, then slice away row 0 and
# column 0 (presumably the CSV header line and its index column — confirm).
df = df.rename(
    columns={
        1: 'Id1',
        2: 'Title1',
        3: 'Body1',
        4: 'Tags1',
        5: 'Id2',
        6: 'Title2',
        7: 'Body2',
        8: 'Tags2',
        9: "Label",
    }
).iloc[1:, 1:]

In [None]:
df.head(3)

Unnamed: 0,Id1,Title1,Body1,Tags1,Id2,Title2,Body2,Tags2,Label
1,75780318.0,Reason about auxuliary constructors in scala,<p>I have class methods to define auxiliary co...,<java><scala>,28577,Globalization architecture,<p>I need to store products for an e-commerce ...,<c#><architecture><localization><globalization>,0
2,75780300.0,field not getting focus js/html,<pre><code> &lt;td&gt;&lt;input class=&...,<javascript><html>,37721077,Multitrigger does not work,<p>I have a problem with my MultiTrigger and I...,<c#><xaml><multitrigger>,0
3,75780298.0,set the Python version of a venv,<p>I have installed a specific version of Pyth...,<python><pyenv>,28932,Best JavaScript compressor,<p>What is the the best JavaScript compressor ...,<javascript><compression>,0


# Replace all numbers in the body with zeroes


In [None]:
def replace_nums(df, col_name):
  """Collapse every run of digits in ``df[col_name]`` to a single ``'0'``.

  The column is overwritten on the DataFrame that was passed in, and that
  same DataFrame object is returned.
  """
  zeroed_column = df[col_name].replace(regex=True, to_replace=r'\d+', value='0')
  df[col_name] = zeroed_column
  return df

# Parse HTML in body and separate text from body

In [None]:
!pip install bs4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bs4
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... [?25l[?25hdone
  Created wheel for bs4: filename=bs4-0.0.1-py3-none-any.whl size=1270 sha256=fc6e6810e6cbd07b8299a63588d31239aba4a661f78af7b210f122306175becf
  Stored in directory: /root/.cache/pip/wheels/73/2b/cb/099980278a0c9a3e57ff1a89875ec07bfa0b6fcbebb9a8cad3
Successfully built bs4
Installing collected packages: bs4
Successfully installed bs4-0.0.1


In [None]:
from bs4 import BeautifulSoup

In [None]:
def remove_duplicate_blockquote(body_html_1, body_html_2):
  '''
  Strip blockquotes that mention "Duplicate" from two post bodies.

  body_html_1, body_html_2 -> HTML markup of the two post bodies

  returns: pd.Series with keys "Body1" and "Body2" holding the parsed
  BeautifulSoup documents (not strings) with the matching blockquotes removed
  '''
  cleaned = []
  for body_html in (body_html_1, body_html_2):
    soup = BeautifulSoup(body_html, "html.parser")

    # remove the blockquote element and its contents
    for bq in soup.find_all("blockquote"):
      # `"Duplicate" in bq` only tests membership in the tag's direct children,
      # so text nested inside child tags never matched; search the
      # blockquote's full text instead.
      if "Duplicate" in bq.get_text():
        bq.decompose()

    cleaned.append(soup)

  return pd.Series({"Body1": cleaned[0], "Body2": cleaned[1]})

In [None]:
def _split_text_and_code(body_html):
  '''
  Parse one post body and split it into prose text and code.

  body_html -> HTML markup of the post; any non-string value (None, NaN)
               is treated as an empty post

  returns: (text, code) tuple of strings, where code is the text of every
  <code> element joined with newlines and text is what remains
  '''
  if not isinstance(body_html, str):
    body_html = ""

  soup = BeautifulSoup(body_html, "html.parser")

  # remove the first blockquote that mentions "Duplicate", with its contents.
  # `"Duplicate" in bq` would only test the tag's direct children, so the
  # blockquote's full text is searched instead.
  for bq in soup.find_all("blockquote"):
    if "Duplicate" in bq.get_text():
      bq.decompose()
      break

  # separate code blocks: keep their text, then drop them from the tree so
  # they do not show up in the prose
  code = []
  for cb in soup.find_all("code"):
    code.append(cb.get_text())
    cb.decompose()

  return soup.get_text(), "\n".join(code)


def separate_code_text(body1, body2, body3=None):
  '''
  - removes any indicators of duplicate posts (ie, blockquote with duplicate message)
  - separates code from text
  - removes all html tags

  body1, body2 -> HTML bodies of the two posts
  body3        -> optional HTML body of a third post; skipped when None

  returns: pd.Series with "BodyText<n>" / "BodyCode<n>" entries for each of
  the body inputs (n = 1, 2, and 3 only when body3 is given)
  '''
  text1, code1 = _split_text_and_code(body1)
  text2, code2 = _split_text_and_code(body2)

  parsed = {"BodyText1": text1, "BodyCode1": code1, "BodyText2": text2, "BodyCode2": code2}

  if body3 is not None:
    text3, code3 = _split_text_and_code(body3)
    parsed["BodyText3"] = text3
    parsed["BodyCode3"] = code3

  return pd.Series(parsed)


In [None]:
# Split each row's two HTML bodies into text/code columns, then discard the raw HTML.
df[["BodyText1", "BodyCode1", "BodyText2", "BodyCode2"]] = df.apply(
    lambda row: separate_code_text(row['Body1'], row['Body2']),
    axis=1,
)
df = df.drop(columns=['Body1', 'Body2'])

In [None]:
# # process bodies into html-free code and text
# df[["BodyText1", "BodyCode1", "BodyText2", "BodyCode2", "BodyText3", "BodyCode3"]] = df.apply(lambda x: separate_code_text(x['Body1'], x['Body2'], x['Body3']), axis=1)

# # remove columns for old unparsed body attributes
# df.drop(['Body1', 'Body2', 'Body3'], axis=1, inplace=True)

In [None]:
df.head(2)

Unnamed: 0,Id1,Title1,Id2,Title2,Tags1,Tags2,Id3,Title3,Tags3,BodyText1,BodyCode1,BodyText2,BodyCode2,BodyText3,BodyCode3,TagSimilarity1,TagSimilarity2
0,18418,Elegant way to remove items from sequence in P...,1207406,How to remove items from a list while iterating?,<python><optimization><set><series>,<python><iteration>,2291455,Fuzzy match in C#,<c#><.net>,"When I am writing code in Python, I often need...",for name in names:\n if name[-5:] == 'Smith...,"I'm iterating over a list of tuples in Python,...",for tup in somelist:\n if determine(tup):\n...,Does C# has its own library for Fuzzy match(Fu...,,0.5,0.25
1,49146,How can I make an EXE file from a Python program?,2933,Create a directly-executable cross-platform GU...,<python><exe><executable>,<python><user-interface><deployment><tkinter><...,58647800,How to stop search after finding the first des...,<python><regex>,I've used several modules to make EXEs for Pyt...,,Python works on multiple platforms and can be ...,,I've a string having and want to get an outpu...,CUSTOMER_SEGMENT_PRIV\nCUSTOMER_SEGMENT equal ...,0.2,0.333333


In [None]:
# write df to csv
path = '/content/drive/My Drive/processed_dupls_and_non_dupls_100k.csv'
# newline='' keeps the text layer from translating line endings, so the CSV
# writer alone decides how rows and line breaks inside fields are written.
with open(path, 'w', encoding = 'utf-8-sig', newline='') as f:
  df.to_csv(f)
# df.to_csv("triplet_data.csv")

In [None]:
from google.colab import files
# NOTE(review): 'triplet_data.csv' is only written by the
# `df.to_csv("triplet_data.csv")` line that is commented out in the previous
# cell — confirm the file exists before running this download.
files.download('triplet_data.csv') 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
def remove_html(html_text_1, html_text_2):
    """Return the tag-free text of two HTML snippets.

    Each input is parsed with the ``html.parser`` backend and flattened with
    ``get_text()``; the results come back as a Series keyed ``"BodyText1"``
    and ``"BodyText2"``.
    """
    plain_first, plain_second = (
        BeautifulSoup(markup, "html.parser").get_text()
        for markup in (html_text_1, html_text_2)
    )
    return pd.Series({"BodyText1": plain_first, "BodyText2": plain_second})

In [None]:
## this is unnecessary actually
# df_non_dupls[["BodyText1", "BodyText2"]] = df_non_dupls.apply(lambda x: remove_html(x['BodyText1'], x['BodyText2']), axis=1)

# **Find posts with similar tags**
- posts where more than 50% of the tags in Post 1 also appear in Post 2, AND vice versa (the filter below uses a strict `> THRESHOLD` comparison)

In [None]:
def tag_similarity(tags1, tags2):
  '''
  Measure how much the tag sets of two posts overlap.

  tags1, tags2 -> both are strings formatted like "<tagA><tagB><etc>";
                  missing values (None, NaN, "") are accepted

  returns: pd.Series with
    "Similarity1" -> number of shared tags / number of distinct tags in tags2
    "Similarity2" -> number of shared tags / number of distinct tags in tags1
  Both entries are 0.0 when either side has no tags.
  '''
  def _tag_set(tags):
    # NaN coming from pandas is a (truthy) float, so anything that is not a
    # string counts as "no tags"
    if not isinstance(tags, str):
      return set()
    # remove opening + closing < >, then split on the "><" between tags
    return {tag for tag in tags.strip()[1:-1].split("><") if tag}

  set1, set2 = _tag_set(tags1), _tag_set(tags2)

  # always return a Series (never a bare 0) so that df.apply produces the
  # same two columns for every row
  if not set1 or not set2:
    return pd.Series({"Similarity1": 0.0, "Similarity2": 0.0})

  shared = len(set1 & set2)

  return pd.Series({"Similarity1": shared / len(set2), "Similarity2": shared / len(set1)})

In [None]:
THRESHOLD = 0.5

# NOTE(review): the saved output of this cell is an error. The columns
# 'Similarity1' / 'Similarity2' are not created on `df` anywhere in the visible
# notebook; a later cell runs the same filter on 'TagSimilarity1' /
# 'TagSimilarity2'. This cell looks superseded — confirm before relying on it.
df_similar_tags = df[(df['Similarity1'] > THRESHOLD) & (df['Similarity2'] > THRESHOLD)]

NameError: ignored

In [None]:
# Score every row's tag overlap in both directions and store the two ratios.
df[["TagSimilarity1", "TagSimilarity2"]] = df.apply(
    lambda row: tag_similarity(row['Tags1'], row['Tags2']),
    axis=1,
)
print(df)

In [None]:
THRESHOLD = 0.5

# keep only the rows whose tag overlap is strictly above the threshold in both directions
df_similar_tags = df.loc[
    df['TagSimilarity1'].gt(THRESHOLD) & df['TagSimilarity2'].gt(THRESHOLD)
]
df_similar_tags

**Note:** with a threshold of 50%, this filter reduced the 50,000 rows to 13,848 rows.

# Find Posts With Similar Code Using CodeBERT

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m58.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m96.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.27.4


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = AutoModel.from_pretrained("microsoft/codebert-base")

In [None]:
# generate natural language and code tokens
nl_tokens = tokenizer.tokenize("return maximum value")
code_tokens = tokenizer.tokenize("def max(a,b): if a>b: return a else return b")

In [None]:
nl_tokens

['return', 'Ġmaximum', 'Ġvalue']

In [None]:

code_tokens

['def',
 'Ġmax',
 '(',
 'a',
 ',',
 'b',
 '):',
 'Ġif',
 'Ġa',
 '>',
 'b',
 ':',
 'Ġreturn',
 'Ġa',
 'Ġelse',
 'Ġreturn',
 'Ġb']

In [None]:
# build the joint sequence: cls token, NL tokens, sep token, code tokens, eos token
tokens = [
    tokenizer.cls_token,
    *nl_tokens,
    tokenizer.sep_token,
    *code_tokens,
    tokenizer.eos_token,
]
# map every token to its integer id
tokens_ids = tokenizer.convert_tokens_to_ids(tokens)

# add a leading batch dimension, run the model and keep its first output
# (the context embeddings printed in the next cell)
batch_of_one = torch.tensor(tokens_ids).unsqueeze(0)
context_embeddings = model(batch_of_one)[0]

In [None]:
print(context_embeddings.shape)
print(context_embeddings)

torch.Size([1, 23, 768])
tensor([[[-0.1423,  0.3766,  0.0443,  ..., -0.2513, -0.3099,  0.3183],
         [-0.5739,  0.1333,  0.2314,  ..., -0.1240, -0.1219,  0.2033],
         [-0.1579,  0.1335,  0.0291,  ...,  0.2340, -0.8801,  0.6216],
         ...,
         [-0.4042,  0.2284,  0.5241,  ..., -0.2046, -0.2419,  0.7031],
         [-0.3894,  0.4603,  0.4797,  ..., -0.3335, -0.6049,  0.4730],
         [-0.1433,  0.3785,  0.0450,  ..., -0.2527, -0.3121,  0.3207]]],
       grad_fn=<NativeLayerNormBackward0>)


In [None]:
def generate_embeddings(nl_snippet, code_snippet, max_length=512):
    '''
    Embed a natural-language / code pair with the module-level `tokenizer`
    and `model` (CodeBERT at the point where this function is defined).

    nl_snippet: str is a string of preprocessed sentences
    code_snippet: str is a string of preprocessed code. Should have new lines and tabs eliminated
    and only spaces separating lines
    max_length: int, upper bound on the sequence length including the three
    special tokens; longer inputs are truncated. The default of 512 is assumed
    to be CodeBERT's input limit — confirm against the model configuration.

    returns: tensor of context embeddings with a leading batch dimension of 1
    (one vector per token), computed without gradient tracking

    NOTE: this reads the globals `tokenizer` and `model` at call time; a later
    cell rebinds `model` to a SentenceTransformer, after which this function
    no longer runs against CodeBERT.
    '''
    nl_tokens = tokenizer.tokenize(nl_snippet)
    code_tokens = tokenizer.tokenize(code_snippet)

    # leave room for the cls / sep / eos tokens added below
    budget = max(max_length - 3, 0)
    if len(nl_tokens) + len(code_tokens) > budget:
        # trim so that each side keeps up to half of the budget, and a short
        # side donates its unused share to the other one
        nl_keep = min(len(nl_tokens), max(budget // 2, budget - len(code_tokens)))
        nl_tokens = nl_tokens[:nl_keep]
        code_tokens = code_tokens[:budget - nl_keep]

    # combine NL and code tokens by joining them with a sep_token
    tokens = [tokenizer.cls_token] + nl_tokens + [tokenizer.sep_token] + code_tokens + [tokenizer.eos_token]
    # convert to integers
    tokens_ids = tokenizer.convert_tokens_to_ids(tokens)

    # generate context embeddings
    # expand the dimension, then pass into model; no_grad avoids building an
    # autograd graph for what is a pure inference call
    with torch.no_grad():
        context_embeddings = model(torch.tensor(tokens_ids)[None, :])[0]

    return context_embeddings

In [None]:
nl_snippet = "print hello world"
code_snippet = "def f(): print('hello world')"
generate_embeddings(nl_snippet, code_snippet)

tensor([[    0, 17265, 20760,   232,     2,  9232,   856, 49536,  5780, 45803,
         42891,   232, 27645,     2]])


tensor([[[-0.1555,  0.3123,  0.1191,  ..., -0.1754, -0.3912,  0.3411],
         [-0.2077, -0.2204,  0.2502,  ..., -0.0568, -0.4406,  0.7089],
         [-0.4981,  0.1659,  0.1874,  ..., -0.0539, -0.4445,  0.6419],
         ...,
         [-0.1161,  0.1250,  0.2387,  ..., -0.3774, -0.5434,  0.5439],
         [-0.4366,  0.2959,  0.0358,  ..., -1.1059, -0.2204,  0.8541],
         [-0.1553,  0.3132,  0.1199,  ..., -0.1752, -0.3921,  0.3417]]],
       grad_fn=<NativeLayerNormBackward0>)

In [None]:
!pip install sentence_transformers
from sentence_transformers import SentenceTransformer, util
# NOTE(review): this rebinds the global `model`, which an earlier cell set to
# the CodeBERT model that generate_embeddings() reads — once this cell has run,
# generate_embeddings() will call the SentenceTransformer instead.
model = SentenceTransformer('all-MiniLM-L6-v2')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m58.9 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence_transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125942 sha256=519a104407fda519422e85333f7ebd0dd5fc8f3dad66b3636ec04d35c18162c2
  Stored in directory: /root/.cache/pip/wheels/71/67/06/162a3760c40d7

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
# Two parallel lists of sentences; entry k of one list is compared with entry k of the other
sentences1 = [
    'The cat sits outside',
    'A man is playing guitar',
    'The new movie is awesome',
]

sentences2 = [
    'The dog plays in the garden',
    'A woman watches TV',
    'The new movie is so great',
]

# Encode each list into a tensor of sentence embeddings
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)
print(embeddings1.shape)

# All-pairs cosine similarity between the two lists
cosine_scores = util.cos_sim(embeddings1, embeddings2)

# Report only the diagonal: the score of each sentence with its counterpart
for i, (first_sentence, second_sentence) in enumerate(zip(sentences1, sentences2)):
    print("{} \t\t {} \t\t Score: {:.4f}".format(first_sentence, second_sentence, cosine_scores[i][i]))

torch.Size([3, 384])
The cat sits outside 		 The dog plays in the garden 		 Score: 0.2838
A man is playing guitar 		 A woman watches TV 		 Score: -0.0327
The new movie is awesome 		 The new movie is so great 		 Score: 0.8939
