# Test Implementation

John Giannini

27 September 2022


This code will test our similarity measurements on our dataset. For each PR that is a duplicate, it will rank other PRs in the same project by similarity and then record the position of the actual duplicate in that ranking in a new attribute, Match_Rank. 

The resultant data is output as both a CSV and an XLSX file. (The latter is for easy viewing, as Excel loads parts of the CSV incorrectly.)

1. [Implementation](#1)
2. [Testing Alternate Implementations](#2)


<h3>Implementation</h3> <a name="1"></a>

In [1]:
# Mount Google Drive so the project folder is reachable from this Colab runtime
import os
from google.colab import drive
drive.mount('/content/drive')
# Work from the project folder.
# NOTE(review): the local-module imports below presumably rely on this being
# the working directory -- confirm where the two .py files live.
os.chdir('/content/drive/MyDrive/SEDS')

# Import libraries
import pandas as pd
import numpy as np

# Import our code (project-local similarity measures used by the cells below)
import code_change_similarity
import file_path_similarity

# Importing Dataset: one row per pull request; later cells read the
# Project, Pull_id, Files, Diff_content and Duplicates columns
dataset = pd.read_csv("/content/drive/MyDrive/SEDS/PullRequeset_cleaned.csv")

# Preview the first rows (displayed as the cell output in the notebook)
dataset.head()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,Project,Pull_id,Title,Content,Files,Diff_info,Diff_content,Label,Duplicates
0,joomla/joomla-cms,6660.0,Undefined $model variable for postDeleteHook()...,$model is undefined at line 140 for $this->pos...,libraries/legacy/controller/admin.php,https://github.com/joomla/joomla-cms/pull/6660...,diff --git a/libraries/legacy/controller/admin...,1.0,7097.0
1,joomla/joomla-cms,3088.0,[#33256] downloaded vcard is empty,http://joomlacode.org/gf/project/joomla/tracke...,components/com_contact/views/contact/view.vcf.php,https://github.com/joomla/joomla-cms/pull/3088...,diff --git a/components/com_contact/views/cont...,1.0,3114.0
2,joomla/joomla-cms,3093.0,Bind Variable Fix Within PDO Driver,This fix is to revert back to original behavio...,libraries/joomla/database/driver/pdo.php,https://github.com/joomla/joomla-cms/pull/3093...,diff --git a/libraries/joomla/database/driver/...,1.0,3432.0
3,joomla/joomla-cms,5664.0,CAPTCHA and reCAPTCHA (Legacy) #5595,We were wrong to call it nocaptcha. We were to...,administrator/language/en-GB/en-GB.com_config....,https://github.com/joomla/joomla-cms/pull/5664...,diff --git a/administrator/language/en-GB/en-G...,1.0,5888.0
4,joomla/joomla-cms,2596.0,Fix a failing unit test [#32813],This should fix a unit test that is currently ...,tests/unit/suites/libraries/joomla/registry/fo...,https://github.com/joomla/joomla-cms/pull/2596...,diff --git a/tests/unit/suites/libraries/jooml...,1.0,2612.0


In [2]:
def combined_sim(a, b, data):
  """Return the overall similarity of rows a and b of data.

  The score is the unweighted mean of two measures:
    * file path similarity, from file_path_similarity.check_path_sim
    * deleted line similarity, from code_change_similarity.del_sim

  Args:
    a: index label of the first row in data.
    b: index label of the second row in data.
    data: DataFrame providing the 'Files' and 'Diff_content' columns.

  Returns:
    The mean of the two similarity scores. It is expected to lie between
    0 and 1, provided both helper scores do.
  """
  # file path similarity
  a_files = data.loc[a, 'Files']
  b_files = data.loc[b, 'Files']
  # make sure that these are strings (our dataset has a few blanks)
  if isinstance(a_files, str) and isinstance(b_files, str):
    file_path_sim = file_path_similarity.check_path_sim(a_files, b_files)
  else:
    # count a lack of listed files as 0 similarity
    file_path_sim = 0

  # deleted line similarity
  # Read the diffs from the `data` argument (not the notebook-level `dataset`)
  # so both measures always describe the same frame the caller passed in.
  lista = code_change_similarity.find_del_lines(data.loc[a, 'Diff_content'])
  listb = code_change_similarity.find_del_lines(data.loc[b, 'Diff_content'])
  del_sim = code_change_similarity.del_sim(lista, listb)

  # weigh provided values (currently an even split between the two measures)
  combined = (file_path_sim + del_sim) / 2
  return combined

def rank_matches(a, data):
  """Rank the other PRs of row a's project by combined similarity to row a.

  Every other row sharing row a's 'Project' is scored with combined_sim and
  the scores are ordered from most to least similar (ties keep row order).

  Args:
    a: row number of the PR being examined.
    data: DataFrame with 'Project', 'Pull_id' and 'Duplicates' columns,
      addressed by row numbers 0..len(data)-1.

  Returns:
    The zero-based position, in that ordering, of the first row whose
    'Duplicates' value equals row a's 'Pull_id'; NaN if there is none.
  """
  target_project = data.loc[a, 'Project']

  # Build (row, similarity) pairs for the same-project rows other than a
  scored = [
    (row, combined_sim(a, row, data))
    for row in range(data.shape[0])
    if row != a and target_project == data.loc[row, 'Project']
  ]

  # Most similar first
  ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

  # Walk the ranking until a row naming this PR as its duplicate turns up
  pull_id = data.loc[a, 'Pull_id']
  for position, (row, _score) in enumerate(ranked):
    if pull_id == data.loc[row, 'Duplicates']:
      return position

  # Nothing in the ranking points back at row a
  return np.nan

In [3]:
# Score a copy of the dataset so the loaded frame stays untouched
processed_dataset = dataset.copy()
processed_dataset.insert(9, 'Match_Rank', np.nan)

total_rows = len(processed_dataset)
for n in range(total_rows):
  # Rows whose Duplicates value is 0 have nothing to search for; skip them
  if processed_dataset.loc[n, 'Duplicates'] == 0:
    continue
  print(f'processing row {n}')
  # Record where the matching PR landed in the similarity ranking
  processed_dataset.loc[n, 'Match_Rank'] = rank_matches(n, processed_dataset)

# Save the results as CSV (without the index) and as XLSX
processed_dataset.to_csv(
  '/content/drive/MyDrive/SEDS/PullRequeset_processed.csv',
  index_label=False,
  index=False,
)
processed_dataset.to_excel('/content/drive/MyDrive/SEDS/PullRequeset_processed.xlsx')

processing row 0
processing row 1
processing row 2
processing row 3
processing row 4
processing row 5
processing row 6
processing row 7
processing row 8
processing row 9
processing row 10
processing row 11
processing row 12
processing row 13
processing row 14
processing row 15
processing row 16
processing row 17
processing row 18
processing row 19
processing row 20
processing row 21
processing row 22
processing row 23
processing row 24
processing row 25
processing row 26
processing row 27
processing row 28
processing row 29
processing row 30
processing row 31
processing row 32
processing row 33
processing row 34
processing row 35
processing row 36
processing row 37
processing row 38
processing row 39
processing row 40
processing row 41
processing row 42
processing row 43
processing row 44
processing row 45
processing row 46
processing row 47
processing row 48
processing row 49
processing row 50
processing row 51
processing row 52
processing row 53
processing row 54
processing row 55
pr

<h3>Testing Alternate Implementations</h3> <a name="2"></a>

In [8]:
def rank_del_matches(a, data):
  """Rank the other PRs of row a's project by deleted-line similarity only.

  Every other row sharing row a's 'Project' is scored with
  code_change_similarity.del_sim on the deleted lines extracted from each
  row's 'Diff_content'; scores are ordered from most to least similar
  (ties keep row order).

  Args:
    a: row number of the PR being examined.
    data: DataFrame with 'Project', 'Pull_id', 'Diff_content' and
      'Duplicates' columns, addressed by row numbers 0..len(data)-1.

  Returns:
    The zero-based position, in that ordering, of the first row whose
    'Duplicates' value equals row a's 'Pull_id'; NaN if there is none.
  """
  # Row a's deleted lines are extracted once and reused for every comparison
  deleted_a = code_change_similarity.find_del_lines(data.loc[a, 'Diff_content'])
  target_project = data.loc[a, 'Project']

  scored = []
  for row in range(data.shape[0]):
    # Skip row a itself and rows belonging to other projects
    if a == row or target_project != data.loc[row, 'Project']:
      continue
    deleted_b = code_change_similarity.find_del_lines(data.loc[row, 'Diff_content'])
    score = code_change_similarity.del_sim(deleted_a, deleted_b)
    scored.append((row, score))

  # Most similar first
  ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

  # Walk the ranking until a row naming this PR as its duplicate turns up
  pull_id = data.loc[a, 'Pull_id']
  for position, (row, _score) in enumerate(ranked):
    if pull_id == data.loc[row, 'Duplicates']:
      return position

  # Nothing in the ranking points back at row a
  return np.nan



def rank_file_matches(a, data):
  """Rank the other PRs of row a's project by file path similarity only.

  Every other row sharing row a's 'Project' is scored with
  file_path_similarity.check_path_sim on the two 'Files' values; a pair
  where either value is not a string (e.g. a blank cell) scores 0. Scores
  are ordered from most to least similar (ties keep row order).

  Args:
    a: row number of the PR being examined.
    data: DataFrame with 'Project', 'Pull_id', 'Files' and 'Duplicates'
      columns, addressed by row numbers 0..len(data)-1.

  Returns:
    The zero-based position, in that ordering, of the first row whose
    'Duplicates' value equals row a's 'Pull_id'; NaN if there is none.
  """
  a_files = data.loc[a, 'Files']
  target_project = data.loc[a, 'Project']

  scored = []
  for row in range(data.shape[0]):
    # Skip row a itself and rows belonging to other projects
    if a == row or target_project != data.loc[row, 'Project']:
      continue
    b_files = data.loc[row, 'Files']
    # Both sides must be real strings before the paths can be compared
    both_listed = isinstance(a_files, str) and isinstance(b_files, str)
    if both_listed:
      score = file_path_similarity.check_path_sim(a_files, b_files)
    else:
      score = 0
    scored.append((row, score))

  # Most similar first
  ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

  # Walk the ranking until a row naming this PR as its duplicate turns up
  pull_id = data.loc[a, 'Pull_id']
  for position, (row, _score) in enumerate(ranked):
    if pull_id == data.loc[row, 'Duplicates']:
      return position

  # Nothing in the ranking points back at row a
  return np.nan

In [9]:
# Evaluate the file-path-only ranking on its own copy of the dataset
processed_dataset_b = dataset.copy()
processed_dataset_b.insert(9, 'Match_Rank_File', np.nan)

total_rows = len(processed_dataset_b)
for n in range(total_rows):
  # Rows whose Duplicates value is 0 have nothing to search for; skip them
  if processed_dataset_b.loc[n, 'Duplicates'] == 0:
    continue
  print(f'processing row {n}')
  # Record where the matching PR landed in the file-path ranking
  processed_dataset_b.loc[n, 'Match_Rank_File'] = rank_file_matches(n, processed_dataset_b)

# Save the results as XLSX for easy viewing
processed_dataset_b.to_excel('/content/drive/MyDrive/SEDS/PullRequeset_processed_alt.xlsx')

processing row 0
processing row 1
processing row 2
processing row 3
processing row 4
processing row 5
processing row 6
processing row 7
processing row 8
processing row 9
processing row 10
processing row 11
processing row 12
processing row 13
processing row 14
processing row 15
processing row 16
processing row 17
processing row 18
processing row 19
processing row 20
processing row 21
processing row 22
processing row 23
processing row 24
processing row 25
processing row 26
processing row 27
processing row 28
processing row 29
processing row 30
processing row 31
processing row 32
processing row 33
processing row 34
processing row 35
processing row 36
processing row 37
processing row 38
processing row 39
processing row 40
processing row 41
processing row 42
processing row 43
processing row 44
processing row 45
processing row 46
processing row 47
processing row 48
processing row 49
processing row 50
processing row 51
processing row 52
processing row 53
processing row 54
processing row 55
pr