<a href="https://colab.research.google.com/github/navidadkhah/Fine-Tuning-LLMs/blob/main/Dataset/Create_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing requirement libraries

In [1]:
!pip install mutmut
!pip install astor
!pip install libcst

Collecting mutmut
  Downloading mutmut-3.0.2-py2.py3-none-any.whl.metadata (6.2 kB)
Collecting junit-xml==1.8 (from mutmut)
  Downloading junit-xml-1.8.tar.gz (10.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading mutmut-3.0.2-py2.py3-none-any.whl (20 kB)
Building wheels for collected packages: junit-xml
  Building wheel for junit-xml (setup.py) ... [?25l[?25hdone
  Created wheel for junit-xml: filename=junit_xml-1.8-py3-none-any.whl size=6762 sha256=08985b04c99e38c87cfddb4b0ae02424eaa22471720135386a6c1ae51c1d7917
  Stored in directory: /root/.cache/pip/wheels/9a/ce/b7/f14c277cea6ba09e5e53f89f1ff57a1dd43af9a17d7f065692
Successfully built junit-xml
Installing collected packages: junit-xml, mutmut
Successfully installed junit-xml-1.8 mutmut-3.0.2
Collecting astor
  Downloading astor-0.8.1-py2.py3-none-any.whl.metadata (4.2 kB)
Downloading astor-0.8.1-py2.py3-none-any.whl (27 kB)
Installing collected packages: astor
Successfully installed astor-0.8.1
Collecting libcs

In [2]:
import numpy as np
import pandas as pd
import csv
import json
from collections.abc import Iterator
import ast    # To convert code to AST
import astor  # To convert AST back to code
import random  # To select mutations randomly
import libcst as cst
import libcst.matchers as m

# Reading Dataset

In [3]:
# Reading not completed dataset
path = '/content/drive/MyDrive/Bachelor\'s project/python_codes.csv'
df = pd.read_csv(path)
print(f"Number of Dataframe rows : {df.shape}")
df.head()

Number of Dataframe rows : (23187, 2)


Unnamed: 0,language,source
0,Python 3,"n, m = map(int, input().split())\na = []\nfor ..."
1,Python 3,"n, m = map(int, input().split())\na = []\nfor ..."
2,Python 3,#In the name of Allah\n\nfrom sys import stdin...
3,Python 3,"s = input()\nl, *v = (int(x) for x in input()...."
4,Python 3,"s = input()\nl, *v = (int(x) for x in input()...."


In [4]:
# Droping language column
df = df.drop('language', axis=1)
print(df.shape)
df.head()

(23187, 1)


Unnamed: 0,source
0,"n, m = map(int, input().split())\na = []\nfor ..."
1,"n, m = map(int, input().split())\na = []\nfor ..."
2,#In the name of Allah\n\nfrom sys import stdin...
3,"s = input()\nl, *v = (int(x) for x in input()...."
4,"s = input()\nl, *v = (int(x) for x in input()...."


In [5]:
# Removing codes cotain comments
def has_comments(code):
    return '#' in code

df = df[~df['source'].apply(has_comments)]
print(df.shape)
df = df.reset_index(drop=True)
df.head()

(17743, 1)


Unnamed: 0,source
0,"n, m = map(int, input().split())\na = []\nfor ..."
1,"n, m = map(int, input().split())\na = []\nfor ..."
2,"s = input()\nl, *v = (int(x) for x in input()...."
3,"s = input()\nl, *v = (int(x) for x in input()...."
4,"n = int(input())\na = list(map(int, input().sp..."


In [6]:
# Reducing the number of rows
desired_rows = 3000
selected_list = []
final_df = pd.DataFrame(columns=['source'])
while desired_rows != len(selected_list):
  rand_row = random.randrange(0, df.shape[0])
  if rand_row not in selected_list:
    selected_list.append(rand_row)
    final_df = pd.concat([final_df, pd.DataFrame(df['source'][rand_row:rand_row+1])])
df = final_df.reset_index(drop=True)
print(df.shape)
df.head()

(3000, 1)


Unnamed: 0,source
0,"x = int(input())\nhh, mm = map(int, input().sp..."
1,__author__ = 'pxy'\nimport math\nn=int(input()...
2,"import math\nimport time\n\nh,m = list(map(int..."
3,def normalize(login: str):\n return login.l...
4,from collections import deque\nimport re\npatt...


# Mutanting code
In this section, we use mutation technique code using the CST library. Changing operations in the code and randomly replace with CST operations.
## Why we are using CST?
We are using CST instead of AST because we want to save the format of the structure too and CST does it for us.
<br>
In the output, we expect a mutant code and a line to tell us which line has changed.

In [31]:
import random
import libcst as cst
from decimal import Decimal
import math

# Initialize the global variable for operations
flag = False
op_list = []

class OperationChanger(cst.CSTTransformer):
    select_operations = [
        cst.Add, cst.Subtract, cst.Multiply, cst.Divide,
        cst.Modulo, cst.Power, cst.FloorDivide,
        cst.LeftShift, cst.RightShift,
        cst.BitOr, cst.BitXor, cst.BitAnd,
        cst.MatrixMultiply
    ]

    operations = [
        cst.Add(), cst.Subtract(), cst.Multiply(), cst.Divide(),
        cst.Modulo(), cst.Power(), cst.FloorDivide(),
        cst.LeftShift(), cst.RightShift(),
        cst.BitOr(), cst.BitXor(), cst.BitAnd(),
        cst.MatrixMultiply()
    ]

    def __init__(self):
        super().__init__()
        self.changed = False  # Flag to track if a change has been made
        global flag
        flag = False  # Ensure global flag is initialized properly

    def leave_BinaryOperation(self, original_node, updated_node):
        global flag  # Declare that we're working with the global 'flag'
        if not self.changed:
            available_op_list = list(set(self.select_operations) - set(op_list))
            if len(available_op_list) == 0:
                available_op_list = self.select_operations
                op_list.clear()

            current_op = random.choice(available_op_list)
            op_list.append(current_op)

            new_op_to_change = random.choice(self.operations)
            if isinstance(updated_node.operator, current_op):
                self.changed = True
                flag = True
                return updated_node.with_changes(
                    operator=new_op_to_change.with_changes(
                        whitespace_before=cst.SimpleWhitespace(value=''),
                        whitespace_after=cst.SimpleWhitespace(value='')
                    )
                )
        return updated_node

# Define OperandCollector and SingleOperandModifier
class OperandCollector(cst.CSTVisitor):
    def __init__(self):
        self.operands = []

    def visit_Name(self, node):
        self.operands.append(node)

class SingleOperandModifier(cst.CSTTransformer):
    def __init__(self, operand_to_modify):
        self.operand_to_modify = operand_to_modify

    def leave_Name(self, original_node, updated_node):
        if original_node == self.operand_to_modify:
            new_name = random.choice(['x', 'y', 'z'])
            return updated_node.with_changes(value=new_name)
        return updated_node

# Define the NumberCollector and NumberModifier
class NumberCollector(cst.CSTVisitor):
    def __init__(self):
        self.numbers = []

    def visit_Integer(self, node):
        self.numbers.append(node)

    def visit_Float(self, node):
        self.numbers.append(node)

class NumberModifier(cst.CSTTransformer):
    def __init__(self, target_node):
        super().__init__()
        self.target_node = target_node

    def leave_Integer(self, original_node, updated_node):
        rand_num = random.randrange(1, 200)
        if original_node == self.target_node:
            return updated_node.with_changes(value=str(rand_num))
        return updated_node

    def leave_Float(self, original_node, updated_node):
        rand_num = random.randrange(1, 200)
        rand_float = random.randrange(1, 10)
        if original_node == self.target_node:
            return updated_node.with_changes(value=str(rand_num) + '.' + str(rand_float))
        return updated_node

# Define the ComparisonCollector and SingleComparisonModifier
class ComparisonCollector(cst.CSTVisitor):
    def __init__(self):
        self.comparisons = []

    def visit_Comparison(self, node):
        self.comparisons.append(node)

class SingleComparisonModifier(cst.CSTTransformer):
    def __init__(self, target_node):
        super().__init__()
        self.target_node = target_node

    def leave_Comparison(self, original_node, updated_node):
        if original_node == self.target_node:
            modified_ops = []
            for operator in updated_node.comparisons:
                if isinstance(operator.operator, cst.LessThan):
                    modified_ops.append(
                        operator.with_changes(
                            operator=cst.GreaterThan().with_changes(
                                whitespace_before=cst.SimpleWhitespace(value=''),
                                whitespace_after=cst.SimpleWhitespace(value='')
                            )
                        )
                    )
                elif isinstance(operator.operator, cst.GreaterThan):
                    modified_ops.append(
                        operator.with_changes(
                            operator=cst.Equal().with_changes(
                                whitespace_before=cst.SimpleWhitespace(value=''),
                                whitespace_after=cst.SimpleWhitespace(value='')
                            )
                        )
                    )
                elif isinstance(operator.operator, cst.Equal):
                    modified_ops.append(
                        operator.with_changes(
                            operator=cst.LessThan().with_changes(
                                whitespace_before=cst.SimpleWhitespace(value=''),
                                whitespace_after=cst.SimpleWhitespace(value='')
                            )
                        )
                    )
                elif isinstance(operator.operator, cst.GreaterThanEqual):
                    modified_ops.append(
                        operator.with_changes(
                            operator=cst.LessThanEqual().with_changes(
                                whitespace_before=cst.SimpleWhitespace(value=''),
                                whitespace_after=cst.SimpleWhitespace(value='')
                            )
                        )
                    )
                elif isinstance(operator.operator, cst.LessThanEqual):
                    modified_ops.append(
                        operator.with_changes(
                            operator=cst.GreaterThanEqual().with_changes(
                                whitespace_before=cst.SimpleWhitespace(value=''),
                                whitespace_after=cst.SimpleWhitespace(value='')
                            )
                        )
                    )
                else:
                    modified_ops.append(operator)
            return updated_node.with_changes(comparisons=modified_ops)
        return updated_node

# Function to randomly select a section and apply the corresponding modification
def randomly_modify_code(tree):
    sections = [
        'operation',
        'value',
        'decision',
        'statement'
    ]
    selected_section = random.choice(sections)

    if selected_section == 'operation':
        print("Selected section: Operation")
        while not flag:
            transformer = OperationChanger()
            modified_tree = tree.visit(transformer)
        return modified_tree

    elif selected_section == 'value':
        print("Selected section: Value")
        collector = NumberCollector()
        tree.visit(collector)
        if collector.numbers:
            target_node = random.choice(collector.numbers)
            transformer = NumberModifier(target_node)
            modified_tree = tree.visit(transformer)
            return modified_tree

    elif selected_section == 'decision':
        print("Selected section: Decision")
        collector = ComparisonCollector()
        tree.visit(collector)
        if collector.comparisons:
            target_node = random.choice(collector.comparisons)
            transformer = SingleComparisonModifier(target_node)
            modified_tree = tree.visit(transformer)
            return modified_tree

    elif selected_section == 'statement':
        print("Selected section: Statement")
        collector = OperandCollector()
        tree.visit(collector)
        if collector.operands:
            operand_to_modify = random.choice(collector.operands)
            modifier = SingleOperandModifier(operand_to_modify)
            modified_tree = tree.visit(modifier)
            return modified_tree

    return tree

# Function to modify and print the code
def modify_and_print_code(code):
    # Parse the input code into a CST tree
    tree = cst.parse_module(code)

    # Apply random modifications
    modified_tree = randomly_modify_code(tree)

    # Convert the modified tree back to a code string
    modified_code = modified_tree.code

    # Print the modified code
    print("Original Code:")
    print(code)
    print("\nModified Code:")
    return modified_code

In [33]:
def find_difference(original_code, modified_code):
  # Find witch line has changed
  changed_lines = []
  line = 1;
  for original, mutated in zip(original_code.split('\n'), modified_code.split('\n')):
      if original != mutated:
        print(f"Line {line} has changed")
        changed_lines.append(mutated)
      line += 1

  # Output changed lines
  if changed_lines:
      print("\n--- Changed Lines ---")
      for line in changed_lines:
          print(line)
  else:
      print("No changes detected.")

In [35]:
# Sample code
code = """
a,b=map(Decimal, input().split())
if b>=a:
    print (-1)
else:
    cur=-1
    if (a|b)%2==0.5:
        cur=(a+b)//2
    else:
        cur=(a+b)/Decimal(2)
    print(cur/math.floor(cur/b))
"""

modified_code = modify_and_print_code(code)
print(modified_code)
find_difference(code, modified_code)

Selected section: Operation
Original Code:

a,b=map(Decimal, input().split())
if b>=a:
    print (-1)
else:
    cur=-1
    if (a|b)%2==0.5:
        cur=(a+b)//2
    else:
        cur=(a+b)/Decimal(2)
    print(cur/math.floor(cur/b))


Modified Code:

a,b=map(Decimal, input().split())
if b>=a:
    print (-1)
else:
    cur=-1
    if (a|b)%2==0.5:
        cur=(a%b)//2
    else:
        cur=(a+b)/Decimal(2)
    print(cur/math.floor(cur/b))

Line 8 has changed

--- Changed Lines ---
        cur=(a%b)//2
