<a href="https://colab.research.google.com/github/navidadkhah/Fine-Tuning-LLMs/blob/main/Dataset/Create_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing requirement libraries

In [1]:
!pip install mutmut
!pip install astor

Collecting mutmut
  Downloading mutmut-2.5.1.tar.gz (50 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/50.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.5/50.5 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pony (from mutmut)
  Downloading pony-0.7.19-py3-none-any.whl.metadata (2.8 kB)
Collecting junit-xml<2,>=1.8 (from mutmut)
  Downloading junit_xml-1.9-py2.py3-none-any.whl.metadata (3.2 kB)
Downloading junit_xml-1.9-py2.py3-none-any.whl (7.1 kB)
Downloading pony-0.7.19-py3-none-any.whl (317 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: mutmut
  Building wheel for mutmut (setup.py) ... [?25l[?25hdone
  Created wheel for mutmut: filename=mutmut-2.5.1-py2.py3-none-any.whl size=31323 sha256=913aec61185b2fd9b

In [2]:
import requests
import numpy as np
import ast    # To convert code to AST
import astor  # To convert AST back to code
import random  # To select mutations randomly

# Mutanting code
In this section, we mutant code using the AST library. Changing operations in the code and randomly replace with AST operations.
<br>
In the output, we expect a mutant code and a line to tell us which line has changed.

In [None]:
# Define a function to mutate and print code
def mutate_code(code_str, num_mutations):
    # Parse the string into an AST
    tree = ast.parse(code_str)

    # List of possible binary operations for mutation
    operations = [ast.Add, ast.Sub, ast.Mult, ast.Div, ast.Mod, ast.Pow,
                  ast.FloorDiv, ast.LShift, ast.RShift, ast.BitOr, ast.BitXor,
                  ast.BitAnd, ast.MatMult]

    # Split the original code into lines
    original_lines = code_str.strip().split('\n')

    for i in range(0, num_mutations):
        # Apply a random mutation to the AST
        for node in ast.walk(tree):
            if isinstance(node, ast.BinOp):
                # Randomly choose a new operator
                new_op = random.choice(operations)()
                node.op = new_op

        # Convert the mutated AST back to a string
        mutated_code = astor.to_source(tree).strip()
        mutated_lines = mutated_code.split('\n')

        # Print the mutated code
        print(f"\n--- Mutation {i + 1} ---")
        print(mutated_code)


        # Identify which lines have changed
        changed_lines = []
        line = 1;
        for original, mutated in zip(original_lines, mutated_lines):
            if original != mutated:
              print(f"Line {line} has changed")
              changed_lines.append(mutated)
            line += 1

        # Output changed lines
        if changed_lines:
            print("\n--- Changed Lines ---")
            for line in changed_lines:
                print(line)
        else:
            print("No changes detected.")

        # Reset the tree by re-parsing the original code
        tree = ast.parse(code_str)

ModuleNotFoundError: No module named 'astor'

In [None]:
# Example usage: Pass code as a string
original_code ="""
def add(a, b):
    if a > 0:
        return a - b
    else:
        return a + b
"""

mutate_code(original_code, 2)


--- Mutation 1 ---
def add(a, b):
    if a > 0:
        return a ** b
    else:
        return a ^ b
Line 3 has changed
Line 5 has changed

--- Changed Lines ---
        return a ** b
        return a ^ b

--- Mutation 2 ---
def add(a, b):
    if a > 0:
        return a ^ b
    else:
        return a - b
Line 3 has changed
Line 5 has changed

--- Changed Lines ---
        return a ^ b
        return a - b


# Create Dataset

In [13]:
# In this cell, we get datas from below url. datas contain number of available contests
# Usefull variables:
#     last_contest_id : keeps id of the last contest
#     number_of_contests : keeps number of FINISHED contests
#     contest_IDs : keeps the ID contests


url = "https://codeforces.com/api/contest.list"

response = requests.get(url)
data = response.json()

if data['status'] == "OK":
    contests = data['result']

    # Filter and print only Python submissions
    last_contest_id = 0
    not_started_yet = 0
    contest_IDs = []
    for contest in contests:
      if contest['phase'] == 'BEFORE':
        not_started_yet += 1
      else:
        if last_contest_id == 0:
          last_contest_id = contest['id']
        contest_IDs.append(contest['id'])


    number_of_contests = len(contests) - not_started_yet

    contest_IDs.sort()
    print(f"Number of available contests: {number_of_contests}\nID of the last contest: {last_contest_id}")

else:
    print("Error:", data['comment'])

Number of available contests: 1915
ID of the last contest: 2021


In [23]:
# Now, we use this Function to get the best users from contests
# Usefull variables
#     rated_list_handle : keeps all the best handles

def get_contest_users(contest_id, count, rated_list_handle):

    url = f"https://codeforces.com/api/contest.standings?contestId={contest_id}&from=1&count={count}"

    # Make the GET request to Codeforces API
    response = requests.get(url)
    data = response.json()

    # Check if the API call was successful
    if data['status'] == 'OK':
        standings = data['result']['rows']

        # Extract and print user handles
        for row in standings:
            handle = row['party']['members'][0]['handle']
            rank = row['rank']
            if handle not in rated_list_handle:
              rated_list_handle.append(handle)
            # print(f"Rank: {rank}, User: {handle}")
    else:
        print("Error:", data['comment'])

# Example: Get the first 10 users from contest 566
number_of_rated = 5
rated_list_handle = []
for index in range(len(contest_IDs), 0, -1):
    get_contest_users(index, number_of_rated, rated_list_handle)
print(f"Number of top rated contestor of all time: {len(rated_list_handle)}")
print(f"Some example:\n{rated_list_handle[0:10]}")

Error: contestId: Contest with id 1908 not found
Error: contestId: Contest with id 1897 not found
Error: contestId: Contest with id 1892 not found
Error: contestId: Contest with id 1880 not found
Error: contestId: Contest with id 1871 not found
Error: contestId: Contest with id 1865 not found
Error: contestId: Contest with id 1803 not found
Error: contestId: Contest with id 1757 not found
Error: contestId: Contest with id 1756 not found
Error: contestId: Contest with id 1745 not found
Error: contestId: Contest with id 1727 not found
Error: contestId: Contest with id 1683 not found
Error: contestId: Contest with id 1664 not found
Error: contestId: Contest with id 1655 not found
Error: contestId: Contest with id 1653 not found
Error: contestId: Contest with id 1645 not found
Error: contestId: Contest with id 1643 not found
Error: contestId: Contest with id 1640 not found
Error: contestId: Contest with id 1636 not found
Error: contestId: Contest with id 1597 not found
Error: contestId: Co

In [30]:
handle = "aid"
# API URL to get submissions for the user
url = f"https://codeforces.com/api/user.status?handle={handle}&from=1&count=10000"

# Make the GET request to Codeforces API
response = requests.get(url)
data = response.json()

# Check if the API call was successful
if data['status'] == 'OK':
    submissions = data['result']

    # Filter and print only Python submissions
    index = 1
    number_of_pythons = 0
    for submission in submissions:
        if 'Python' in submission['programmingLanguage'] and submission['verdict'] == "OK":  # Check if the language is Python
            submission_id = submission['id']
            contest_id = submission['contestId']
            problem_name = submission['problem']['name']
            programming_language = submission['programmingLanguage']
            verdict = submission['verdict']

            print(f"The index is {index}, Problem: {problem_name}, Language: {programming_language}, Verdict: {verdict}, Submission ID: {submission_id}")
            number_of_pythons += 1

        index += 1

    print(f"The total numbers: {number_of_pythons}")

else:
    print("Error:", data['comment'])


The index is 1635, Problem: Square Root, Language: Python 3, Verdict: OK, Submission ID: 20619124
The index is 2548, Problem: Cutting Puzzle, Language: Python 3, Verdict: OK, Submission ID: 9435796
The index is 3095, Problem: Манхеттенские улицы, Language: Python 3, Verdict: OK, Submission ID: 4701557
The index is 3096, Problem: Красивые последовательности, Language: Python 3, Verdict: OK, Submission ID: 4701510
The total numbers: 4


In [31]:
url = "https://codeforces.com/contest/2019/submission/284768286"

data = requests.get(url)
data

<Response [403]>

In [None]:

import requests

# Function to get users from a specific contest
def get_contest_users(contest_id, count):
    url = f"https://codeforces.com/api/user.ratedList?activeOnly=false"

    # Make the GET request to Codeforces API
    response = requests.get(url)
    data = response.json()

#     # Check if the API call was successful
#     if data['status'] == 'OK':
#         standings = data['result']['rows']

#         # Extract and print user handles
#         for row in standings:
#             handle = row['party']['members'][0]['handle']
#             rank = row['rank']
#             print(f"Rank: {rank}, User: {handle}")
#     else:
#         print("Error:", data['comment'])

# # Example: Get the first 10 users from contest 566
# get_contest_users(566, 20)

In [None]:
data

{'status': 'OK',
 'result': [{'id': 140500367,
   'contestId': 1615,
   'creationTimeSeconds': 1640364244,
   'relativeTimeSeconds': 7744,
   'problem': {'contestId': 1615,
    'index': 'F',
    'name': 'LEGOndary Grandmaster',
    'type': 'PROGRAMMING',
    'points': 3000.0,
    'rating': 2800,
    'tags': ['combinatorics', 'dp', 'math']},
   'author': {'contestId': 1615,
    'members': [{'handle': 'aid'}],
    'participantType': 'CONTESTANT',
    'ghost': False,
    'room': 487,
    'startTimeSeconds': 1640356500},
   'programmingLanguage': 'C++17 (GCC 9-64)',
   'verdict': 'OK',
   'testset': 'TESTS',
   'passedTestCount': 16,
   'timeConsumedMillis': 62,
   'memoryConsumedBytes': 64512000},
  {'id': 140492561,
   'contestId': 1615,
   'creationTimeSeconds': 1640362583,
   'relativeTimeSeconds': 6083,
   'problem': {'contestId': 1615,
    'index': 'E',
    'name': 'Purple Crayon',
    'type': 'PROGRAMMING',
    'points': 2750.0,
    'rating': 2400,
    'tags': ['data structures',
  