### Finding the methods of a Java file and adding the @Benchmark annotation

This program iterates over every Java file in the directory and adds an `@Benchmark` annotation above each method.

In [39]:
import os
import re

# Folder containing the Java source files to instrument
folder_path = r'C:\Users\Mettle\Desktop\Java-master\java-combinations\src\main\java\com\hmkcode'

# Make sure the output folder exists. exist_ok=True replaces the separate
# exists() check, so there is no gap between checking and creating.
os.makedirs('benchmark_files', exist_ok=True)

# Function to process each Java file
def process_java_file(java_file_path):
    """Write an instrumented copy of one Java file into 'benchmark_files'.

    The copy keeps the base name of ``java_file_path``. It is produced by
    add_benchmark_annotation and then passed, in this order, to add_counters
    and method_invocation_annotation.

    Args:
        java_file_path: Path of the Java source file to instrument.
    """
    target_name = os.path.basename(java_file_path)
    benchmark_file_path = os.path.join('benchmark_files', target_name)

    # Step 1 creates the copy; steps 2 and 3 receive the path of that copy.
    add_benchmark_annotation(java_file_path, benchmark_file_path)
    add_counters(benchmark_file_path)
    method_invocation_annotation(benchmark_file_path)
   
# Instrument every entry of folder_path whose name ends in '.java'
for file_name in os.listdir(folder_path):
    if not file_name.endswith('.java'):
        continue  # ignore anything that is not a Java source file
    file_path = os.path.join(folder_path, file_name)
    process_java_file(file_path)
    print(f"Completed annotation for file : {file_name} ")


Completed annotation for file : ForwardBackward.java 
Completed annotation for file : Recursive.java 
Completed annotation for file : Shifting.java 


#### Loop counter

*Loops written without braces are not handled yet.*

In [16]:
def add_counters(file_path):
    """Insert ``counter++;`` as the first statement of every braced loop.

    The Java file at ``file_path`` is read, every ``for``/``while`` header
    that is directly followed by ``{`` gets ``counter++;`` inserted right
    after that brace, and the file is overwritten with the result.

    The end of a loop header is found by balancing parentheses, so headers
    that contain calls such as ``i < list.size()`` are handled. String and
    character literals inside the header are skipped while balancing. Loops
    without braces are left untouched. Comments are not recognised, so a loop
    keyword inside a comment is treated like code.

    Args:
        file_path: Path of the Java file to rewrite in place.
    """
    with open(file_path, 'r') as file:
        java_code = file.read()

    # \b keeps identifiers that merely end in "for"/"while" from matching
    loop_head = re.compile(r'\b(?:for|while)\s*\(')
    open_brace = re.compile(r'\s*\{')
    length = len(java_code)

    pieces = []
    cursor = 0  # everything before this index has already been emitted
    for head in loop_head.finditer(java_code):
        if head.start() < cursor:
            continue  # keyword lies inside a header that was already handled

        # Walk forward to the parenthesis that closes the loop header
        depth = 1
        pos = head.end()
        while depth and pos < length:
            char = java_code[pos]
            if char in ('"', "'"):
                # Skip the literal so parentheses inside it are not counted
                pos += 1
                while pos < length and java_code[pos] != char:
                    pos += 2 if java_code[pos] == '\\' else 1
            elif char == '(':
                depth += 1
            elif char == ')':
                depth -= 1
            pos += 1
        if depth:
            continue  # header never closes; leave the text as it is

        brace = open_brace.match(java_code, pos)
        if brace is None:
            continue  # braceless loop or the tail of a do-while

        pieces.append(java_code[cursor:brace.end()])
        pieces.append('\n    counter++;')
        cursor = brace.end()
    pieces.append(java_code[cursor:])

    with open(file_path, 'w') as file:
        file.write(''.join(pieces))

#### @Benchmark Annotation

In [17]:
def add_benchmark_annotation(file_path, benchmark_file_path):
    """Copy a Java file, placing ``@Benchmark`` above each method declaration.

    Method declarations are found with a regular-expression heuristic.
    Matches whose "name" is a control keyword (``else if (...) {`` and
    similar) or whose "type" is ``new`` (anonymous classes) are skipped,
    because annotating them would produce invalid Java. Anything else shaped
    like ``Type name(...) {`` is annotated; constructors also fit that shape.

    The annotation is written on its own line with the same indentation as
    the declaration when the declaration starts its line.

    Args:
        file_path: Path of the Java source file to read.
        benchmark_file_path: Path the annotated copy is written to.
    """
    pattern = r'(?<!\/\/)(?<!\/\*)\b(?:public\s+|private\s+|protected\s+|static\s+|final\s+|native\s+|synchronized\s+|abstract\s+|transient\s+)*(?P<type>[\$_\w<>\[\]]*)\s+(?P<name>\w+)\s*\([^\)]*\)?\s*\{'
    # Keywords that can sit where the pattern expects a method name
    control_keywords = {'if', 'for', 'while', 'switch', 'catch', 'synchronized', 'try'}

    with open(file_path, 'r') as file:
        java_code = file.read()

    pieces = []
    previous_end_index = 0

    for match in re.finditer(pattern, java_code):
        if match.group('name') in control_keywords or match.group('type') == 'new':
            continue  # control statement or anonymous class, not a method

        start_index = match.start()

        # Reuse the declaration's indentation for the line that follows the
        # annotation, but only when nothing else precedes it on its line.
        line_start = java_code.rfind('\n', 0, start_index) + 1
        leading = java_code[line_start:start_index]
        indent = leading if not leading.strip() else ''

        pieces.append(java_code[previous_end_index:start_index])
        pieces.append('@Benchmark\n' + indent)
        pieces.append(match.group())
        previous_end_index = match.end()

    pieces.append(java_code[previous_end_index:])
    with open(benchmark_file_path, 'w') as file:
        file.write(''.join(pieces))

#### Count method invocation values

In [51]:
def method_invocation_annotation(file_path):
    """Insert ``method_counter++;`` at the top of each method body.

    The Java file at ``file_path`` is read, every text span matched by the
    method heuristic gets ``method_counter++;`` inserted after its first
    ``{``, and the file is overwritten with the result.

    Only the first brace of a match is rewritten, so blocks nested inside the
    method do not receive a counter of their own. Matches whose "name" is a
    control keyword (``else if (...) {``) or whose "type" is ``new``
    (anonymous classes) are left unchanged because they are not methods.

    Args:
        file_path: Path of the Java file to rewrite in place.
    """
    method_pattern = r'(?P<method>(?<!\/\/)(?<!\/\*)\b(?:public\s+|private\s+|protected\s+|static\s+|final\s+|native\s+|synchronized\s+|abstract\s+|transient\s+)*(?P<type>[\$_\w<>\[\]]*)\s+(?P<name>\w+)\s*\([^\)]*\)\s*\{[^\}]*?\})'
    # Keywords that can sit where the pattern expects a method name
    control_keywords = {'if', 'for', 'while', 'switch', 'catch', 'synchronized', 'try'}

    with open(file_path, 'r') as file:
        java_code = file.read()

    def add_counter(match):
        method = match.group()
        if match.group('name') in control_keywords or match.group('type') == 'new':
            return method
        # count=1: the first '{' is the one that opens the method body
        return method.replace('{', '{\n    method_counter++;\n', 1)

    # Replace each method with additional code added inside
    modified_java_code = re.sub(method_pattern, add_counter, java_code)
    with open(file_path, 'w') as file:
        file.write(modified_java_code)

# Getting Codeforces submissions

### Generate URL

In [149]:
import requests
import hashlib
import time
import random
def get_url(from_val, contest_id=1928, count=1000, key=None, secret=None):
    """Build a signed Codeforces API URL for the ``contest.status`` method.

    The signature is the 6-digit random prefix followed by the SHA-512 hex
    digest of ``<rand>/<method>?<params>#<secret>``. The query parameters are
    written in the same order in the hashed string and in the URL.

    Args:
        from_val: Value of the ``from`` query parameter (index of the first
            submission to return).
        contest_id: Value of the ``contestId`` query parameter.
        count: Value of the ``count`` query parameter (page size).
        key: API key. Defaults to the ``CODEFORCES_API_KEY`` environment
            variable, then to the key embedded below.
        secret: API secret. Defaults to the ``CODEFORCES_API_SECRET``
            environment variable, then to the secret embedded below.

    Returns:
        The request URL, including the ``time`` and ``apiSig`` parameters.
    """
    import os  # local import so this cell does not depend on an earlier one

    # NOTE(review): these credentials are committed to the source. They
    # should be revoked and supplied through the environment variables; the
    # literals remain only so existing calls keep working.
    if key is None:
        key = os.environ.get("CODEFORCES_API_KEY", "baa5c566fa5bdeb92494876e5bcac06b6798d8fe")
    if secret is None:
        secret = os.environ.get("CODEFORCES_API_SECRET", "0d686bfc8a3c654a569b93e3d1d12803d25264f1")

    current_time = int(time.time())
    random_number = random.randint(100000, 999999)
    method_fetched = "contest.status"

    # Built once so the hashed text and the URL cannot drift apart
    query = f"{method_fetched}?apiKey={key}&contestId={contest_id}&count={count}&from={from_val}&time={current_time}"
    string_to_hash = f"{random_number}/{query}#{secret}"
    sha512_hash = hashlib.sha512(string_to_hash.encode()).hexdigest()
    return f"https://codeforces.com/api/{query}&apiSig={random_number}{sha512_hash}"

### Next step is to make a crawler to incrementally fetch data from Codeforces

In [150]:
import csv
import os.path
from os import path

# Function to check if file exists and create it if it doesn't
def create_csv_file(file_path):
    """Create the submissions CSV with its header row if the path is free.

    Nothing is written when ``file_path`` already exists.

    Args:
        file_path: Location of the CSV file to create.
    """
    if path.exists(file_path):
        return

    header = [
        'id', 'contest_id', 'author', 'creation_time_seconds',
        'relative_time_seconds', 'problem_name', 'problem_type',
        'programming_language', 'verdict', 'test_set', 'passed_test_count',
        'time_consumed_millis', 'memory_consumed_bytes',
    ]
    # newline='' lets the csv module control the line endings itself
    with open(file_path, 'w', newline='') as handle:
        csv.writer(handle).writerow(header)

In [151]:
def append_to_csv(data, file_path):
    """Append one row to the submissions CSV.

    The header row is written first when ``file_path`` is not an existing
    file at the time of the call.

    Args:
        data: Mapping from column name to value for the row.
        file_path: Location of the CSV file to append to.
    """
    columns = [
        'id', 'contest_id', 'author', 'creation_time_seconds',
        'relative_time_seconds', 'problem_name', 'problem_type',
        'programming_language', 'verdict', 'test_set', 'passed_test_count',
        'time_consumed_millis', 'memory_consumed_bytes',
    ]

    # Must be decided before open(), because append mode creates the file
    needs_header = not os.path.isfile(file_path)

    with open(file_path, 'a', newline='') as handle:
        row_writer = csv.DictWriter(handle, fieldnames=columns)
        if needs_header:
            row_writer.writeheader()
        row_writer.writerow(data)

def make_request_and_append_to_csv(from_val, file_path):
    """Fetch one page of submissions and append each of them to the CSV.

    The URL comes from get_url(from_val). When the response status is 200,
    every entry of the JSON ``result`` list is flattened into one CSV row and
    handed to append_to_csv. Any other status is reported with print and
    nothing is written.

    Args:
        from_val: Start index passed to get_url for this page.
        file_path: CSV file the rows are appended to.
    """
    req_url = get_url(from_val)
    # Without a timeout a stalled connection would block the crawl forever
    response = requests.get(req_url, timeout=30)
    if response.status_code != 200:
        print("Request failed with status code:", response.status_code)
        return

    data = response.json()
    for item in data['result']:
        csv_data = {
            'id': item['id'],
            'contest_id': item['contestId'],
            # Only the first listed member of the author party is recorded
            'author': item['author']['members'][0]['handle'],
            'creation_time_seconds': item['creationTimeSeconds'],
            'relative_time_seconds': item['relativeTimeSeconds'],
            'problem_name': item['problem']['name'],
            'problem_type': item['problem']['type'],
            'programming_language': item['programmingLanguage'],
            # The API may omit the verdict of a submission; .get() stores an
            # empty cell instead of aborting the whole page with a KeyError.
            'verdict': item.get('verdict'),
            'test_set': item['testset'],
            'passed_test_count': item['passedTestCount'],
            'time_consumed_millis': item['timeConsumedMillis'],
            'memory_consumed_bytes': item['memoryConsumedBytes']
        }
        append_to_csv(csv_data, file_path)

In [152]:
# Number of pages to request
num_requests = 200
# Start index of the first page
from_val = 1

csv_file = 'output.csv'

create_csv_file(csv_file)

for i in range(num_requests):
    # Pass csv_file rather than repeating the literal, so the file that is
    # created above and the file that is appended to cannot diverge.
    make_request_and_append_to_csv(from_val, csv_file)
    from_val += 1000  # get_url asks for 1000 submissions per request
    time.sleep(5)  # pause between consecutive API requests
    print(f"Completed: {i} / {num_requests} , Index value : {from_val}")

Completed: 0 / 200 , Index value : 1001
Completed: 1 / 200 , Index value : 2001
Completed: 2 / 200 , Index value : 3001
Completed: 3 / 200 , Index value : 4001
Completed: 4 / 200 , Index value : 5001
Completed: 5 / 200 , Index value : 6001
Completed: 6 / 200 , Index value : 7001
Completed: 7 / 200 , Index value : 8001
Completed: 8 / 200 , Index value : 9001
Completed: 9 / 200 , Index value : 10001
Completed: 10 / 200 , Index value : 11001
Completed: 11 / 200 , Index value : 12001
Completed: 12 / 200 , Index value : 13001
Completed: 13 / 200 , Index value : 14001
Completed: 14 / 200 , Index value : 15001
Completed: 15 / 200 , Index value : 16001
Completed: 16 / 200 , Index value : 17001
Completed: 17 / 200 , Index value : 18001
Completed: 18 / 200 , Index value : 19001
Completed: 19 / 200 , Index value : 20001
Completed: 20 / 200 , Index value : 21001
Completed: 21 / 200 , Index value : 22001
Completed: 22 / 200 , Index value : 23001
Completed: 23 / 200 , Index value : 24001
Completed: 

### We can now filter and find submissions of interest

**We've fetched all problems from contestid : 1928**

Each contest contains several problems. For this contest, there are the following problems : 

- **A : Rectangle Cutting** (20,553 users attempted)
- **B : Equalize** (14,643 users attempted)
- **C : Physical education lesson** (7066 users attempted)
- **D : Lonely Mountain Dungeons** (3588 users attempted)
- **E : Modular Sequence** (1418 users attempted)
- **F : Digital Patterns** (193 users attempted)
  
Next, I filter by problem and look for users who made multiple submissions to that problem. To be considered, a submission must be written in C++ and must not have a compilation-error or wrong-answer verdict.

In [None]:
import pandas as pd

# Load the crawled submissions
df = pd.read_csv('output.csv')
# Drop submissions whose verdict is WRONG_ANSWER or COMPILATION_ERROR
df = df[~df['verdict'].isin(['WRONG_ANSWER', 'COMPILATION_ERROR'])]
# 'C++' has to be matched literally: read as a regular expression, the '+'
# characters are quantifiers, so the pattern is either rejected or matches
# every language name that merely contains a 'C'. na=False drops rows with a
# missing language instead of propagating NaN into the mask.
df = df[df['programming_language'].str.contains('C++', regex=False, na=False)]
df = df[df['problem_name'] == 'Equalize']

# Remove authors that only submitted once.
author_counts = df['author'].value_counts()
multiple_submissions_authors = author_counts[author_counts > 1].index

# Filter the DataFrame to only keep elements where the author appears more than once
filtered_df = df[df['author'].isin(multiple_submissions_authors)]

# Retrieve submission ids
id_values = filtered_df['id'].tolist()

### Now we fetch the code for all these submissions

               id  contest_id          author  creation_time_seconds  \
54      253261226        1928      KRISTY2006             1711394208   
118     253228787        1928     Rahil_gupta             1711377791   
125     253227477        1928     Rahil_gupta             1711377221   
132     253224124        1928      myWar_8707             1711375817   
134     253223003        1928      myWar_8707             1711375355   
...           ...         ...             ...                    ...   
102703  245800030        1928       shefaly77             1707644664   
102838  245799889        1928      shadow9236             1707644657   
102955  245799766        1928         _SwaRaj             1707644651   
103393  245799307        1928  PowerRanger123             1707644619   
104010  245798667        1928     User_Carrot             1707644569   

        relative_time_seconds problem_name problem_type programming_language  \
54                 2147483647     Equalize  PROGRAMMING