### Finding the main method of a java file and adding @Benchmark annotation

This program iterates over every `.java` file in the directory and adds an `@Benchmark` annotation above each method.

In [39]:
import os
import re

# Directory holding the Java sources that will be instrumented
folder_path = r'C:\Users\Mettle\Desktop\Java-master\java-combinations\src\main\java\com\hmkcode'

# Instrumented copies are written to 'benchmark_files' (relative to the
# current working directory); create it on first use.
if not os.path.exists('benchmark_files'):
    os.makedirs('benchmark_files')

# Function to process each Java file
def process_java_file(java_file_path):
    """Write an instrumented copy of one Java file into 'benchmark_files'.

    The copy keeps the original file name. It is created by
    add_benchmark_annotation and then rewritten in place by add_counters
    and method_invocation_annotation, in that order.
    """
    target_name = os.path.basename(java_file_path)
    benchmark_file_path = os.path.join('benchmark_files', target_name)

    # The first step creates the copy; the later steps edit that copy.
    add_benchmark_annotation(java_file_path, benchmark_file_path)
    for rewrite_in_place in (add_counters, method_invocation_annotation):
        rewrite_in_place(benchmark_file_path)
   
# Instrument every .java file located directly inside folder_path
java_file_names = [name for name in os.listdir(folder_path) if name.endswith('.java')]
for file_name in java_file_names:
    file_path = os.path.join(folder_path, file_name)
    process_java_file(file_path)
    print(f"Completed annotation for file : {file_name} ")


Completed annotation for file : ForwardBackward.java 
Completed annotation for file : Recursive.java 
Completed annotation for file : Shifting.java 


#### Loop counter

*Loops written without braces are currently not handled.*

In [16]:
def add_counters(file_path):
    """Insert ``counter++;`` as the first statement of every braced loop.

    The file at ``file_path`` is read, every ``for (...) {`` and
    ``while (...) {`` header gets ``counter++;`` appended right after its
    opening brace, and the file is overwritten with the result.

    The loop header is located by balancing parentheses, so headers that
    contain calls (``i < list.size()``) are recognised. Loops without
    braces and ``do { ... } while (...);`` tails are left untouched.

    NOTE(review): nothing here declares ``counter`` in the Java source;
    presumably it is declared elsewhere - confirm before compiling.
    Parentheses inside string/char literals or comments are not treated
    specially.
    """
    with open(file_path, 'r') as file:
        java_code = file.read()

    # \b prevents matching identifiers that merely end in "for"/"while".
    loop_keyword = re.compile(r'\b(?:for|while)\s*\(')
    open_brace = re.compile(r'\s*\{')

    pieces = []
    cursor = 0
    for match in loop_keyword.finditer(java_code):
        if match.start() < cursor:
            # Keyword sits inside a header that was already consumed.
            continue

        # Walk forward to the parenthesis that closes the loop header.
        depth = 1
        pos = match.end()
        while pos < len(java_code) and depth:
            char = java_code[pos]
            if char == '(':
                depth += 1
            elif char == ')':
                depth -= 1
            pos += 1
        if depth:
            continue  # Unbalanced header: leave the text as it is.

        brace = open_brace.match(java_code, pos)
        if brace is None:
            continue  # Brace-less loop or do-while tail.

        pieces.append(java_code[cursor:brace.end()])
        pieces.append('\n    counter++;')
        cursor = brace.end()

    pieces.append(java_code[cursor:])
    modified_java_code = ''.join(pieces)

    with open(file_path, 'w') as file:
        file.write(modified_java_code)

#### @Benchmark Annotation

In [17]:
def add_benchmark_annotation(file_path, benchmark_file_path):
    """Copy a Java file, inserting ``@Benchmark`` above each method declaration.

    Args:
        file_path: Java source file to read.
        benchmark_file_path: Destination of the annotated copy (overwritten).

    Declarations are found with a heuristic regular expression of the form
    ``[modifiers] type name(params) {``. Matches that are not methods are
    skipped: control-flow statements such as ``else if (...) {``, anonymous
    classes (``new Foo() {``) and constructors, which ``@Benchmark`` cannot
    be applied to. When a declaration starts its line, the line's
    indentation is repeated after the annotation so the declaration keeps
    its column.

    NOTE(review): the two look-behinds only reject a comment marker placed
    immediately before the match, so commented-out declarations can still
    be annotated.
    """
    modifiers = ('public', 'private', 'protected', 'static', 'final', 'native',
                 'synchronized', 'abstract', 'transient')
    pattern = (
        r'(?<!\/\/)(?<!\/\*)\b'
        r'(?:public\s+|private\s+|protected\s+|static\s+|final\s+|native\s+'
        r'|synchronized\s+|abstract\s+|transient\s+)*'
        r'(?P<type>[\$_\w<>\[\]]*)\s+(?P<name>\w+)\s*\([^\)]*\)?\s*\{'
    )
    # Words that make ordinary statements look like "type name(...) {".
    non_method_names = {'if', 'for', 'while', 'switch', 'catch', 'synchronized'}
    non_method_types = {'else', 'new', 'return', 'throw'}

    with open(file_path, 'r') as file:
        java_code = file.read()

    pieces = []
    previous_end_index = 0

    for match in re.finditer(pattern, java_code):
        type_token = match.group('type')
        if match.group('name') in non_method_names or type_token in non_method_types:
            continue
        # No return type (empty, or only a modifier was left to play that
        # role) means the match is a constructor, not a method.
        if not type_token or type_token in modifiers:
            continue

        start_index = match.start()

        # Reuse the line's indentation when the declaration starts the line.
        line_start = java_code.rfind('\n', 0, start_index) + 1
        indent = java_code[line_start:start_index]
        if indent.strip():
            indent = ''

        pieces.append(java_code[previous_end_index:start_index])
        pieces.append('@Benchmark\n' + indent)
        pieces.append(match.group())
        previous_end_index = match.end()

    pieces.append(java_code[previous_end_index:])

    with open(benchmark_file_path, 'w') as file:
        file.write(''.join(pieces))

#### Count method invocation values

In [51]:
def method_invocation_annotation(file_path):
    """Insert ``method_counter++;`` at the top of each method body, in place.

    The file at ``file_path`` is read, every text span that looks like
    ``[modifiers] type name(params) { ... }`` (up to the first ``}``) is
    rewritten so that its opening brace is followed by
    ``method_counter++;``, and the file is overwritten.

    Only the first ``{`` of a match - the method's own opening brace - is
    rewritten, so nested blocks and array initialisers inside the body are
    left alone. Matches that are really control-flow statements
    (``else if (...) {``) or anonymous classes (``new Foo() {``) are
    returned unchanged.

    NOTE(review): nothing here declares ``method_counter`` in the Java
    source; presumably it is declared elsewhere - confirm.
    """
    method_pattern = (
        r'(?P<method>(?<!\/\/)(?<!\/\*)\b'
        r'(?:public\s+|private\s+|protected\s+|static\s+|final\s+|native\s+'
        r'|synchronized\s+|abstract\s+|transient\s+)*'
        r'(?P<type>[\$_\w<>\[\]]*)\s+(?P<name>\w+)\s*\([^\)]*\)\s*\{[^\}]*?\})'
    )
    # Words that make ordinary statements look like "type name(...) {".
    non_method_names = {'if', 'for', 'while', 'switch', 'catch', 'synchronized'}
    non_method_types = {'else', 'new', 'return', 'throw'}

    with open(file_path, 'r') as file:
        java_code = file.read()

    def add_counter(match):
        if match.group('name') in non_method_names or match.group('type') in non_method_types:
            return match.group()
        # count=1: touch the method's opening brace only.
        return match.group().replace('{', '{\n    method_counter++;\n', 1)

    modified_java_code = re.sub(method_pattern, add_counter, java_code)
    with open(file_path, 'w') as file:
        file.write(modified_java_code)

# Getting Codeforces submissions

### Generate URL

In [3]:
import requests
import hashlib
import time
import random
from bs4 import BeautifulSoup
def get_url(from_val, contest_id=1928, count=1000):
    """Build a signed Codeforces ``contest.status`` API URL.

    Args:
        from_val: Value of the ``from`` query parameter (index of the first
            submission to return).
        contest_id: Contest whose submissions are requested. Defaults to
            1928, the contest used throughout this notebook.
        count: Value of the ``count`` query parameter (page size).

    Returns:
        The request URL, including ``apiSig``: a random six-digit prefix
        followed by the SHA-512 hex digest of
        ``<prefix>/<method>?<query>#<secret>``.
    """
    # NOTE(review): API credentials are embedded in the notebook. Consider
    # loading them from the environment and rotating this key pair.
    key = "baa5c566fa5bdeb92494876e5bcac06b6798d8fe"
    secret = "0d686bfc8a3c654a569b93e3d1d12803d25264f1"
    current_time = int(time.time())
    random_number = random.randint(100000, 999999)
    method_fetched = "contest.status"

    # Built once so the signed string and the requested URL cannot drift
    # apart. The parameters appear in alphabetical order by name.
    query = (
        f"apiKey={key}&contestId={contest_id}&count={count}"
        f"&from={from_val}&time={current_time}"
    )
    string_to_hash = f"{random_number}/{method_fetched}?{query}#{secret}"
    sha512_hash = hashlib.sha512(string_to_hash.encode()).hexdigest()
    return f"https://codeforces.com/api/{method_fetched}?{query}&apiSig={random_number}{sha512_hash}"

### Next step is to make a crawler to incrementally fetch data from Codeforces

In [4]:
import csv
import os.path
from os import path

# Function to check if file exists and create it if it doesn't
def create_csv_file(file_path):
    """Create ``file_path`` holding only the CSV header row, unless it already exists."""
    if path.exists(file_path):
        # Never overwrite data collected by an earlier run.
        return

    header = [
        'id', 'contest_id', 'author', 'creation_time_seconds',
        'relative_time_seconds', 'problem_name', 'problem_type',
        'programming_language', 'verdict', 'test_set', 'passed_test_count',
        'time_consumed_millis', 'memory_consumed_bytes',
    ]
    with open(file_path, 'w', newline='') as handle:
        csv.writer(handle).writerow(header)

In [151]:
def append_to_csv(data, file_path):
    """Append one row (a dict keyed by column name) to the CSV at ``file_path``.

    The header row is written first when the file does not exist yet.
    """
    fieldnames = [
        'id', 'contest_id', 'author', 'creation_time_seconds',
        'relative_time_seconds', 'problem_name', 'problem_type',
        'programming_language', 'verdict', 'test_set', 'passed_test_count',
        'time_consumed_millis', 'memory_consumed_bytes',
    ]

    # Decide before opening: mode 'a' creates the file as a side effect.
    needs_header = not os.path.isfile(file_path)

    with open(file_path, 'a', newline='') as handle:
        row_writer = csv.DictWriter(handle, fieldnames=fieldnames)
        if needs_header:
            row_writer.writeheader()
        row_writer.writerow(data)

def make_request_and_append_to_csv(from_val, file_path):
    """Fetch one page of submissions and append each one to the CSV.

    Args:
        from_val: Passed to get_url as the index of the first submission.
        file_path: CSV file that receives one row per submission.

    A failed request (network error, timeout or non-200 status) is reported
    on stdout and the page is skipped; nothing is raised for it.
    """
    req_url = get_url(from_val)
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(req_url, timeout=30)
    except requests.RequestException as error:
        print("Request failed:", error)
        return

    if response.status_code != 200:
        print("Request failed with status code:", response.status_code)
        return

    data = response.json()
    for item in data['result']:
        csv_data = {
            'id': item['id'],
            # The API documents contestId and verdict as optional fields.
            'contest_id': item.get('contestId', ''),
            'author': item['author']['members'][0]['handle'],
            'creation_time_seconds': item['creationTimeSeconds'],
            'relative_time_seconds': item['relativeTimeSeconds'],
            'problem_name': item['problem']['name'],
            'problem_type': item['problem']['type'],
            'programming_language': item['programmingLanguage'],
            'verdict': item.get('verdict', ''),
            'test_set': item['testset'],
            'passed_test_count': item['passedTestCount'],
            'time_consumed_millis': item['timeConsumedMillis'],
            'memory_consumed_bytes': item['memoryConsumedBytes']
        }
        append_to_csv(csv_data, file_path)

In [None]:
# Number of paginated requests to issue (get_url asks for 1000 submissions per page)
num_requests = 200
# Index of the first submission to request; pagination starts at 1.
from_val = 1

csv_file = 'output.csv'

create_csv_file(csv_file)

for i in range(num_requests):
    make_request_and_append_to_csv(from_val, csv_file)
    # Report the page that was just fetched, counting from 1.
    print(f"Completed: {i + 1} / {num_requests} , Index value : {from_val}")
    from_val += 1000
    # Pause between requests to stay within the API rate limit.
    time.sleep(5)

### We can now filter and find submissions of interest

**We've fetched all problems from contestid : 1928**

Each contest contains several problems. For this contest, there are the following problems : 

- **A : Rectangle Cutting** (20,553 users attempted)
- **B : Equalize** (14,643 users attempted)
- **C : Physical education lesson** (7066 users attempted)
- **D : Lonely Mountain Dungeons** (3588 users attempted)
- **E : Modular Sequence** (1418 users attempted)
- **F : Digital Patterns** (193 users attempted)
  
Now I filter by problem and look for individuals who made multiple submissions for a given problem. To be considered, a submission must not be a wrong answer or a compilation error, and it must be written in C++.

In [16]:
import pandas as pd

# Load the raw submissions collected from the API
df = pd.read_csv('output.csv')

# Drop submissions whose verdict is a wrong answer or a compilation error
df = df[~df['verdict'].isin(['WRONG_ANSWER', 'COMPILATION_ERROR'])]

# Match 'C++' literally. Interpreted as a regular expression, '++' is either
# an invalid repeat or a possessive quantifier that matches any 'C', depending
# on the Python version. na=False drops rows with a missing language.
df = df[df['programming_language'].str.contains('C++', regex=False, na=False)]
df = df[df['problem_name'] == 'Equalize']

# Remove authors that only submitted once.
author_counts = df['author'].value_counts()
multiple_submissions_authors = author_counts[author_counts > 1].index

# Filter the DataFrame to only keep rows whose author appears more than once
filtered_df = df[df['author'].isin(multiple_submissions_authors)]

filtered_df.to_csv('filtered_output.csv', index=False)

# Retrieve submission ids
id_values = filtered_df['id'].tolist()

### Now we fetch the code for all these submissions

To fetch the submissions, we create a webscraper. The website's DDoS protection eventually kicks in so we implement several strategies : 
- Randomized delay between requests (the code below waits a random 15–20 seconds; delays of 10 seconds or less trigger the protection)
- Randomized user agent property to make it seem like multiple users are accessing from same IP.
- Selenium WebDriver rather than `requests`. This is necessary because `requests` does not execute JavaScript, which makes automated access easy to detect.
- Rotating proxy addresses

In [None]:
!pip install selenium
!pip install random_user_agent
!pip install lxml
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import os
import time
import random
from random_user_agent.user_agent import UserAgent
from random_user_agent.params import SoftwareName, OperatingSystem
from selenium.webdriver.common.proxy import Proxy

This fetches a list of usable proxy addresses, updated every ten minutes. We can execute this function every ten minutes and fetch all the new addresses.

In [34]:
def fetch_proxy_urls(driver):
    """Scrape ``ip:port`` proxy addresses from https://www.sslproxies.org/.

    Args:
        driver: Selenium WebDriver used to load the page. It is always quit
            before this function returns, so the caller must pass a fresh
            driver on every call.

    Returns:
        A list of ``"ip:port"`` strings taken from the first two cells of
        each row of the first ``<table class="table">``; an empty list when
        no such table is found.
    """
    try:
        driver.get('https://www.sslproxies.org/')
        # Parse the rendered page with BeautifulSoup.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # Release the browser even when loading or parsing fails.
        driver.quit()

    proxy_table = soup.find('table', class_='table')

    proxy_server_urls = []
    if proxy_table:
        for row in proxy_table.find_all('tr')[1:]:  # Skip the header row
            columns = row.find_all('td')
            if len(columns) < 2:
                # Rows without data cells cannot yield an address.
                continue
            ip_address = columns[0].text.strip()
            port = columns[1].text.strip()
            proxy_server_urls.append(f"{ip_address}:{port}")
    return proxy_server_urls


In [None]:
# Define the directory to store code files
directory = "code_files"
os.makedirs(directory, exist_ok=True)

# Initialize UserAgent object
software_names = [SoftwareName.EDGE.value, SoftwareName.CHROME.value, SoftwareName.CHROMIUM.value, SoftwareName.ANDROID.value, SoftwareName.FIREFOX.value, SoftwareName.OPERA.value, SoftwareName.SAFARI.value]
operating_systems = [OperatingSystem.WINDOWS.value, OperatingSystem.LINUX.value, OperatingSystem.MAC.value]
user_agent_rotator = UserAgent(software_names=software_names, operating_systems=operating_systems, limit=100)

# Proxy rotation is currently disabled; the commented-out lines show how it was wired in.
#chrome_options = Options()
#chrome_options.add_argument(f'user-agent={user_agent}')
#driver = webdriver.Chrome(options=chrome_options)
#proxy_server_urls = fetch_proxy_urls(driver)

# Number of consecutive submissions for which no code was found
count = 0
# Iterate over each submission ID
for submission_id in id_values:
    # Tracked outside the try block so the finally clause can always release it.
    driver = None
    try:
        #if count % 35 == 0:
        #    proxy_server_urls = fetch_proxy_urls(driver)

        # URL of the submission page
        submission_url = f"https://codeforces.com/contest/1928/submission/{submission_id}"

        # Set a random User-Agent for each request
        user_agent = user_agent_rotator.get_random_user_agent()

        # Set up Chrome options
        chrome_options = Options()

        # Set proxy server URL
        #PROXY = proxy_server_urls[random.randint(0, len(proxy_server_urls) - 1)]
        #print(PROXY)
        # Add user agent, incognito mode, and headless mode to Chrome options
        chrome_options.add_argument(f'user-agent={user_agent}')
        chrome_options.add_argument("--incognito")
        chrome_options.add_argument("--headless")

        # Set proxy server for Chrome WebDriver
        #chrome_options.add_argument("--proxy-server=%s" % PROXY)

        # Initialize Chrome WebDriver with options
        driver = webdriver.Chrome(options=chrome_options)

        # Navigate to the submission page
        driver.get(submission_url)

        # NOTE(review): this reads the second space-separated token of the
        # first line of the rendered HTML, not the HTTP status, so it may
        # never equal '403' - confirm the intended check.
        first_line_parts = driver.execute_script("return document.documentElement.outerHTML").split('\n')[0].split(' ')
        response_code = first_line_parts[1] if len(first_line_parts) > 1 else ''
        if response_code == '403':
            print(f"Received response code 403 for submission {submission_id}. Exiting loop.")
            break

        # Extract the HTML content of the page
        page_source = driver.page_source

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(page_source, 'lxml')

        # Extract the code from the page
        code_element = soup.find('pre', class_='prettyprint')

        if code_element:
            code = code_element.get_text()

            # Write the code to a .txt file
            file_path = os.path.join(directory, f"submission_{submission_id}.txt")
            with open(file_path, "w", encoding="utf-8") as file:
                file.write(code)
            print(f"Code found for submission {submission_id}.")
            count = 0
        else:
            print(f"No code found for submission {submission_id}.")
            # Back off for two minutes before the next attempt.
            time.sleep(120)
            count = count + 1
            # Stop after more than three consecutive misses.
            if count > 3:
                break

        # Randomized delay (must be greater than 10 seconds to keep the DDoS protection from kicking in)
        time.sleep(random.randint(15, 20))
    except Exception as e:
        print(f"An error occurred for submission {submission_id}: {e}")
    finally:
        # Runs on success, on error and on break, so no browser is left running.
        if driver is not None:
            driver.quit()

## Creating Bag of Words model

#### Pre-processing the data

In [None]:
!pip install nltk
!pip install sctokenizer

In [14]:
import os
from sctokenizer import CppTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

def preprocess_code(code):
    """Return ``code`` with a newline inserted after every semicolon.

    (The stray ``model_csv`` expression that preceded this function was
    removed: the name is not defined in this cell, so evaluating it raised
    NameError.)
    """
    return code.replace(';', ';\n')

def tokenize_cpp_file(file_path):
    """Tokenize one saved submission and return its tokens joined by spaces.

    The file is read as UTF-8, the encoding the scraper uses when it writes
    the submission files; relying on the platform default can fail on
    non-ASCII source code. The text is passed through preprocess_code
    before being tokenized with sctokenizer's CppTokenizer.
    """
    with open(file_path, encoding="utf-8") as file:
        source = file.read()

    # Tokenize preprocessed code
    tokens = CppTokenizer().tokenize(preprocess_code(source))
    # Join tokens into a single string
    return ' '.join(token.token_value for token in tokens)

def process_code_files(directory):
    """Tokenize every ``.txt`` file in ``directory`` and fit TF-IDF on the result.

    Returns:
        A ``(vectorizer, tfidf_matrix)`` pair: the fitted TfidfVectorizer
        and the matrix produced by its ``fit_transform`` over one document
        per file.
    """
    # One space-joined token string per .txt file, in os.listdir order.
    txt_paths = [
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if name.endswith(".txt")
    ]
    corpus = [tokenize_cpp_file(txt_path) for txt_path in txt_paths]

    # Apply TF-IDF processing
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)
    return vectorizer, tfidf_matrix

# Build the TF-IDF representation of every scraped submission
directory = "code_files"
vectorizer, tfidf_matrix = process_code_files(directory=directory)

In [22]:
import os
import pandas as pd
from sctokenizer import CppTokenizer

def preprocess_code(code):
    """Return ``code`` with a newline inserted after every semicolon."""
    return code.replace(';', ';\n')

def tokenize_cpp_file(file_path):
    """Tokenize one saved submission and return its tokens joined by commas.

    The file is read as UTF-8, the encoding the scraper uses when it writes
    the submission files; relying on the platform default can fail on
    non-ASCII source code. The text is passed through preprocess_code
    before being tokenized with sctokenizer's CppTokenizer.
    """
    with open(file_path, encoding="utf-8") as file:
        source = file.read()

    tokens = CppTokenizer().tokenize(preprocess_code(source))
    return ','.join(token.token_value for token in tokens)

def process_code_files(directory, output_csv):
    df_list = []

    for file_name in os.listdir(directory):
        if file_name.endswith(".txt"):
            submission_id = file_name.split('_')[-1].split('.')[0]

            file_path = os.path.join(directory, file_name)
            tokens = tokenize_cpp_file(file_path)

            df_list.append({'id': submission_id,
                            'tokens': tokens})

    df = pd.DataFrame(df_list)

    output_df = pd.read_csv(output_csv)
    output_df = output_df[['id', 'time_consumed_millis', 'memory_consumed_bytes']]

  
    df['id'] = df['id'].astype(str)
    output_df['id'] = output_df['id'].astype(str)


    df = pd.merge(df, output_df, on='id', how='left')

    # Save final_df to CSV
    final_df.to_csv('model_data.csv', index=False)

# Tokenize the scraped submissions and join them with the API measurements
directory = "code_files"
output_csv = "output.csv"
process_code_files(directory=directory, output_csv=output_csv)

## Constructing the model

We're going to make 2 models : 
1. Execution time
2. Memory consumption

#### Splitting the data

In [24]:
from sklearn.model_selection import train_test_split

# Load the merged token/measurement table
df = pd.read_csv('model_data.csv')

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

#### Apply TF-IDF

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

# One document per training row: the stored token string with runs of
# whitespace collapsed to single spaces.
corpus = [' '.join(tokens.split()) for tokens in train_df['tokens']]

# Fit the TF-IDF vocabulary on the training corpus and vectorize it
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

Applying TF-IDF to test set

In [27]:
# One document per test row, normalized the same way as the training corpus
test_corpus = [' '.join(tokens.split()) for tokens in test_df['tokens']]

# Reuse the vectorizer fitted on the training data: transform only, no
# re-fitting on the test set.
test_tfidf_matrix = vectorizer.transform(test_corpus)
