<a href="https://colab.research.google.com/github/mehenika/Performance-Log-Level/blob/main/eightFeaturesExtracted.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import pandas as pd
import openpyxl

# Load necessary libraries and functions
import javalang
import nltk
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def extract_features(code):
    # Parse the code into an AST
    tree = javalang.parse.parse(code)

    # Extracting features
    average_log_level = calculate_average_log_level(tree)
    first_block_type = get_first_block_type(tree)
    second_block_type = get_second_block_type(tree)
    has_throw_statement = check_throw_statement(tree)
    num_variables = count_variables(tree)
    num_tokens = count_tokens(code)
    text_length = len(code)
    word_similarity = calculate_word_similarity(code)

    # Return the extracted features
    return {
        "average_log_level": average_log_level,
        "first_block_type": first_block_type,
        "second_block_type": second_block_type,
        "has_throw_statement": has_throw_statement,
        "num_variables": num_variables,
        "num_tokens": num_tokens,
        "text_length": text_length,
        "word_similarity": word_similarity
    }

def calculate_average_log_level(tree):
    log_level_count = 0
    log_statement_count = 0

    for _, node in tree.filter(javalang.tree.MethodInvocation):

        if node.qualifier == 'logger':
            log_statement_count += 1
            if node.member == 'trace':
              log_level_count += 1
            if node.member == 'debug':
              log_level_count += 2
            if node.member == 'info':
              log_level_count += 3
            if node.member == 'warn':
              log_level_count += 4
            if node.member == 'error':
              log_level_count += 5

    if log_statement_count > 0:
        average_log_level = log_level_count / log_statement_count
    else:
        average_log_level = 0.0

    return average_log_level


def get_first_block_type(tree):
    # Implementation to extract the type of the first block
    # in the code (e.g., if, for, while, etc.)
    for _, node in tree.filter(javalang.tree.BlockStatement):
        if isinstance(node, javalang.tree.IfStatement):
            return 'if'
        elif isinstance(node, javalang.tree.ForStatement):
            return 'for'
        elif isinstance(node, javalang.tree.WhileStatement):
            return 'while'
        # Add more conditions for other block types as needed

    return 'unknown'

def get_second_block_type(tree):
    # Implementation to extract the type of the second block
    # in the code (e.g., if, for, while, etc.)
    for _, node in tree.filter(javalang.tree.BlockStatement):
        if isinstance(node, javalang.tree.IfStatement):
            return 'if'
        elif isinstance(node, javalang.tree.ForStatement):
            return 'for'
        elif isinstance(node, javalang.tree.WhileStatement):
            return 'while'
        # Add more conditions for other block types as needed
    return 'unknown'

def check_throw_statement(tree):
    # Implementation to check if the code contains a throw statement
    for _, node in tree:
        if isinstance(node, javalang.tree.ThrowStatement):
            return True

    return False


def count_variables(tree):
    # Implementation to count the number of variables in the code
    variable_counter = Counter()

    for _, node in tree:
        if isinstance(node, javalang.tree.LocalVariableDeclaration):
            for variable in node.declarators:
                variable_counter[variable.name] += 1

    return len(variable_counter)


def count_tokens(code):
    # Implementation to count the number of tokens in the code
    tokens = list(javalang.tokenizer.tokenize(code))
    return len(tokens)


def calculate_word_similarity(code):
    # Tokenize the code into words
    tokens = word_tokenize(code)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

    # Calculate the word similarity
    word_similarity = len(set(filtered_tokens)) / len(tokens)

    return word_similarity


from google.colab import drive
drive.mount('/content/drive')

# Read input Excel file
input_file = '/content/drive/MyDrive/input.xlsx'
df_input = pd.read_excel(input_file)

# Create an empty DataFrame to store the extracted features
df_features = pd.DataFrame(columns=['CodeSnippet', 'average_log_level', 'first_block_type', 'second_block_type', 'has_throw_statement', 'num_variables', 'num_tokens', 'text_length', 'word_similarity'])

# Iterate over each code snippet
for index, row in df_input.iterrows():
    code_snippet = row['CodeSnippet']  # Assuming 'CodeSnippet' is the column name containing the code snippets

    # Add the 'public class MyClass' wrapper around the code snippet
    modified_code = f"public class MyClass {{ {code_snippet} }}"

    # Extract the features from the modified code snippet
    features = extract_features(modified_code)

    # Append the features to the DataFrame
    features['CodeSnippet'] = code_snippet
    df_features = df_features.append(features, ignore_index=True)


    # Write the DataFrame with the extracted features to a new Excel file
output_file = 'output.xlsx'
df_features.to_excel(output_file, index=False, engine='openpyxl')




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  df_features = df_features.append(features, ignore_index=True)
  df_features = df_features.append(features, ignore_index=True)
  df_features = df_features.append(features, ignore_index=True)
  df_features = df_features.append(features, ignore_index=True)
  df_features = df_features.append(features, ignore_index=True)
  df_features = df_features.append(features, ignore_index=True)
  df_features = df_features.append(features, ignore_index=True)
  df_features = df_features.append(features, ignore_index=True)
  df_features = df_features.append(features, ignore_index=True)
  df_features = df_features.append(features, ignore_index=True)
  df_features = df_features.append(features, ignore_index=True)
  df_features = df_features.append(features, ignore_index=True)
  df_features = df_features.append(features, ignore_index=True)
  df_features = df_features.append(features, ignore_index=True)
  df_features = df_features.append(features, ignore_index=True)
  df_features = df_features.append(featu

In [4]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
pip install javalang

