<a href="https://colab.research.google.com/github/mehenika/Log_level_identification/blob/main/featureExtractionAll.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install javalang

Collecting javalang
  Downloading javalang-0.13.0-py3-none-any.whl (22 kB)
Installing collected packages: javalang
Successfully installed javalang-0.13.0


In [18]:
from javalang.tree import WhileStatement
import pandas as pd
import openpyxl
import javalang
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize



def extract_features(code, log_level, log_message):
    # Parse the code into an AST
    tree = javalang.parse.parse(code)

    # Extracting features
    var_declar = count_variable_declarations(code)
    boolean_var = count_boolean_literals(code)
    if_stm = count_if_statements(code)
    method_invo = count_method_invocations(code)
    average_log_level = calculate_average_log_level(tree, log_level, log_message)
    has_throw_statement = check_throw_statement(tree)
    num_try = count_try_clauses(tree)
    num_catch = count_catch_clauses(tree)
    num_tokens = count_tokens(code)
    text_length = len(code)
    word_similarity = calculate_word_similarity(code, log_message)
    num_loop_iterations = count_loop_iterations(tree)
    num_io_operations = count_io_operations(modified_code)

    # Return the extracted features
    return {

        "var_declar" : var_declar,
        "boolean_var" : boolean_var,
        "if_stm": if_stm,
        "method_invo": method_invo,
        "average_log_level": average_log_level,
        "has_throw_statement": has_throw_statement,
        "num_try_clauses": num_try,
        "num_catch_clauses": num_catch,
        "num_tokens": num_tokens,
        "text_length": text_length,
        "word_similarity": word_similarity,
        "num_loop_iterations": num_loop_iterations,
        "num_io_operations": num_io_operations,
        "log_level": log_level,  # Store the log level
        "log_message": log_message # Store the log message

    }

def count_variable_declarations(code):
    try:
      tree = javalang.parse.parse(code)
      var_declaration_count = 0
    except:
      return 0

    for _, node in tree:
        if isinstance(node, javalang.tree.LocalVariableDeclaration):
            var_declaration_count += 1

    return var_declaration_count

def count_boolean_literals(code):
    try:
      tree = javalang.parse.parse(code)
      boolean_literal_count = 0
    except:
      return 0

    for _, node in tree:
        if isinstance(node, javalang.tree.VariableDeclarator) and isinstance(node.initializer, javalang.tree.Literal) and isinstance(node.initializer.value, bool):
            boolean_literal_count += 1

    return boolean_literal_count

def count_if_statements(code):
    try:
      tree = javalang.parse.parse(code)
      if_statement_count = 0
    except:
      return 0

    for _, node in tree:
        if isinstance(node, javalang.tree.IfStatement):
            if_statement_count += 1

    return if_statement_count

def count_method_invocations(code):
    try:
      tree = javalang.parse.parse(code)
      method_invocation_count = 0
    except:
      return 0

    for _, node in tree:
        if isinstance(node, javalang.tree.MethodInvocation):
            method_invocation_count += 1

    return method_invocation_count


def calculate_average_log_level(tree, log_level, log_message):
    log_level_sum = 0
    log_statement_count = 0

    for _, node in tree.filter(javalang.tree.MethodInvocation):
        if node.qualifier == 'logger':
            log_statement_count += 1
            if node.member.lower() == log_level.lower():  # Compare log levels case-insensitively
                log_level_sum += log_level_to_value(log_level)

    if log_statement_count > 0:
        average_log_level = log_level_sum / log_statement_count
    else:
        average_log_level = 0.0

    return average_log_level

def log_level_to_value(log_level):
    # Map log levels to numeric values (adjust as needed)
    log_level_map = {
        'trace': 1,
        'debug': 2,
        'info': 3,
        'warn': 4,
        'error': 5
    }
    return log_level_map.get(log_level.lower(), 0)



def check_throw_statement(tree):
    for _, node in tree:
        if isinstance(node, javalang.tree.ThrowStatement):
            return 1

    return 0


def count_try_clauses(tree):
    try_count = 0

    for _, node in tree:
        if isinstance(node, javalang.tree.TryStatement):
            try_count += 1

    return try_count

def count_catch_clauses(tree):
    catch_count = 0

    for _, node in tree:
        if isinstance(node, javalang.tree.CatchClause):
            catch_count += 1

    return catch_count




def count_tokens(code):
    tokens = list(javalang.tokenizer.tokenize(code))
    return len(tokens)


def calculate_word_similarity(code, log_message):
    tokens = word_tokenize(code)

    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

    word_similarity = len(set(filtered_tokens)) / len(tokens)

    return word_similarity



    # PERFORMANCE FEATURES

def count_loop_iterations(tree):
    loop_iterations = 0

    # Iterate through the tree and identify loop constructs
    for _, node in tree:
        if isinstance(node, javalang.tree.WhileStatement):
            loop_iterations += 1
        elif isinstance(node, javalang.tree.ForStatement):
            loop_iterations += 1

    return loop_iterations


def count_io_operations(code):
    io_keywords = ["read", "write", "input", "output", "file", "stream", "socket", "database"]

    # Split the code into words and count occurrences of I/O-related keywords
    code_words = code.split()
    io_operation_count = sum(1 for word in code_words if any(keyword in word.lower() for keyword in io_keywords))

    return io_operation_count



from google.colab import drive
drive.mount('/content/drive')

# Read input Excel file

input_file = '/content/drive/MyDrive/Dataset/zookeeper.xlsx'
df_input = pd.read_excel(input_file)

# Create an empty DataFrame to store the extracted features
df_features = pd.DataFrame(columns=['CodeSnippet', 'var_declar','boolean_var','if_stm','method_invo','average_log_level', 'has_throw_statement','num_tokens', 'text_length', 'word_similarity'])

# Iterate over each code snippet and log data
for index, row in df_input.iterrows():
    # print(row.keys)
    code_snippet = row['CodeSnippet']  # Assuming 'CodeSnippet' is the column name containing the code snippets
    log_level = row['LogLevel']  # Assuming 'LogLevel' is the column name containing the log level
    log_message = row['LogMessage']  # Assuming 'LogMessage' is the column name containing the log message

    # Add the 'public class MyClass' wrapper around the code snippet
    modified_code = f"public class MyClass {{ {code_snippet} }}"

    # Extract the features from the modified code snippet and log data
    try:
      features = extract_features(modified_code, log_level, log_message)
    except:
      continue

    # Append the features to the DataFrame
    features['CodeSnippet'] = code_snippet
    df_features = df_features.append(features, ignore_index=True)

# Write the DataFrame with the extracted features to a new Excel file
output_file = 'zookeeper_ot.xlsx'
df_features.to_excel(output_file, index=False, engine='openpyxl')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  df_features = df_features.append(features, ignore_index=True)
  df_features = df_features.append(features, ignore_index=True)
  df_features = df_features.append(features, ignore_index=True)
  df_features = df_features.append(features, ignore_index=True)
  df_features = df_features.append(features, ignore_index=True)
  df_features = df_features.append(features, ignore_index=True)
  df_features = df_features.append(features, ignore_index=True)
  df_features = df_features.append(features, ignore_index=True)
  df_features = df_features.append(features, ignore_index=True)
  df_features = df_features.append(features, ignore_index=True)
  df_features = df_features.append(features, ignore_index=True)
  df_features = df_features.append(features, ignore_index=True)
  df_features = df_features.append(features, ignore_index=True)
  df_features = df_features.append(features, ignore_index=True)
  df_features = df_features.append(features, ignore_index=True)
  df_features = df_features.append(featu