#### Doing this project has been amazing!

I searched near and far for relevant datasets to predict code readability of code snippets, but could find none. I even turned to fellow Kagglers for help, but it was a challenge to find labelled data for training my model.

As a result, *like a newbie*, I relied on research papers. I gathered valuable metrics that truly contribute to code readability and created a formula for a relevant readability score.

**It was like a dream come true when my results matched closely with those of PyLint.**

Hope you like the datasets and the notebook! 

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/code-snippets-insights-and-readability/data_cpp.csv
/kaggle/input/code-snippets-insights-and-readability/data_python.csv


In [2]:
# df['readability'] = 10 - (0.1*df['identifiers'] + 
#                                0.09*df['line_length'] + 
#                                0.06*df['indents'] -
#                                0.04*df['comments'] +
#                                0.06*df['cyclomatic_complexity'])

# #https://web.eecs.umich.edu/~weimerw/p/weimer-tse2010-readability-preprint.pdf

## Building a Readability Model for C++ Codes:

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline

In [4]:
df = pd.read_csv("/kaggle/input/code-snippets-insights-and-readability/data_cpp.csv")

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,Answer,num_of_lines,code_length,comments,cyclomatic_complexity,num_of_indents,loop_count,line_length,identifiers,readability
0,0,class Solution {\n public:\n vector<int> twoS...,8,347,0,1,4,2,43.375,8,4.99625
1,1,class Solution {\n public:\n ListNode* addTwo...,12,479,0,3,5,3,39.916667,2,5.7275
2,2,class Solution {\n public:\n int lengthOfLong...,9,303,0,2,3,2,33.666667,6,6.07
3,3,class Solution {\n public:\n double findMedia...,16,1046,0,4,3,4,65.375,22,1.49625
4,4,class Solution {\n public:\n string longestPa...,14,916,3,4,9,7,65.428571,14,2.051429


#### Build necessary functions:

In [6]:
def count_lines(text):
    # Count the number of lines in the text
    return str(text).count(';')

In [7]:
import re

def count_comments(code):
    # Count the number of C++ comments using a regular expression
    cpp_comments = re.findall(r'//.*|/\*[\s\S]*?\*/', code)
    return len(cpp_comments)

In [8]:
def cyclomatic_complexity(code):
    """Calculates the cyclomatic complexity of a C++ code snippet."""
    try:
        decision_points = 0
        exits = 0

        for word in re.findall(r"\b\w+\b", code):  # Extract words using a regular expression
            if word in ("if", "else", "while", "for", "switch", "case", "try", "catch"):
                decision_points += 1
            elif word in ("return", "break", "continue", "goto", "throw"):
                exits += 1

        complexity = decision_points + 1 - exits
        return complexity
    except re.error as e:
        print(f"Error processing code: {e}")
        return None

In [9]:
def count_indents(code):
    num_indents = code.count('{')
    return num_indents

In [10]:
def count_loops(code):
    loop_keywords = ['for', 'while', 'if']
    count = sum(code.lower().count(keyword) for keyword in loop_keywords)
    return count

In [11]:
def count_identifiers(code):
    """Calculates the cyclomatic complexity of a C++ code snippet."""
    try:
        count=0

        for word in re.findall(r"\b\w+\b", code):  # Extract words using a regular expression
            if word in ("int", "float", "double", "string", "char", "const", "bool", "void", "struct", "class", "namespace"):
                count +=1
        return count
    except re.error as e:
        print(f"Error processing code: {e}")
        return None

#### Build our prediction:

In [12]:
X = df[['num_of_lines', 'code_length', 'comments', 'cyclomatic_complexity', 'num_of_indents', 'loop_count', 'identifiers']]
y = df['readability']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X.values, y, test_size=0.2, random_state=42)

In [13]:
model = make_pipeline(RandomForestRegressor(n_estimators=120, random_state=0))
model.fit(X_train, y_train)

In [14]:
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

Mean Absolute Error: 0.2472072290225429


In [15]:
user_input = """ListNode* deleteDuplicates(ListNode* head) {
    if(!head) return head;
    ListNode *t = head, *p = head->next;
    int pre = head->val;
    while(p) {
        if(pre != p->val) {
            t->next = p;
            pre = p->val;
            t = t->next;
        }
        p = p->next;
    }
    t->next = NULL;
    return head;
}"""

lines = len(user_input.split(';'))
length = len(user_input)
comments = count_comments(user_input)
loops = count_loops(user_input)
identifiers = count_identifiers(user_input)

user_score = model.predict([[lines, length, 
                             comments, cyclomatic_complexity(user_input), 
                             count_indents(user_input), loops,
                             identifiers]])
print(f'Predicted Readability Score for User Input: {user_score[0]:.2f}')

print("Details about your code:")
print(f"Number of lines:{lines}")
print(f"Number of characters:{length}")
print(f"Number of comments:{comments}")
print(f"Number of loops:{loops}")
print(f"Number of identifiers:{identifiers}")


Predicted Readability Score for User Input: 6.13
Details about your code:
Number of lines:10
Number of characters:332
Number of comments:0
Number of loops:3
Number of identifiers:1


## Building a Readability Model for Python Codes:

In [16]:
df1= pd.read_csv("/kaggle/input/code-snippets-insights-and-readability/data_python.csv")

In [17]:
import re

def count_comments(code):
    # Count the number of comments using a regular expression
    comments = re.findall(r'#.*|(\'\'\'[\s\S]*?\'\'\'|\"\"\"[\s\S]*?\"\"\`)|```[\s\S]*?```', code)
    return len(comments)

df1['comments'] = df1['python_solutions'].apply(count_comments)

In [18]:
def cyclomatic_complexity(code):
    decision_points = len(re.findall(r"(if|elif|while|for)\s+.*:", code, re.IGNORECASE))
    exits = len(re.findall(r"(return|break|continue)\b", code, re.IGNORECASE))
    complexity = decision_points + 1 - exits
    return complexity

df1['cyclomatic_complexity'] = df1['python_solutions'].apply(lambda x: cyclomatic_complexity(x))

In [19]:
import math
def count_indents(code):
    lines = code.split('\n')
    num_indents = 0
    for line in lines:
        num_indents += line.count('    ')  # Assuming each indent is represented by four spaces
    return num_indents

def calculate_rounded_ratio(row):
    return math.ceil(row['num_of_indents'] / row['num_of_lines'])


In [20]:
def count_loops(code):
    loop_keywords = ['for', 'while', 'if']
    count = sum(code.lower().count(keyword) for keyword in loop_keywords)
    return count

In [21]:
def count_identifiers(code):
    identifiers = [':', '=', '==', '<', '>', ',']
    count = sum(code.lower().count(keyword) for keyword in identifiers)
    return count


In [22]:
X = df1[['num_of_lines', 'code_length', 'comments', 'cyclomatic_complexity', 'indents', 'loop_count', 'identifiers']]
y = df1['readability']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X.values, y, test_size=0.2, random_state=42)

In [23]:
model = make_pipeline(RandomForestRegressor(n_estimators=50, random_state=0))
model.fit(X_train, y_train)

In [24]:
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

Mean Absolute Error: 0.29876820914776364


In [25]:
user_input = """class Solution:
    def twoSum(self, nums: List[int], target: int) -> List[int]:
        for i in range(len(nums)):
            for j in range(i + 1, len(nums)):
                if (i != j and nums[i] + nums[j] == target):
                    return [i, j]
        return []"""

user_score = model.predict([[len(user_input.split('\n')), len(user_input), 
                             count_comments(user_input), cyclomatic_complexity(user_input), 
                             count_indents(user_input), count_loops(user_input),
                             count_identifiers(user_input)]])
print(f'Predicted Readability Score for User Input: {user_score[0]:.2f}')

Predicted Readability Score for User Input: 4.36
