<a href="https://colab.research.google.com/github/mayanksingh-27/Code_Copilot/blob/main/ML_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# STEP 1: Install and Import Required Libraries

# Install pycparser for C code parsing
!pip install pycparser

# Import all necessary libraries
from pycparser import c_parser
import re
import numpy as np
import random
import pandas as pd
import joblib

from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import warnings
warnings.filterwarnings('ignore')  # To keep output clean




In [None]:
# STEP 2: Tokenization Function

def tokenize_c_code(code):
    """
    Takes a C code snippet as input and returns a list of tokens.
    """
    token_pattern = r'\w+|==|<=|>=|!=|[{}();,+*/<>=-]'
    tokens = re.findall(token_pattern, code)
    return tokens


In [None]:
# STEP 3 (Updated): Auto-generate a Large Synthetic Dataset (10,000+ autocomplete pairs)

# Templates for C code
templates = [
    "int <id> = <num> ;",
    "float <id> = <num> ;",
    "char <id> = '<char>' ;",
    "if ( <id> > <id> ) { <id> = <id> ; }",
    "while ( <id> < <num> ) { <id> ++ ; }",
    "for ( int <id> = 0 ; <id> < <num> ; <id> ++ ) { printf ( \"%d\" , <id> ) ; }",
    "do { <id> -- ; } while ( <id> > 0 ) ;",
    "int <id> ( int <id> , int <id> ) { return <id> + <id> ; }",
    "switch ( <id> ) { case 1 : break ; case 2 : break ; }",
    "int <id> [ <num> ] = { <num> , <num> , <num> , <num> , <num> } ;"
]

# Token substitution
def generate_tokens(template):
    subs = {
        '<id>': ['i', 'j', 'k', 'sum', 'count', 'temp', 'value', 'result'],
        '<num>': ['0', '1', '5', '10', '50', '100', '999'],
        '<char>': ['A', 'B', 'C', 'Z', 'X']
    }
    tokens = template.split()
    result = []
    for tok in tokens:
        if tok in subs:
            result.append(random.choice(subs[tok]))
        else:
            result.append(tok)
    return result

# Dataset generator
def generate_dataset(n=1000):   # 1000 snippets
    correct_snippets = []
    incorrect_snippets = []

    for _ in range(n):
        template = random.choice(templates)
        correct = " ".join(generate_tokens(template))
        correct_snippets.append(correct)

        # Corrupt snippet
        corrupt = correct.replace(";", "") if ";" in correct else correct + " ;"
        corrupt = corrupt.replace("{", "") if "{" in corrupt else corrupt.replace("(", "")
        incorrect_snippets.append(corrupt.strip())

    return correct_snippets, incorrect_snippets

# Create datasets
correct_data, incorrect_data = generate_dataset(1000)  # 1000 correct, 1000 incorrect

# Preview few
print(f"Generated {len(correct_data)} correct snippets.")
print(f"Generated {len(incorrect_data)} incorrect snippets.")
print("-" * 50)
print(correct_data[0])
print(incorrect_data[0])


Generated 1000 correct snippets.
Generated 1000 incorrect snippets.
--------------------------------------------------
char j = '<char>' ;
char j = '<char>'


In [None]:
# STEP 3.5: View the Generated Dataset

import pandas as pd

# Create a DataFrame for easy viewing
df = pd.DataFrame({
    "Correct_Code": correct_data,
    "Incorrect_Code": incorrect_data
})

# Display first 10 rows
df.head(100)


Unnamed: 0,Correct_Code,Incorrect_Code
0,char j = '<char>' ;,char j = '<char>'
1,for ( int sum = 0 ; count < 5 ; temp ++ ) { pr...,for ( int sum = 0 count < 5 temp ++ ) print...
2,if ( result > count ) { count = count ; },if ( result > count ) count = count }
3,if ( temp > k ) { k = sum ; },if ( temp > k ) k = sum }
4,int sum = 0 ;,int sum = 0
...,...,...
95,do { result -- ; } while ( temp > 0 ) ;,do result -- } while ( temp > 0 )
96,switch ( i ) { case 1 : break ; case 2 : break...,switch ( i ) case 1 : break case 2 : break }
97,for ( int temp = 0 ; value < 50 ; count ++ ) {...,for ( int temp = 0 value < 50 count ++ ) pr...
98,do { i -- ; } while ( j > 0 ) ;,do i -- } while ( j > 0 )


In [None]:
# STEP 4: Prepare Datasets for Autocomplete and Autocorrect Tasks

# --- AUTOCOMPLETE DATASET (correct code only) ---

def create_autocomplete_dataset(snippets):
    """
    Creates input-output pairs for autocomplete task.
    Input: sequence of tokens
    Output: next token
    """
    X_seq, y_next = [], []
    for snippet in snippets:
        tokens = tokenize_c_code(snippet)
        for i in range(1, len(tokens)):
            X_seq.append(" ".join(tokens[:i]))  # Input: partial tokens
            y_next.append(tokens[i])            # Output: next token
    return X_seq, y_next

# Create autocomplete input-output pairs
X_auto, y_auto = create_autocomplete_dataset(correct_data)

# --- AUTOCORRECT DATASET (both correct and incorrect code) ---

def create_autocorrect_dataset(correct, incorrect):
    """
    Creates input-output pairs for autocorrect task.
    Input: full code snippet
    Output: label - Correct / Incorrect
    """
    X_corr = []
    y_corr = []

    # Correct labeled samples
    for snippet in correct:
        X_corr.append(" ".join(tokenize_c_code(snippet)))
        y_corr.append("Correct")

    # Incorrect labeled samples
    for snippet in incorrect:
        X_corr.append(" ".join(tokenize_c_code(snippet)))
        y_corr.append("Incorrect")

    return X_corr, y_corr

# Create autocorrect input-output pairs
X_corr, y_corr = create_autocorrect_dataset(correct_data, incorrect_data)

# --- Dataset Dimensions ---
print(f"Autocomplete Dataset: {len(X_auto)} samples")
print(f"Autocorrect Dataset: {len(X_corr)} samples")


Autocomplete Dataset: 11519 samples
Autocorrect Dataset: 2000 samples


In [None]:
# STEP 5: Vectorization and Label Encoding

# --- AUTOCOMPLETE VECTORIZATION ---

# Use CountVectorizer to convert token sequences to vectors
vectorizer_auto = CountVectorizer(ngram_range=(1, 2))  # using unigrams + bigrams for richer features
X_auto_vec = vectorizer_auto.fit_transform(X_auto)

# Encode the next token labels (y_auto)
label_encoder_auto = LabelEncoder()
y_auto_enc = label_encoder_auto.fit_transform(y_auto)


# --- AUTOCORRECT VECTORIZATION ---

# Similarly, vectorize full snippets for autocorrect
vectorizer_corr = CountVectorizer(ngram_range=(1, 2))
X_corr_vec = vectorizer_corr.fit_transform(X_corr)

# Encode the labels (Correct/Incorrect)
label_encoder_corr = LabelEncoder()
y_corr_enc = label_encoder_corr.fit_transform(y_corr)


# --- Print Shapes to Check ---

print(f" Autocomplete input shape: {X_auto_vec.shape}")
print(f" Autocomplete output shape: {y_auto_enc.shape}")
print("-" * 50)
print(f" Autocorrect input shape: {X_corr_vec.shape}")
print(f" Autocorrect output shape: {y_corr_enc.shape}")


 Autocomplete input shape: (11519, 202)
 Autocomplete output shape: (11519,)
--------------------------------------------------
 Autocorrect input shape: (2000, 202)
 Autocorrect output shape: (2000,)


In [None]:
# STEP 6: Train the SVM Models (Autocomplete and Autocorrect)

from sklearn import svm
from sklearn.model_selection import train_test_split

# --- AUTOCOMPLETE MODEL ---

# Train-test split for autocomplete
X_train_auto, X_test_auto, y_train_auto, y_test_auto = train_test_split(
    X_auto_vec, y_auto_enc, test_size=0.25, random_state=42
)

# Define SVM model (RBF kernel)
model_auto = svm.SVC(kernel='rbf', probability=True)
model_auto.fit(X_train_auto, y_train_auto)

print("Autocomplete SVM model trained successfully!")


# --- AUTOCORRECT MODEL ---

# Train-test split for autocorrect
X_train_corr, X_test_corr, y_train_corr, y_test_corr = train_test_split(
    X_corr_vec, y_corr_enc, test_size=0.25, random_state=42
)

# Define SVM model (RBF kernel)
model_corr = svm.SVC(kernel='rbf')
model_corr.fit(X_train_corr, y_train_corr)

print("Autocorrect SVM model trained successfully!")


Autocomplete SVM model trained successfully!
Autocorrect SVM model trained successfully!


In [None]:
# STEP 7: Evaluate the Models

from sklearn.metrics import accuracy_score, classification_report

# --- AUTOCOMPLETE MODEL EVALUATION ---

# Predict on test set
y_pred_auto = model_auto.predict(X_test_auto)

# Evaluate
print("Autocomplete Model Evaluation")
print(f"Accuracy: {accuracy_score(y_test_auto, y_pred_auto):.4f}")
print("-" * 40)

# Solve the labels mismatch problem
from numpy import unique

labels_used_auto = unique(y_test_auto)
target_names_auto = label_encoder_auto.inverse_transform(labels_used_auto)

print(classification_report(
    y_test_auto, y_pred_auto,
    labels=labels_used_auto,
    target_names=target_names_auto
))

print("=" * 80)

# --- AUTOCORRECT MODEL EVALUATION ---

# Predict on test set
y_pred_corr = model_corr.predict(X_test_corr)

# Evaluate
print("Autocorrect Model Evaluation")
print(f"Accuracy: {accuracy_score(y_test_corr, y_pred_corr):.4f}")
print("-" * 40)

# Similarly handle label mismatch
labels_used_corr = unique(y_test_corr)
target_names_corr = label_encoder_corr.inverse_transform(labels_used_corr)

print(classification_report(
    y_test_corr, y_pred_corr,
    labels=labels_used_corr,
    target_names=target_names_corr
))


Autocomplete Model Evaluation
Accuracy: 0.2222
----------------------------------------
              precision    recall  f1-score   support

           (       0.28      0.67      0.39       175
           )       0.11      0.07      0.08       178
           +       0.25      0.56      0.34       116
           ,       0.27      0.68      0.39       149
           -       0.33      0.56      0.41        52
           0       0.00      0.00      0.00        87
           1       0.35      0.10      0.16        70
          10       0.00      0.00      0.00        29
         100       0.00      0.00      0.00        40
           2       0.32      0.57      0.41        14
           5       0.00      0.00      0.00        37
          50       0.00      0.00      0.00        23
         999       0.00      0.00      0.00        28
           ;       0.20      0.30      0.24       359
           <       0.21      0.15      0.18        72
           =       0.20      0.46      0.28    

In [None]:
# STEP 8: Top-k Token Prediction for Autocomplete

import numpy as np

def predict_top_k_tokens(input_sequence, model, vectorizer, label_encoder, k=5):
    """
    Given a partial token sequence, predict top-k likely next tokens.
    """
    # Tokenize and vectorize input
    tokens = tokenize_c_code(input_sequence)
    input_text = " ".join(tokens)
    input_vec = vectorizer.transform([input_text])

    # Get probability distribution over all classes
    probas = model.predict_proba(input_vec)[0]

    # Get top-k token indices
    top_k_indices = np.argsort(probas)[-k:][::-1]

    # Decode labels
    top_k_tokens = label_encoder.inverse_transform(top_k_indices)
    top_k_probs = probas[top_k_indices]

    # Prepare result as list of (token, probability)
    results = [(token, round(float(prob), 4)) for token, prob in zip(top_k_tokens, top_k_probs)]

    return results

# Example usage
partial_code = "for ( int i ="
top_k_predictions = predict_top_k_tokens(
    partial_code,
    model_auto,
    vectorizer_auto,
    label_encoder_auto,
    k=5
)

# Display top-k predictions
print(f"Top-5 token predictions for: '{partial_code}'")
for idx, (token, score) in enumerate(top_k_predictions, 1):
    print(f"{idx}. Token: '{token}', Confidence: {score}")


Top-5 token predictions for: 'for ( int i ='
1. Token: ';', Confidence: 0.3032
2. Token: '0', Confidence: 0.1513
3. Token: '=', Confidence: 0.0895
4. Token: 'i', Confidence: 0.042
5. Token: 'j', Confidence: 0.0408


In [None]:
# STEP 9: Final Functions - Autocomplete + Autocorrect Unified

# --- Unified function for Autocomplete Top-k Prediction ---

def autocomplete_top_k(input_code, model, vectorizer, label_encoder, k=5):
    """
    Given partial C code, suggest top-k next tokens.
    """
    tokens = tokenize_c_code(input_code)
    if not tokens:
        return []

    input_text = " ".join(tokens)
    input_vec = vectorizer.transform([input_text])

    probas = model.predict_proba(input_vec)[0]
    top_k_indices = np.argsort(probas)[-k:][::-1]
    top_k_tokens = label_encoder.inverse_transform(top_k_indices)
    top_k_probs = probas[top_k_indices]

    results = [(token, round(float(prob), 4)) for token, prob in zip(top_k_tokens, top_k_probs)]
    return results


# --- Unified function for Autocorrect Classification ---

def autocorrect_classification(input_code, model, vectorizer, label_encoder):
    """
    Given a full C code snippet, predict if it's Correct or Incorrect.
    """
    tokens = tokenize_c_code(input_code)
    if not tokens:
        return "Unknown"

    input_text = " ".join(tokens)
    input_vec = vectorizer.transform([input_text])

    pred = model.predict(input_vec)[0]
    label = label_encoder.inverse_transform([pred])[0]

    return label


# --- Final Master Function ---

def predict_code_suggestion(input_code,
                             model_auto, vectorizer_auto, label_encoder_auto,
                             model_corr, vectorizer_corr, label_encoder_corr,
                             k=5):
    """
    Master function: Takes input code and outputs:
    - Top-k autocomplete suggestions
    - Autocorrect classification
    """
    print("=" * 80)
    print(f"Input Code: {input_code}")
    print("-" * 80)

    # Autocomplete Top-k Suggestions
    print(f"Top-{k} Autocomplete Suggestions:")
    top_k = autocomplete_top_k(input_code, model_auto, vectorizer_auto, label_encoder_auto, k)
    for idx, (token, score) in enumerate(top_k, 1):
        print(f"{idx}. Token: '{token}' (Confidence: {score})")

    print("-" * 80)

    # Autocorrect Classification
    correctness = autocorrect_classification(input_code, model_corr, vectorizer_corr, label_encoder_corr)
    print(f"Code Classification: {correctness}")

    print("=" * 80)


In [None]:
# Try your system!

# Example 1
predict_code_suggestion(
    "for ( int i =",
    model_auto, vectorizer_auto, label_encoder_auto,
    model_corr, vectorizer_corr, label_encoder_corr,
    k=5
)

# Example 2
predict_code_suggestion(
    "int main ( ) { return 0 }",   # Missing semicolon - should be Incorrect
    model_auto, vectorizer_auto, label_encoder_auto,
    model_corr, vectorizer_corr, label_encoder_corr,
    k=5
)

# Example 3
predict_code_suggestion(
    "if ( i > j ) { i = j ; }",   # Correct code
    model_auto, vectorizer_auto, label_encoder_auto,
    model_corr, vectorizer_corr, label_encoder_corr,
    k=5
)


Input Code: for ( int i =
--------------------------------------------------------------------------------
Top-5 Autocomplete Suggestions:
1. Token: ';' (Confidence: 0.3032)
2. Token: '0' (Confidence: 0.1513)
3. Token: '=' (Confidence: 0.0895)
4. Token: 'i' (Confidence: 0.042)
5. Token: 'j' (Confidence: 0.0408)
--------------------------------------------------------------------------------
Code Classification: Incorrect
Input Code: int main ( ) { return 0 }
--------------------------------------------------------------------------------
Top-5 Autocomplete Suggestions:
1. Token: '=' (Confidence: 0.1171)
2. Token: 'value' (Confidence: 0.0905)
3. Token: 'i' (Confidence: 0.0872)
4. Token: 'sum' (Confidence: 0.0837)
5. Token: 'j' (Confidence: 0.0816)
--------------------------------------------------------------------------------
Code Classification: Incorrect
Input Code: if ( i > j ) { i = j ; }
--------------------------------------------------------------------------------
Top-5 Autocom