In [1]:
import ast

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


In [3]:
# Read the full dataset CSV into a DataFrame.
# NOTE(review): the path is absolute and environment-specific; confirm it
# exists wherever this notebook is re-run.
file_path = '/work/dataset_full_.csv'
data = pd.read_csv(filepath_or_buffer=file_path)


In [5]:
# Convert stringified lists in 'python_tokens' and 'java_tokens' to real lists.
#
# SECURITY: the previous version called eval() on every cell of the CSV, which
# executes arbitrary Python found in the file. ast.literal_eval only accepts
# Python literals (lists, numbers, strings, ...), so a malicious or corrupted
# cell raises instead of running code.
def _parse_token_list(raw):
    """Turn one stringified token list into a Python list.

    Values that are already lists are returned unchanged so this cell can be
    re-run safely after the columns have been converted. Anything that is not
    a valid Python literal raises ValueError or SyntaxError from
    ast.literal_eval.
    """
    if isinstance(raw, list):
        return raw
    return ast.literal_eval(raw)


data['python_tokens'] = data['python_tokens'].apply(_parse_token_list)
data['java_tokens'] = data['java_tokens'].apply(_parse_token_list)


In [7]:
# Every token sequence is forced to exactly this many entries so that the
# rows can be stacked into a rectangular 2-D array.
fixed_length = 512  # Adjust this value based on your dataset


def _fit_to_length(tokens, length):
    """Return `tokens` as an array whose first axis has exactly `length` entries.

    Inputs longer than `length` are cut off after `length` entries; shorter
    inputs are right-padded with zeros (np.pad's default constant value).
    """
    clipped = np.asarray(tokens)[:length]
    return np.pad(clipped, (0, length - len(clipped)), mode='constant')


# Apply the same clip-or-pad step to both languages' token columns.
java_tokens_padded = np.array(
    [_fit_to_length(seq, fixed_length) for seq in data['java_tokens']]
)
python_tokens_padded = np.array(
    [_fit_to_length(seq, fixed_length) for seq in data['python_tokens']]
)


In [9]:
# Single feature per row: the Euclidean (L2) norm of the element-wise
# difference between the padded Java and Python token sequences.
# keepdims=True keeps the reduced axis, so X is a column with one entry per row.
X = np.linalg.norm(
    np.subtract(java_tokens_padded, python_tokens_padded),
    axis=1,
    keepdims=True,
)

# Target labels are read straight from the 'Is_Equal' column (expected 0 or 1).
y = data['Is_Equal'].values


In [11]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# partition reproducible from run to run.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [13]:
# Number of neighbours consulted for each prediction.
k = 3

# k-nearest-neighbours classifier using Euclidean distance, with every
# neighbour's vote weighted equally.
knn = KNeighborsClassifier(
    weights='uniform',
    metric='euclidean',
    n_neighbors=k,
)

# Fit on the training split. Left as the cell's final expression so the
# notebook still echoes the fitted estimator.
knn.fit(X_train, y_train)


In [15]:
# Predict labels for the training split first, then the held-out test split.
y_train_pred, y_test_pred = (
    knn.predict(features) for features in (X_train, X_test)
)


In [17]:
# Fraction of correctly predicted labels on each split (train first, then test).
train_accuracy, test_accuracy = (
    accuracy_score(truth, guess)
    for truth, guess in ((y_train, y_train_pred), (y_test, y_test_pred))
)

# Report both scores as percentages with two decimals, one per line.
print(
    f"Training Accuracy: {train_accuracy * 100:.2f}%",
    f"Testing Accuracy: {test_accuracy * 100:.2f}%",
    sep="\n",
)


Training Accuracy: 74.28%
Testing Accuracy: 64.58%


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=2fe094d0-9fb3-4318-be9c-48cc4ca88a2c' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>