# **Data Loading and Processing**

In [1]:
# Importing Important Libraries

import pandas as pd

In [2]:
# Installing datasets library for loading big-clone-bench dataset

!pip install datasets

Collecting datasets
  Downloading datasets-2.16.0-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.16.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [3]:
# Load the Big-Clone-Bench clone-detection dataset by its dataset identifier.
# The result is a mapping from split name to split data.

from datasets import load_dataset

DATASET_NAME = "code_x_glue_cc_clone_detection_big_clone_bench"
dataset = load_dataset(DATASET_NAME)

Downloading data:   0%|          | 0.00/142M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/141M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/141M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/142M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/141M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/108M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/86.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/87.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/63.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/90.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/90.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/39.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/901028 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/415416 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/415416 [00:00<?, ? examples/s]

In [4]:
# Show which splits the loaded dataset provides

split_names = dataset.keys()
print(split_names)

dict_keys(['train', 'validation', 'test'])


In [5]:
# Copy the model training data from the dataset into a dataframe.
# Only the first 25,000 rows are kept by the following cell, so select them
# before converting: building a DataFrame from all 901,028 training examples
# is slow and memory-hungry, and the extra rows are discarded immediately.
# `min(...)` guards against a split that has fewer rows than requested.

train_split = dataset['train']
df = train_split.select(range(min(25000, len(train_split)))).to_pandas()

In [6]:
# Keep only the first 25,000 rows of the training data
df = df.iloc[:25000]

In [7]:
# Preview the first five rows of the dataframe

df.iloc[:5]

Unnamed: 0,id,id1,id2,func1,func2,label
0,0,13988825,8660836,private void setNodekeyInJsonResponse(Stri...,"public void transform(String style, String...",False
1,1,80378,18548122,public static void test(String args[]) {\n...,private static String loadUrlToString(Stri...,True
2,2,21354223,7421563,public String kodetu(String testusoila) {\...,private StringBuffer encoder(String arg) {...,True
3,3,15826299,19728871,public static void printResponseHeaders(St...,public static String getEncodedPassword(St...,False
4,4,9938081,11517213,public void load(String fileName) {\n ...,private static void copyFile(File sourceFi...,False


In [8]:
# Remove the identifier columns; only the two function bodies and the label
# are used by the model below.

column = ['id', 'id1', 'id2']

df = df.drop(columns=column)

In [None]:
# Encode the boolean clone label as an integer (False -> 0, True -> 1)

df = df.assign(label=df['label'].astype(int))

In [None]:
# Preview the dataframe after dropping the id columns and encoding the label

df.head(5)

Unnamed: 0,func1,func2,label
0,private void setNodekeyInJsonResponse(Stri...,"public void transform(String style, String...",0
1,public static void test(String args[]) {\n...,private static String loadUrlToString(Stri...,1
2,public String kodetu(String testusoila) {\...,private StringBuffer encoder(String arg) {...,1
3,public static void printResponseHeaders(St...,public static String getEncodedPassword(St...,0
4,public void load(String fileName) {\n ...,private static void copyFile(File sourceFi...,0


In [None]:
# Report the dimensions of the dataframe as (rows, columns).
# NOTE(review): the output stored with this cell shows 50000 rows, while an
# earlier cell truncates df to 25000 rows. The stored output looks stale;
# re-run the notebook from a fresh kernel to confirm.

df.shape

(50000, 3)

In [None]:
# Count missing values per column, largest count first.
# Any non-zero count means further processing is required before modelling.

null_counts = df.isna().sum()
null_counts.sort_values(ascending=False)

func1    0
func2    0
label    0
dtype: int64

In [None]:
# SVM: token-count features fed to a linear-kernel support vector classifier

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Each sample is the text of both functions of a pair, joined by one space.
code_snippets = [
    f"{first} {second}" for first, second in zip(df['func1'], df['func2'])
]
labels = df['label'].values

# Learn the vocabulary and turn every sample into a vector of token counts.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(code_snippets)

# Hold out 20% of the samples for evaluation; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.2,
    random_state=42,
)

# Fit on the training portion and predict the held-out portion.
svm_model = SVC(kernel='linear').fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

# Report held-out accuracy followed by the per-class metrics.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", report)

In [None]:
# Tokens: inspect the vocabulary learned by the vectorizer and the matrix of
# per-sample token counts.

import pandas as pd

vocab = vectorizer.get_feature_names_out()
print("Vocabulary (Features):", vocab)

# Wrap the sparse count matrix in a sparse-backed DataFrame instead of calling
# X.toarray(): a dense (samples x vocabulary) array can exhaust memory for a
# corpus of this size, while the sparse frame keeps the same shape, column
# names and printed preview.
X_df = pd.DataFrame.sparse.from_spmatrix(X, columns=vocab)

print("\nMatrix of Token Counts:")
print(X_df)

Vocabulary (Features): ['00' '000' '0000' ... 'при' 'создании' 'удается']

Matrix of Token Counts:
     00  000  0000  000000  0000000000  0000e00  001  01  03  03d  ...  zos  \
0     0    0     0       0           0        0    0   0   0    0  ...    0   
1     0    2     0       0           0        0    0   0   0    0  ...    0   
2     0    0     0       0           0        0    0   0   0    0  ...    0   
3     0    0     0       0           0        0    0   1   0    4  ...    0   
4     0    0     0       0           0        0    0   0   0    0  ...    0   
..   ..  ...   ...     ...         ...      ...  ...  ..  ..  ...  ...  ...   
117   0    0     0       0           0        0    0   0   0    0  ...    0   
118   0    0     0       0           0        0    0   0   0    0  ...    0   
119   0    0     0       0           0        0    0   0   0    0  ...    0   
120   0    0     0       0           1        0    0   0   0    0  ...    0   
121   0    0     0       0      