In [None]:
import sys
import os
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import warnings

# NOTE(review): this silences every warning category for the whole kernel
# session, including deprecation warnings raised by the libraries imported
# above; narrow the filter if those messages matter.
warnings.filterwarnings('ignore')

In [None]:

# Pick the compute device once. `device` is always a torch.device (never a bare
# string), so every later `.to(device)` / `map_location=device` sees one type.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name()}")

In [None]:
class Code2vec(nn.Module):
    """Embedding + 1-D CNN encoder mapping a token-id sequence to a fixed-size vector.

    Token ids are embedded into 512 channels and sent through two paths that are
    summed before the final projection:

    * ``layers``: four Conv1d/BatchNorm1d/ReLU/MaxPool1d(2) stages
      (512 -> 256 -> 128 -> 64 -> 16 channels), flattened into ``Linear(4096, 2048)``.
    * ``residual_layer``: ``MaxPool1d(64)`` over the raw embeddings, flattened into
      ``Linear(32768, 2048)``.

    The hard-coded linear input sizes only line up for a sequence length of 4096
    (16 channels * 4096 / 2**4 = 4096 and 512 channels * 4096 / 64 = 32768).

    Attribute names and layer creation order are kept stable on purpose: they
    determine the state-dict keys and the order in which random initial weights
    are drawn.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_size: number of features in the returned vector.
    """

    def __init__(self, vocab_size, embedding_size):
        super().__init__()

        def conv_stage(in_channels, out_channels):
            # One down-sampling stage: same-length convolution, then halve the length.
            return (
                nn.Conv1d(in_channels, out_channels, kernel_size=3, padding=1),
                nn.BatchNorm1d(out_channels),
                nn.ReLU(),
                nn.MaxPool1d(2),
            )

        # Row 2 of the embedding table is the padding index.
        self.embed_layer = nn.Embedding(vocab_size, 512, padding_idx=2)

        # Projection applied to the sum of both 2048-wide paths.
        self.final_layer = nn.Linear(2048, embedding_size)

        channel_plan = (512, 256, 128, 64, 16)
        conv_modules = []
        for c_in, c_out in zip(channel_plan, channel_plan[1:]):
            conv_modules.extend(conv_stage(c_in, c_out))

        self.layers = nn.Sequential(
            *conv_modules,
            nn.Flatten(),
            nn.Linear(4096, 2048),
            nn.ReLU(),
        )

        self.residual_layer = nn.Sequential(
            nn.MaxPool1d(64),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(32768, 2048),
            nn.ReLU(),
        )

    def forward(self, x):
        """Encode a batch of token-id sequences.

        Args:
            x: integer tensor of shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, embedding_size)``.
        """
        # (batch, seq_len) -> (batch, seq_len, 512) -> (batch, 512, seq_len) for Conv1d.
        embedded = self.embed_layer(x).permute(0, 2, 1)

        conv_features = self.layers(embedded)
        shortcut_features = self.residual_layer(embedded)

        return self.final_layer(conv_features + shortcut_features)

In [None]:

# Load the saved model object onto the selected device and put it in eval mode.
# NOTE(review): weights_only=False unpickles arbitrary Python objects, so
# 'model.pth' must come from a trusted source. Presumably the file holds a
# whole pickled Code2vec instance, which is why the class is defined above —
# confirm against the training notebook.
model = torch.load('model.pth', weights_only=False, map_location=device)
model.eval()

# Vocabulary saved under train/; only its length is used in the visible cells.
vocab = torch.load('train/vocab.pt')

In [None]:
# Load the reference inputs (p1), the test inputs (p2) and the labels (y_test),
# in that order, moving each tensor to the selected device right after loading.
p1, p2, y_test = (
    torch.load(tensor_path).to(device)
    for tensor_path in (
        'test/x_test_ref.pt',
        'test/x_test_test.pt',
        'test/y_test.pt',
    )
)

In [None]:
# Sanity check: display the shapes of the loaded tensors and the vocabulary length.
p1.shape, p2.shape, y_test.shape, len(vocab)

In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
from tqdm.auto import tqdm

# Evaluation inputs: aliases for the tensors loaded earlier in the notebook.
reference_data, test_data, labels = p1, p2, y_test

# Cosine-similarity cut-off: a pair scoring at or above it is predicted as 1, otherwise -1.
threshold = 0.7

def improved_norm(x):
    """Aggregate a stack of embeddings into a single unit-length vector.

    Every entry along the first axis of ``x`` is L2-normalised (entries whose
    norm is zero are left unchanged), the normalised entries are averaged, and
    the average is L2-normalised again unless its norm is zero.

    The whole computation stays in torch on ``x``'s own device, so the function
    does not depend on any notebook-level global and avoids a device -> CPU ->
    device round trip on every call.

    Args:
        x: tensor whose first axis indexes the embeddings, e.g. ``(n, dim)``.

    Returns:
        Tensor of shape ``(1, *x.shape[1:])`` with the dtype and device of ``x``,
        detached from the autograd graph.
    """
    # One row per embedding; any trailing axes are flattened so the norm covers
    # the whole entry.
    rows = x.detach().reshape(x.shape[0], x.shape[1:].numel())

    row_norms = torch.linalg.vector_norm(rows, dim=1, keepdim=True)
    # Divide zero-norm rows by 1 so they pass through unchanged instead of turning into NaN.
    divisors = torch.where(row_norms > 0, row_norms, torch.ones_like(row_norms))

    centroid = (rows / divisors).mean(dim=0)
    centroid_norm = torch.linalg.vector_norm(centroid)
    if centroid_norm > 0:
        centroid = centroid / centroid_norm

    return centroid.reshape(x.shape[1:]).unsqueeze(0)

# Aggregation strategies to compare. Each one receives the embeddings of all
# reference samples of one pair and must return a single vector that can be
# compared with the test embedding.
methods = {
    # 'mean': lambda x: torch.mean(x, dim=0, keepdim=True),
    'improved_norm': improved_norm
}

# Fail fast: a length mismatch would otherwise surface only in the middle of
# (or after) the expensive inference loop.
assert len(reference_data) == len(test_data) == len(labels), (
    f"size mismatch: {len(reference_data)} reference sets, "
    f"{len(test_data)} test samples, {len(labels)} labels"
)

# The ground truth does not depend on the aggregation method, so convert it once.
labels_cpu = labels.cpu().numpy()

results = {}

for method_name, method_func in methods.items():
    similarities = []

    with torch.no_grad():
        pairs = zip(reference_data, test_data)
        for ref_batch, test_sample in tqdm(pairs, total=len(reference_data), desc=f'{method_name}...'):
            # Embed every reference sequence of this pair and collapse them into one vector.
            ref_aggregated = method_func(model(ref_batch))

            # The test side is a single sequence, so add the batch axis the model expects.
            test_embed = model(test_sample.unsqueeze(0))

            # Cosine similarity between the aggregated reference and the test embedding.
            similarities.append(F.cosine_similarity(ref_aggregated, test_embed).item())

    # At or above the threshold -> predicted 1, otherwise -1.
    predictions = [1 if sim >= threshold else -1 for sim in similarities]
    predictions_cpu = np.array(predictions)

    accuracy = accuracy_score(labels_cpu, predictions_cpu)
    results[method_name] = {'acc': accuracy, 'sims': similarities, 'preds': predictions}
    print(f"{method_name}: {accuracy:.4f}")


In [None]:

# Pick the aggregation method with the highest accuracy (the first one wins ties)
# and keep its similarities and predictions for the plots below.
best_method = max(results, key=lambda name: results[name]['acc'])
best_sims, best_preds = (results[best_method][field] for field in ('sims', 'preds'))
print(f"\n최고 성능: {best_method} ({results[best_method]['acc']:.4f})")

# Visualisation: 3x2 grid of diagnostic panels for the best aggregation method.
fig, axes = plt.subplots(3, 2, figsize=(15, 15))

# Panel (0, 0): accuracy of every aggregation method.
methods_list = list(results.keys())
accs_list = [results[m]['acc'] for m in methods_list]
axes[0,0].bar(methods_list, accs_list)
axes[0,0].set_title('Accuracy by Aggregation Method')
axes[0,0].set_ylabel('Accuracy')
axes[0,0].tick_params(axis='x', rotation=45)

# Panel (0, 1): similarity distribution of the best method, with the decision threshold.
axes[0,1].hist(best_sims, bins=30, alpha=0.7)
axes[0,1].axvline(threshold, color='red', linestyle='--')
axes[0,1].set_title(f'Best Method: {best_method}')
axes[0,1].set_xlabel('Cosine Similarity')
axes[0,1].set_ylabel('Count')

# Panel (1, 0): the same similarities split by ground-truth label (raw counts).
labels_cpu = labels.cpu().numpy()
same = [s for s, l in zip(best_sims, labels_cpu) if l == 1]
diff = [s for s, l in zip(best_sims, labels_cpu) if l == -1]
axes[1,0].hist(same, alpha=0.7, label='Same (1)', color='green', bins=30)
axes[1,0].hist(diff, alpha=0.7, label='Diff (-1)', color='red', bins=30)
axes[1,0].axvline(threshold, color='black', linestyle='--')
axes[1,0].legend()
axes[1,0].set_title('Distribution by Label')
axes[1,0].set_xlabel('Cosine Similarity')
axes[1,0].set_ylabel('Count')

# Panel (1, 1): confusion matrix. The class order is fixed explicitly so the
# matrix is always 2x2 and the heatmap ticks show the real labels (-1 / 1)
# instead of positional indices 0 / 1. Rows are true labels, columns predictions.
class_labels = [-1, 1]
cm = confusion_matrix(labels_cpu, best_preds, labels=class_labels)
sns.heatmap(cm, annot=True, fmt='d', ax=axes[1,1],
            xticklabels=class_labels, yticklabels=class_labels)
axes[1,1].set_title('Confusion Matrix')
axes[1,1].set_xlabel('Predicted label')
axes[1,1].set_ylabel('True label')

# Panel (2, 0): per-label distributions as densities, so classes of different
# sizes can be compared directly.
axes[2,0].hist(same, bins=30, alpha=0.7, label='Label 1 (Same Author)', color='blue', density=True)
axes[2,0].hist(diff, bins=30, alpha=0.7, label='Label -1 (Different Author)', color='orange', density=True)
axes[2,0].axvline(threshold, color='red', linestyle='--', linewidth=2, label=f'Threshold: {threshold}')
axes[2,0].set_title('Cosine Similarity Distribution by Label (Normalized)')
axes[2,0].set_xlabel('Cosine Similarity')
axes[2,0].set_ylabel('Density')
axes[2,0].legend()
axes[2,0].grid(True, alpha=0.3)

# Per-label summary statistics (reused by the text panel and later cells' output).
same_mean = np.mean(same)
same_std = np.std(same)
diff_mean = np.mean(diff)
diff_std = np.std(diff)

stats_text = f"""
Label 1 (Same Author):
  Mean: {same_mean:.4f}
  Std: {same_std:.4f}
  Count: {len(same)}

Label -1 (Different Author):
  Mean: {diff_mean:.4f}
  Std: {diff_std:.4f}
  Count: {len(diff)}

Threshold: {threshold}
Separation: {abs(same_mean - diff_mean):.4f}
"""

# Panel (2, 1): the statistics rendered as monospace text on an axis-less panel.
axes[2,1].text(0.1, 0.5, stats_text, transform=axes[2,1].transAxes,
               fontsize=12, verticalalignment='center', fontfamily='monospace')
axes[2,1].set_title('Statistics Summary')
axes[2,1].axis('off')

plt.tight_layout()
plt.show()

# Additional analysis: box plot of the similarities for each ground-truth label.
fig_box, ax_box = plt.subplots(figsize=(10, 6))
data_for_boxplot = [same, diff]
labels_for_boxplot = ['Same Author (1)', 'Different Author (-1)']

# Tick labels are set through the axes rather than boxplot(labels=...): that
# keyword is deprecated since Matplotlib 3.9 (renamed to tick_labels), while
# set_xticklabels works on both older and newer releases.
ax_box.boxplot(data_for_boxplot)
ax_box.set_xticklabels(labels_for_boxplot)
ax_box.axhline(y=threshold, color='red', linestyle='--', label=f'Threshold: {threshold}')
ax_box.set_ylabel('Cosine Similarity')
ax_box.set_title('Cosine Similarity Distribution by Label (Box Plot)')
ax_box.legend()
ax_box.grid(True, alpha=0.3)
plt.show()

# Plain-text version of the per-label statistics.
print("\n=== 레이블별 유사도 통계 ===")
print(f"Same Author (Label 1): Mean={same_mean:.4f}, Std={same_std:.4f}, Count={len(same)}")
print(f"Different Author (Label -1): Mean={diff_mean:.4f}, Std={diff_std:.4f}, Count={len(diff)}")
print(f"Mean Separation: {abs(same_mean - diff_mean):.4f}")
