In [None]:
# %pip (unlike !pip) installs into the environment of the running kernel.
%pip install umap-learn

In [10]:
import sys
import os
import numpy as np
import yaml as yl
import matplotlib.pyplot as plt
import argparse
import glob
import pandas as pd

from typing import List, Tuple

from sklearn.datasets import load_digits
from sklearn.cluster import KMeans
from sklearn import manifold
from sklearn.preprocessing import StandardScaler

%matplotlib inline

In [None]:
def get_model(model_name: str, **kwargs):
    """Build an unfitted manifold-learning estimator from a short name.

    Parameters
    ----------
    model_name : str
        Case-insensitive identifier of the method. Supported values:
        "umap", "lle", "ltsa", "hlle", "mlle", "isomap", "mds", "se",
        "t-sne" (or "tsne").
    **kwargs
        Forwarded unchanged to the constructor of the selected estimator.

    Returns
    -------
    object
        A new, unfitted estimator instance.

    Raises
    ------
    ValueError
        If `model_name` does not match any supported method.
    """
    model_name = model_name.lower()

    # The four LLE flavours share one estimator and differ only in `method`.
    lle_methods = {
        "lle": "standard",
        "ltsa": "ltsa",
        "hlle": "hessian",
        "mlle": "modified",
    }
    if model_name in lle_methods:
        return manifold.LocallyLinearEmbedding(method=lle_methods[model_name], **kwargs)

    if model_name == "umap":
        # Imported lazily so umap-learn is only required when UMAP is requested.
        from umap import UMAP
        return UMAP(**kwargs)
    if model_name == "isomap":
        return manifold.Isomap(**kwargs)
    if model_name == "mds":
        return manifold.MDS(**kwargs)
    if model_name == "se":
        return manifold.SpectralEmbedding(**kwargs)
    if model_name in ("t-sne", "tsne"):
        return manifold.TSNE(**kwargs)

    raise ValueError(f"Invalid method {model_name}")

In [11]:
def perform_manifold(model, data: np.ndarray, targets: "np.ndarray | pd.Series | pd.DataFrame", n_components: int = 2, scale: bool = True) -> pd.DataFrame:
    """Fit a manifold model on `data` and return the embedding joined with its targets.

    Parameters
    ----------
    model
        Any object exposing `fit_transform(data)` that returns a 2-D array.
    data : np.ndarray
        Sample matrix, one row per sample.
    targets : np.ndarray, pd.Series or pd.DataFrame
        One target per sample, in the same row order as `data`. A raw array
        is stored in a column named "class"; pandas inputs keep their own
        column names.
    n_components : int
        Retained for backward compatibility. The number of "PC-i" columns is
        taken from the width of the embedding returned by the model.
    scale : bool
        When True, standardise `data` with `StandardScaler` before fitting.

    Returns
    -------
    pd.DataFrame
        Columns "PC-1" ... "PC-k" followed by the target column(s).

    Raises
    ------
    ValueError
        If the number of targets differs from the number of embedded samples.
    """
    if scale:
        data = StandardScaler().fit_transform(data)

    embedding = np.asarray(model.fit_transform(data))
    # Name the columns after what the model actually produced, not what was requested.
    component_names = [f"PC-{i + 1}" for i in range(embedding.shape[1])]
    principal_df = pd.DataFrame(data=embedding, columns=component_names)

    # pd.concat(axis=1) aligns on index labels, so targets are re-indexed
    # positionally to match the embedding's default RangeIndex.
    if isinstance(targets, (pd.DataFrame, pd.Series)):
        target_part = targets.reset_index(drop=True)
    else:
        target_part = pd.Series(np.asarray(targets).ravel(), name="class")

    if len(target_part) != len(principal_df):
        raise ValueError(
            f"Got {len(target_part)} targets for {len(principal_df)} samples"
        )

    return pd.concat([principal_df, target_part], axis=1)

In [12]:
def plot_2_components(df: pd.DataFrame, n_classes: int, title: str) -> None:
    """Scatter-plot the first two embedding components, one colour per class.

    Parameters
    ----------
    df : pd.DataFrame
        Embedding with columns 'PC-1' and 'PC-2' and the targets in the
        'class' column.
    n_classes : int
        Number of classes; samples whose 'class' equals 0 .. n_classes-1
        are drawn.
    title: str
        Title of the graph
    """
    palette = (
        "red", "green", "blue", "yellow", "pink", "black", "orange", "purple",
        "beige", "brown", "gray", "cyan", "magenta",
    )

    # Create figure
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlabel('Principal Component 1', fontsize=15)
    ax.set_ylabel('Principal Component 2', fontsize=15)
    ax.set_title(title, fontsize=14)

    # One scatter call per class. Colours wrap around when there are more
    # classes than palette entries, so no class is silently left out.
    for target in range(n_classes):
        in_class = df["class"] == target
        ax.scatter(
            df.loc[in_class, 'PC-1'],
            df.loc[in_class, 'PC-2'],
            color=palette[target % len(palette)],
            s=50,
            label=str(target),
        )

    if n_classes > 0:
        ax.legend()
    ax.grid()
    plt.show()

In [13]:
def perform_experiment_tsne(df: pd.DataFrame, title: str, n_components: int = 2, init: str = 'pca', random_state: int = 0, n_features: int = 65) -> pd.DataFrame:
    """Embed a dataset with t-SNE, plot the first two components and return the embedding.

    Parameters
    ----------
    df : pd.DataFrame
        Dataset whose feature columns are named "0", "1", ...,
        str(n_features - 1) and whose targets are in the 'class' column.
    title: str
        Title of the graph
    n_components : int
        Dimension of the embedded space, forwarded to the t-SNE model.
    init : str
        Initialisation of the embedding, forwarded to the t-SNE model.
    random_state : int
        Seed forwarded to the t-SNE model.
    n_features : int
        Number of feature columns to read from `df`.

    Returns
    -------
    pd.DataFrame
        The embedding ("PC-i" columns) together with the 'class' column.
    """
    # Feature columns are named after their position
    features = [str(i) for i in range(n_features)]
    x = df.loc[:, features].values
    # Positional index so the targets line up with the embedding rows.
    y = df.loc[:, ['class']].reset_index(drop=True)
    print(f"Shape of samples: {x.shape}")
    print(f"Shape of targets: {y.shape}")

    model = get_model("t-sne", n_components=n_components, init=init, random_state=random_state)
    principal_df = perform_manifold(model, x, y, n_components=n_components)

    # Plot it (needs at least two components).
    # assumes class labels are integers starting at 0 — TODO confirm per dataset
    if n_components >= 2:
        n_classes = int(y['class'].max()) + 1
        plot_2_components(principal_df, n_classes, title)

    return principal_df

## AnghaBestSeqsSBLP2021

In [14]:
# Load the dataset CSV (path relative to the working directory) into `df`
# and preview its last rows.
filename = "AnghaBestSeqsSBLP2021.csv"
df = pd.read_csv(filename)
df.tail()

Unnamed: 0.1,Unnamed: 0,name,class,0,1,2,3,4,5,6,...,55,56,57,58,59,60,61,62,63,64
2495,2495,extr_libipw_wx.c_libipw_translate_scan,2,1.0,52.0,0.0,0.0,0.0,0.0,0.0,...,21.0,0.0,0.0,0.0,0.0,37.0,0.0,0.0,0.0,0.0
2496,2496,extr_render.c_test_formats,2,1.0,61.0,0.0,0.0,0.0,0.0,0.0,...,51.0,0.0,15.0,2.0,0.0,22.0,0.0,0.0,0.0,0.0
2497,2497,extr_c-valprint.c_c_val_print,2,1.0,153.0,1.0,0.0,0.0,0.0,0.0,...,63.0,0.0,4.0,0.0,0.0,135.0,0.0,0.0,0.0,0.0
2498,2498,extr_sor.c_tegra_sor_edp_enable,2,1.0,63.0,0.0,0.0,0.0,0.0,0.0,...,25.0,0.0,0.0,0.0,0.0,110.0,0.0,0.0,0.0,0.0
2499,2499,extr_hns_roce_hw_v2.c_hns_roce_v2_query_qp,2,1.0,16.0,0.0,0.0,0.0,0.0,0.0,...,7.0,0.0,0.0,0.0,0.0,34.0,0.0,0.0,0.0,0.0


In [16]:
# Run the t-SNE experiment on the loaded frame and keep the returned embedding.
principal_df = perform_experiment_tsne(df, title="t-sne")

Shape of samples: (2500, 65)
Shape of targets: (2500, 1)
            PC-1       PC-2  class
0     -27.608915  14.112516      3
1     118.877090  -5.304456      3
2      -8.395719 -46.136803      3
3    -130.291016  -5.106524      3
4     -24.492390   4.639995      3
...          ...        ...    ...
2495  103.445854  28.449652      2
2496  -38.773670   5.028303      2
2497   98.184425 -36.573742      2
2498  278.387695  -2.411278      2
2499  -37.819492  47.512714      2

[2500 rows x 3 columns]


In [None]:
# This notebook defines no PCA helper; run the t-SNE experiment instead.
perform_experiment_tsne(df, title="AnghaBestSeqsSBLP2021")

## AnghaLoops

In [None]:
# Load the dataset CSV (path relative to the working directory) into `df`
# and preview its last rows.
filename = "AnghaLoops.csv"
df = pd.read_csv(filename)
df.tail()

In [None]:
# This notebook defines no PCA helper; run the t-SNE experiment instead.
perform_experiment_tsne(df, title="AnghaLoops")

## CodeNetBestSeqsSBLP2021_merge_classes

In [None]:
# Load the dataset CSV (path relative to the working directory) into `df`
# and preview its last rows.
filename = "CodeNetBestSeqsSBLP2021_merge_classes.csv"
df = pd.read_csv(filename)
df.tail()

In [None]:
# This notebook defines no PCA helper; run the t-SNE experiment instead.
perform_experiment_tsne(df, title="CodeNetBestSeqsSBLP2021_merge_classes")

## CodeNetBestSeqsSBLP2021

In [None]:
# Load the dataset CSV (path relative to the working directory) into `df`
# and preview its last rows.
filename = "CodeNetBestSeqsSBLP2021.csv"
df = pd.read_csv(filename)
df.tail()

In [None]:
# This notebook defines no PCA helper; run the t-SNE experiment instead.
perform_experiment_tsne(df, title="CodeNetBestSeqsSBLP2021")

## POJ-5

In [None]:
# Load the dataset CSV (path relative to the working directory) into `df`
# and preview its last rows.
filename = "POJ-5.csv"
df = pd.read_csv(filename)
df.tail()

In [None]:
# This notebook defines no PCA helper; run the t-SNE experiment instead.
perform_experiment_tsne(df, title="POJ-5")

## POJBestSeqsSBLP2021_merge_2classes

In [None]:
# Load the dataset CSV (path relative to the working directory) into `df`
# and preview its last rows.
filename = "POJBestSeqsSBLP2021_merge_2classes.csv"
df = pd.read_csv(filename)
df.tail()

In [None]:
# This notebook defines no PCA helper; run the t-SNE experiment instead.
perform_experiment_tsne(df, title="POJBestSeqsSBLP2021_merge_2classes")

## POJBestSeqsSBLP2021_merge_Nclasses

In [None]:
# Load the dataset CSV (path relative to the working directory) into `df`
# and preview its last rows.
filename = "POJBestSeqsSBLP2021_merge_Nclasses.csv"
df = pd.read_csv(filename)
df.tail()

In [None]:
# This notebook defines no PCA helper; run the t-SNE experiment instead.
perform_experiment_tsne(df, title="POJBestSeqsSBLP2021_merge_Nclasses")