<a href="https://colab.research.google.com/github/mangohehe/DeepGate2/blob/data-learning/notebooks/Data-Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@markdown Git clone DeepGate2 Repo
!git clone https://github.com/mangohehe/DeepGate2.git

Cloning into 'DeepGate2'...
remote: Enumerating objects: 159, done.[K
remote: Counting objects: 100% (73/73), done.[K
remote: Compressing objects: 100% (60/60), done.[K
remote: Total 159 (delta 24), reused 32 (delta 10), pack-reused 86[K
Receiving objects: 100% (159/159), 52.56 MiB | 21.97 MiB/s, done.
Resolving deltas: 100% (40/40), done.
Updating files: 100% (71/71), done.


In [2]:
#@markdown Run to mount Google Drive

from google.colab import drive
drive.mount('/content/drive')

%cd /content/DeepGate2/data/train
%pwd

Mounted at /content/drive
/content/DeepGate2/data/train


'/content/DeepGate2/data/train'

In [None]:
#@markdown Run to unzip training data

%cd /content/DeepGate2/dataset
!tar -jxvf rawaig.tar.bz2

In [4]:
import os
from collections import defaultdict
import pandas as pd
# Import the data_table module
from google.colab import data_table
from collections import defaultdict, Counter
# Enable the DataTable formatter globally
data_table.enable_dataframe_formatter()

RAW_AIG_DIR = "/content/DeepGate2/dataset/rawaig" #@param

In [80]:
#@markdown def analyze_bench_files(bench_dir)

def load_combined_bench_files(bench_dir):
    """Analyze .bench files in the specified directory and return a summary DataFrame."""

    # Initialize a defaultdict to store counts for each circuit
    circuit_data = defaultdict(lambda: {
        'Number of Files': 0, 'Inputs': 0, 'Outputs': 0, 'AND': 0, 'NOT': 0, 'OR': 0,
        'NAND': 0, 'NOR': 0, 'XOR': 0, 'XNOR': 0, 'Others': Counter(), 'Total Gates': 0
    })

    # Check if the directory exists before proceeding
    if not os.path.exists(bench_dir) or not os.path.isdir(bench_dir):
        print(f"Directory {bench_dir} not found or isn't a directory.")
        return pd.DataFrame()

    # Initialize a global counter for unknown instance types
    global_unknown_types = Counter()

    # Iterate through the files in the directory
    for filename in os.listdir(bench_dir):
        # Process only files ending with the .bench extension
        if filename.endswith(".bench"):
            circuit_name = '_'.join(filename.split('_')[:-1])
            circuit_data[circuit_name]['Number of Files'] += 1

            # Initialize counters for gates and pins
            inputs = outputs = and_gates = not_gates = or_gates = nand_gates = nor_gates = xor_gates = xnor_gates = 0
            unknown_types = Counter()

            # Open and parse the .bench file
            file_path = os.path.join(bench_dir, filename)
            try:
                with open(file_path, "r") as f:
                    for line in f:
                        line = line.strip()
                        if line.startswith("INPUT"):
                            inputs += 1
                        elif line.startswith("OUTPUT"):
                            outputs += 1
                        elif "= AND" in line:
                            and_gates += 1
                        elif "= NOT" in line:
                            not_gates += 1
                        elif "= OR" in line:
                            or_gates += 1
                        elif "= NAND" in line:
                            nand_gates += 1
                        elif "= NOR" in line:
                            nor_gates += 1
                        elif "= XOR" in line:
                            xor_gates += 1
                        elif "= XNOR" in line:
                            xnor_gates += 1
                        else:
                            if "=" in line:
                                gate_type = line.split('=')[1].strip().split()[0]
                                unknown_types[gate_type] += 1

                # Aggregate known gate counts
                circuit_data[circuit_name]['Inputs'] += inputs
                circuit_data[circuit_name]['Outputs'] += outputs
                circuit_data[circuit_name]['AND'] += and_gates
                circuit_data[circuit_name]['NOT'] += not_gates
                circuit_data[circuit_name]['OR'] += or_gates
                circuit_data[circuit_name]['NAND'] += nand_gates
                circuit_data[circuit_name]['NOR'] += nor_gates
                circuit_data[circuit_name]['XOR'] += xor_gates
                circuit_data[circuit_name]['XNOR'] += xnor_gates
                circuit_data[circuit_name]['Others'].update(unknown_types)

                # Sum all gates excluding inputs and outputs
                total_gates = (
                    and_gates + not_gates + or_gates + nand_gates +
                    nor_gates + xor_gates + xnor_gates + sum(unknown_types.values())
                )
                circuit_data[circuit_name]['Total Gates'] += total_gates

                # Update the global unknown types counter
                global_unknown_types.update(unknown_types)

            except IOError as e:
                print(f"Error reading file {filename}: {e}")

    # Convert to DataFrame and handle the Others column as a string
    df = pd.DataFrame.from_dict(circuit_data, orient='index')
    df['Others'] = df['Others'].apply(dict)

    # Add a total summary row
    df.loc['Total'] = df.sum(numeric_only=True)
    df.at['Total', 'Number of Files'] = df['Number of Files'].sum()
    df.at['Total', 'Others'] = dict(global_unknown_types)
    df.index.name = 'Circuit'
    df['Circuit'] = df.index
    df = df.loc[:, (df != 0).any(axis=0)]
    df = df.reset_index(drop=True)
    # Log all unknown gate types globally
    print("Global unknown gate types across all circuits:", dict(global_unknown_types))

    return df

import os
from collections import defaultdict
import fnmatch

#@markdown def analyze_fanin_fanout(bench_dir, pattern="log2.*.bench")

def analyze_fanin_fanout(bench_dir, pattern="log2.*.bench"):
    """Analyze fan-in and fan-out for .bench files matching the given pattern."""
    fanin_counts = defaultdict(int)
    fanout_graph = defaultdict(list)

    # Check if the directory exists
    if not os.path.exists(bench_dir) or not os.path.isdir(bench_dir):
        print(f"Directory {bench_dir} not found or isn't a directory.")
        return None

    # Iterate through each file in the directory
    for filename in os.listdir(bench_dir):
        if fnmatch.fnmatch(filename, pattern):
            file_path = os.path.join(bench_dir, filename)

            try:
                with open(file_path, "r") as f:
                    for line in f:
                        line = line.strip()
                        if "=" in line:
                            # Parse the gate and its input connections
                            print(line)
                            gate, expression = line.split("=")
                            gate = gate.strip()
                            inputs = expression.strip().split(",")
                            fanin_counts[gate] = len(inputs)
                            for input_gate in inputs:
                                input_gate = input_gate.strip()
                                fanout_graph[input_gate].append(gate)

            except IOError as e:
                print(f"Error reading file {filename}: {e}")

    # Calculate fan-in and fan-out values
    fanin_values = list(fanin_counts.values())
    fanout_counts = {k: len(v) for k, v in fanout_graph.items()}
    fanout_values = list(fanout_counts.values())

    # Summary statistics
    avg_fanin = sum(fanin_values) / len(fanin_values) if fanin_values else 0
    avg_fanout = sum(fanout_values) / len(fanout_values) if fanout_values else 0

    print(f"Average Fan-In: {avg_fanin:.2f}")
    print(f"Average Fan-Out: {avg_fanout:.2f}")

    return {
        "fanin_counts": fanin_counts,
        "fanout_counts": fanout_counts,
        "avg_fanin": avg_fanin,
        "avg_fanout": avg_fanout
    }


import numpy as np
import pandas as pd

#@markdown def analyze_bench_files(bench_dir, pattern="log2.*.bench")

def load_npz_data_to_df(graphs_file, labels_file):
    """Load and convert the circuit data into a DataFrame."""

    # Load graph data
    graph_data = np.load(graphs_file, allow_pickle=True)
    label_data = np.load(labels_file, allow_pickle=True)

    # Extract graph circuits and label data
    circuits = graph_data['circuits'].item()
    labels = label_data['labels'].item()

    # Create a list to store circuit information
    circuit_info = []

    # Iterate over circuits and gather details
    for circuit_name, graph_info in circuits.items():
        x = graph_info['x']  # Node feature matrix
        edge_index = graph_info['edge_index']  # Edge index matrix

        circuit_labels = labels[circuit_name]
        tt_pair_index = circuit_labels['tt_pair_index']  # Index pairs for truth table comparison
        tt_dis = circuit_labels['tt_dis']  # Truth table distances

        # Gather circuit data into a dictionary
        circuit_info.append({
            'Circuit Name': circuit_name,
            'Number of Nodes': x.shape[0],
            'Number of Edges': edge_index.shape[1],
            'Number of TT Pairs': tt_pair_index.shape[0],
            'Number of TT Distances': tt_dis.shape[0]
        })

    # Convert to a DataFrame

    df = pd.DataFrame(circuit_info)

    return df

# Load .bench files

In [81]:
#@markdown load bench files into df
df = load_combined_bench_files(RAW_AIG_DIR)


Global unknown gate types across all circuits: {}


In [82]:
df['Circuit']

0             b22_C
1           sin_syn
2        s13207_syn
3      mem_ctrl_syn
4             b18_C
           ...     
105       b01_opt_C
106      square_syn
107           b09_C
108       b02_opt_C
109           Total
Name: Circuit, Length: 110, dtype: object

In [84]:
Circuit_Name = 'log2_syn' #@param

df[df['Circuit'] == Circuit_Name]


Unnamed: 0,Number of Files,Inputs,Outputs,AND,NOT,Others,Total Gates,Circuit
26,336.0,29689.0,1463.0,130826.0,159275.0,{},290101.0,log2_syn


In [8]:
df.head(20)

Unnamed: 0_level_0,Number of Files,Inputs,Outputs,AND,NOT,Others,Total Gates
Circuit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
b22,209.0,12892.0,1414.0,29516.0,38704.0,{},68220.0
sin,52.0,3006.0,181.0,15207.0,17900.0,{},33107.0
s13207,20.0,1483.0,330.0,2184.0,2065.0,{},4249.0
mem,653.0,47359.0,4144.0,78469.0,80196.0,{},158665.0
b18,686.0,61615.0,6752.0,129655.0,113525.0,{},243180.0
wbscope,1323.0,16365.0,20812.0,110800.0,53296.0,{},164096.0
reed,474.0,31709.0,3484.0,87147.0,89663.0,{},176810.0
tv80,72.0,3913.0,350.0,9333.0,11500.0,{},20833.0
arbiter,218.0,13771.0,1226.0,19520.0,28791.0,{},48311.0
b20,150.0,6516.0,1460.0,13515.0,19683.0,{},33198.0


In [None]:
#@markdown Check the circuit
Number_of_files = "010" #@param
print(Circuit_Name)

circuit_name_pattern="{Circuit_Name}_{Number_of_files}.bench".format(Circuit_Name=Circuit_Name, Number_of_files=Number_of_files)
print(circuit_name_pattern)

fanin_fanout_data = analyze_fanin_fanout(RAW_AIG_DIR, pattern=circuit_name_pattern)

fanin_fanout_data

# Load NPZ files

In [17]:
#@markdown Copy and unzip pre-generated npz files

import zipfile
import os

# Path to the zip file in Google Drive
zip_path = '/content/drive/My Drive/DeepGate2/train.zip'
train_data = '/content/DeepGate2/data/train'
train_data_dir = '/content/DeepGate2/data'

# Remove the folder if it exists
if os.path.exists(train_data):
    !rm -rf {train_data}

# Unzipping the file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(train_data_dir)

# Replace these file paths with the actual paths to your `.npz` files
graphs_file = '/content/DeepGate2/data/train/graphs.npz'
labels_file = '/content/DeepGate2/data/train/labels.npz'


In [21]:
import numpy as np

# Load the .npz file
graph_npz_file = np.load(graphs_file, allow_pickle=True)

# List all keys (variable names) inside the .npz file
print("Keys in the .npz file:", list(graph_npz_file.keys()))

# Access specific data arrays using their keys
circuits_data = graph_npz_file['circuits'].item()


Keys in the .npz file: ['circuits']


In [30]:
print(len(circuits_data.keys()))
count = 0
for circuit_name, circuit_info in circuits_data.items():
    print(f"Circuit: {circuit_name}")
    print(f"Number of Nodes: {circuit_info['x'].shape[0]}")
    print(f"Number of Edges: {circuit_info['edge_index'].shape[1]}")
    count += 1
    if count > 2:
        break


10000
Circuit: pci_syn_031
Number of Nodes: 479
Number of Edges: 2
Circuit: arbiter_syn_111
Number of Nodes: 207
Number of Edges: 2
Circuit: pci_syn_074
Number of Nodes: 175
Number of Edges: 2


In [18]:
#@markdown Load {graphs, labels} to DF
circuit_df = load_npz_data_to_df(graphs_file, labels_file)
circuit_df.describe()

Unnamed: 0,Number of Nodes,Number of Edges,Number of TT Pairs,Number of TT Distances
count,10000.0,10000.0,10000.0,10000.0
mean,347.0397,2.0,87.8433,87.8433
std,311.082826,0.0,283.730719,283.730719
min,18.0,2.0,0.0,0.0
25%,167.0,2.0,24.0,24.0
50%,257.0,2.0,44.0,44.0
75%,403.0,2.0,82.0,82.0
max,3676.0,2.0,11938.0,11938.0


In [19]:
#@markdown Find a circuit
circuit_name_to_find = 'c17_syn_000' #@param
filtered_circuit = circuit_df[circuit_df['Circuit Name'] == circuit_name_to_find]
filtered_circuit

Unnamed: 0,Circuit Name,Number of Nodes,Number of Edges,Number of TT Pairs,Number of TT Distances
7023,c17_syn_000,18,2,0,0
