# Load libraries

In [None]:
import os
import tqdm
from codes.bb import *
import pandas as pd
import networkx as nx
import plotly.express as px
import matplotlib.pyplot as plt

# Load data

In [None]:
# Read the benchmark results; 'clean_log' is cast to bool right after loading.
df = pd.read_csv('results.csv')
df['clean_log'] = df['clean_log'].astype(bool)
print(df.info())


def _pdb_from_path(path):
    """Return the last '/'-separated component of *path* without '.nmr'."""
    return path.split('/')[-1].replace('.nmr', '')


def _dmax_from_path(path):
    """Return the text between the last '_' in *path* and the next '/'."""
    return path.split('_')[-1].split('/')[0]


# extract DMAX (kept as a string) and PDB ID from fnmr
df['PDB'] = df['fnmr'].apply(_pdb_from_path)
df['DMAX'] = df['fnmr'].apply(_dmax_from_path)

# column order used from here on
cols = [
    'PDB', 'DMAX', 'fnmr', 'verbose', 'clean_log', 'tmax',  # parameters
    'nnodes', 'lenE', 'lenS',  # sizes
    'costRX', 'costSB', 'costBB', 'costPT',  # costs
    'timeSB', 'timeBB', 'timePT',  # times
    'timeoutBB', 'timeoutPT',  # timeouts
]

# reorder the columns, rename the size columns, then sort by DMAX and |V|
df = (
    df[cols]
    .rename(columns={'nnodes': '|V|', 'lenE': '|E|', 'lenS': '|S|'})
    .sort_values(by=['DMAX', '|V|'])
)

df.head()

# Exploratory analysis

In [None]:
# number of rows flagged as timeout for each of the two timeout columns
print('num timeoutBB:', int(df['timeoutBB'].sum()))
print('num timeoutPT:', int(df['timeoutPT'].sum()))
print()

# count number of instances per DMAX
print('Number of pdb_id per DMAX:')
# The identifier column is 'PDB': the column selection done when loading the
# data keeps no 'pdb_id' column, so indexing by 'pdb_id' raised KeyError.
print(df.groupby('DMAX')['PDB'].count())

In [None]:
# Write one LaTeX table per DMAX value with the size columns of each PDB file.
cols = ['PDB', '|V|', '|E|', '|S|']  # columns to show
DMAX = df['DMAX'].unique()
for dmax in DMAX:
    dfA = df.loc[df['DMAX'] == dmax, cols].reset_index(drop=True)
    print(f'  - dfA.shape: {dfA.shape}')
    latex_options = dict(
        index=False,
        column_format='lrrc',
        header=cols,  # list of column names
        escape=False,
        caption=f'Number of nodes, edges and segments for each PDB file (dmax={dmax}).',
        label=f'tab:pdb_dmax_{dmax}',
        multicolumn=False,
        multicolumn_format='c',
    )
    tex = dfA.to_latex(**latex_options)
    # save tex to file
    with open(f'./latex/tables/pdb_dmax_{dmax}.tex', 'w') as f:
        f.write(tex)

In [None]:
# Split the results by DMAX (a string column derived from the fnmr path) and
# summarize the instance sizes of each subset.
df50 = df[df['DMAX'] == '50']
df60 = df[df['DMAX'] == '60']

print('df50.shape:', df50.shape)
# label fixed: this line reports df60 but was previously labelled 'df50.shape:'
print('df60.shape:', df60.shape)

cols = ['|V|', '|E|', '|S|']
print(df50[cols].describe())
print(df60[cols].describe())

In [None]:
# plot number of edges and segments (for DMAX 50 and 60) against |V|
size_cols = ['|E|', '|S|']
df50 = (
    df.loc[df['DMAX'] == '50', ['PDB', '|V|'] + size_cols]
    .reset_index(drop=True)
    .rename(columns={'|E|': '|E|_50', '|S|': '|S|_50'})
)
df60 = (
    df.loc[df['DMAX'] == '60', ['PDB'] + size_cols]
    .reset_index(drop=True)
    .rename(columns={'|E|': '|E|_60', '|S|': '|S|_60'})
)
# one row per PDB file present in both subsets
dfA = df50.merge(df60, on='PDB')
fig = px.scatter(
    dfA,
    x='|V|',
    y=['|E|_50', '|E|_60', '|S|_50', '|S|_60'],
    hover_name='PDB',
    log_x=True,
    log_y=True,
)
fig.show()
# export figure
fig.write_image('./latex/figures/edges_segments.png')

In [None]:
# plot number of edges against number of vertices, colored by DMAX (log-log axes)
fig = px.scatter(df, x='|V|', y='|E|', color='DMAX', hover_name='PDB', log_x=True, log_y=True)
# export figure (the figure is written to disk only; it is not shown in this cell)
fig.write_image('./latex/figures/vertices_edges.png')

### Plot NMR Graph

In [None]:
def plot_graph(G, fnmr, savefig=False):
    """Draw graph *G* with a spring layout and optionally save it as a PDF.

    Nodes whose label contains ':' are drawn in yellow with size 800; all
    other nodes are drawn in red with size 300.

    Args:
        G: graph handed to the networkx layout and drawing functions.
        fnmr: path printed before drawing; when saving, each '.nmr' in it is
            replaced by '.pdf' to build the output file name.
        savefig: when true, the figure is saved before it is shown.
    """
    print(fnmr)
    layout = nx.spring_layout(G)
    colors, sizes = [], []
    for node in G.nodes():
        has_colon = ':' in node
        colors.append('y' if has_colon else 'r')
        sizes.append(800 if has_colon else 300)
    nx.draw_networkx_nodes(
        G,
        layout,
        node_size=sizes,
        node_shape='o',
        node_color=colors,
    )
    nx.draw_networkx_labels(G, layout, font_color='k', font_size=8)
    nx.draw_networkx_edges(G, layout)
    if savefig:
        # save before plt.show(), as in the original statement order
        plt.savefig(fnmr.replace('.nmr', '.pdf'), format='pdf')
    plt.show()

In [None]:
# Draw, and save as PDF, the ordering graph of every 'testRAND*.nmr' file
# found in DATA_TEST.
wdir = 'DATA_TEST'
for fn in os.listdir(wdir):
    if fn.startswith('testRAND') and fn.endswith('.nmr'):
        fn = os.path.join(wdir, fn)
        nmr = NMR(fn)
        E, S = nmr.E, nmr.S
        G = nmr.ordering_graph
        plot_graph(G, fn, savefig=True)

### Read LOG Files

In [None]:
def _parse_log_value(field, raw):
    """Convert the raw text of one log entry according to its field name."""
    if 'fnmr' in field:
        return raw
    if 'timeoutBB' in field:
        return int(raw)
    # NOTE(review): 'timeoutPT' also contains 'time', so it is parsed as a
    # float by this branch — confirm that this is intended.
    if 'time' in field:
        return float(raw)
    return int(raw)


# Collect one record per '.log' file found in each working directory.
WDIR = ['DATA_EPSD_00_DMAX_50', 'DATA_EPSD_00_DMAX_60']
df = []
for wdir in WDIR:
    print('Processing wdir=%s' % wdir)
    # dmax is the integer after the last '_' of the directory name
    dmax = int(wdir.split('_')[-1])
    for flog in tqdm.tqdm(sorted(os.listdir(wdir))):
        if not flog.endswith('.log'):
            continue
        pid = flog.replace('.log', '')
        flog = os.path.join(wdir, flog)
        df_log = {'pid': pid, 'dmax': dmax}
        with open(flog, 'r') as fid:
            for row in fid:
                # drop ':' and the newline, then split on single spaces;
                # the field name is the second token, the value the last one
                row = row.replace(':', '').replace('\n', '').split(' ')
                field = row[1]
                df_log[field] = _parse_log_value(field, row[-1])
        df.append(df_log)
df = pd.DataFrame(df)

# relative difference between costSB and each of the other costs
for gap_col, cost_col in (('gapRL', 'costRX'), ('gapBB', 'costBB'), ('gapPT', 'costPT')):
    df[gap_col] = (df['costSB'] - df[cost_col]) / df[cost_col]

# shorten the timeout column names and drop the file path column
C = {'timeoutBB': 'tOutBB', 'timeoutPT': 'tOutPT'}
df = df.rename(columns=C).drop(columns=['fnmr'])

fname = 'results.xlsx'
print('Save', fname)
df.to_excel(fname, index=False)
df.head()

In [None]:
# time out cases: rows where at least one of the two timeout flags is set.
# Column names follow the frame built from the log files: the timeout columns
# were renamed to 'tOutBB'/'tOutPT' and the cost columns are 'costRX'/'costSB'
# (the previous 'timeoutBB', 'timeoutPT', 'costRELAX' and 'costSBBU' names do
# not exist in that frame and raised KeyError).
dfA = df[df['tOutPT'] + df['tOutBB'] >= 1]
dfA = dfA[['pid', 'dmax', 'costRX', 'costSB', 'costBB', 'costPT', 'gapRL', 'timeBB', 'timePT']]
dfA

In [None]:
# mean of every remaining column for each (pid, dmax) pair; the result is
# only displayed, not stored
df.groupby(by=['pid','dmax']).mean()

In [None]:
# split the log results by dmax; here 'dmax' is compared as an integer
# (unlike the string 'DMAX' column of the results.csv frame)
df50 = df[df['dmax']==50]
df60 = df[df['dmax']==60]

### Instance Analysis

In [None]:
# Load a single instance and report how many entries its E and S hold.
fn = 'DATA_EPSD_00_DMAX_50/1n6t.nmr'
nmr = NMR(fn)
E = nmr.E
S = nmr.S
print('len(E):', len(E), 'len(S):', len(S))

In [None]:
# draw the ordering graph of the loaded instance without saving it to disk
G = nmr.ordering_graph
plot_graph(G, fn, False)

In [None]:
# order_brute returns a pair unpacked as (order, costBF) — presumably an
# exhaustive search for the best ordering; TODO confirm against codes.bb
order, costBF = order_brute(nmr)
print('costBF=',costBF)

In [None]:
# For every segment emit two points: (i, j) tagged 'nmr' and (i, i) tagged
# 'x==y', so the segments can be compared against the diagonal.
x, y, z = [], [], []
for sid in S:
    s = S[sid]
    x += [s.i, s.i]
    y += [s.j, s.i]
    z += ['nmr', 'x==y']
df = {'x': x, 'y': y, 'z': z}
px.scatter(df, x='x', y='y', color='z')

In [None]:
# list every edge: its id, its i and j attributes and its sid attribute
for eid in E:
    e = E[eid]
    print(eid, e.i, e.j, e.sid)

In [None]:
# display the (i, j) attributes of the segment stored under key 1
S[1].i, S[1].j

In [None]:
# Histogram of segment degrees, where the degree of a segment is the number
# of edge ids it holds (len(s.eid)).
degree = {}
for sid in S:
    s: NMRSegment = S[sid]
    d = len(s.eid)
    degree[d] = degree.get(d, 0) + 1
# convert to a list of (degree, count) pairs sorted by degree
degree = sorted(degree.items())


In [None]:
# unzip the (degree, count) pairs into two parallel tuples and plot them
x, y = zip(*degree)
df = {
    'segment degree': x,
    '#occurrencies': y,
}
px.scatter(df, x='segment degree', y='#occurrencies')

In [None]:
# Do any two segments share exactly the same edges?
# K holds the distinct sorted edge-id tuples; #K < #S means at least one repeat.
K = {tuple(sorted(S[sid].eid)) for sid in tqdm.tqdm(S)}
print('#K:', len(K), '#S:', len(S))

In [None]:
# Within each segment, drop every edge 'a' for which the same segment also
# holds another edge 'b' whose interval is contained in a's (b \subset a).
# The work is done on a deep copy W, so S keeps the original edge sets.
import copy
W = copy.deepcopy(S)
df = {'sid': [], 'degOld': [], 'degNew': []}
for sid in tqdm.tqdm(sorted(W)):
    s: NMRSegment = W[sid]
    s_eid = sorted(s.eid)
    # eids to be removed: edges that contain some other edge of this segment
    # (a contains b when a.i <= b.i and b.j <= a.j)
    R = {
        eidA
        for eidA in s_eid
        if any(
            E[eidA].i <= E[eidB].i and E[eidB].j <= E[eidA].j
            for eidB in s_eid
            if eidB != eidA
        )
    }
    s.eid = s.eid - R
    # record the segment degree before and after the removal
    df['sid'].append(sid)
    df['degOld'].append(len(s_eid))
    df['degNew'].append(len(s.eid))


In [None]:
# degree of each segment before (x) and after (y) the edge removal, labelled by sid
px.scatter(df, x='degOld', y='degNew', text='sid')

In [None]:
# print the (i, j) pair of every edge of segment 623 in the original set S
sid = 623
s = S[sid]
for eid in s.eid:
    e: NMREdge = E[eid]
    print('(%d, %d)' % (e.i, e.j))


In [None]:
# same listing for segment 623 taken from W, the copy whose edge sets were reduced
sid = 623
s = W[sid]
for eid in s.eid:
    e: NMREdge = E[eid]
    print('(%d, %d)' % (e.i, e.j))


In [None]:
# display the (i, j) attributes of the segment currently bound to s
s.i, s.j