# Generating GP Labels
An algorithm for generating labels that will be used by a ML model to predict GPs. 

* We will test the generated labels to find out percentage of correct/incorrect labels.

## Import Libraries

In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
import so4gp as sgp

from sklearn.preprocessing import MinMaxScaler

## Dataset
Retrieving data set and the features

In [17]:
ds = sgp.CluDataGP('../data/DATASET.csv', e_prob=0)
#ds = sgp.CluDataGP('../data/c2k_02k.csv', e_prob=0)
#ds = sgp.CluDataGP('../data/breast_cancer.csv', e_prob=0)

features = np.array(ds.data, dtype=np.float64)
features

array([[30.,  3.,  1., 10.,  2.,  4.],
       [35.,  2.,  2.,  8.,  2.,  4.],
       [40.,  4.,  2.,  7.,  2.,  4.],
       [50.,  1.,  1.,  6.,  2.,  4.],
       [52.,  7.,  1.,  2.,  2.,  4.]])

## Generate Labels and Create a DataFrame

1. We use net-win matrix to generate the labels.
2. We create a DataFrame to store the updated data

In [18]:
def generate_gp_labels_v2(netwin_mat, data_gp):
    # 1. Generate labels
    labels = []
    netwin_mat[netwin_mat == -1] = 2  # encode -1 as 2 in the net-win matrix

    # print(sel_nwm.shape[1])
    for i in range(netwin_mat.shape[1]): # all columns
        temp_label = ''.join(str(x) for x in sel_nwm[:, i]) 
        labels.append(temp_label)
        
    # 2. Add labels to data-frame
    # 2a. get the titles
    column_names = []
    for col_title in data_gp.titles:
        try:
            col = str(col_title.value.decode())
        except AttributeError:
            col = str(col_title[1].decode())
        column_names.append(col)
    column_names.append('GP Label')
    #print(column_names)

    # 2b. add labels column to data set
    col_labels = np.array(labels, dtype='U')
    col_labels = col_labels[:, np.newaxis]
    new_data = np.concatenate([features, col_labels], axis=1)

    # 2c. create data-frame
    df = pd.DataFrame(new_data, columns=column_names)
    
    return df



def generate_gp_labels_v1(netwin_mat, data_gp):
    # 1. Generate labels
    labels = []
    # print(sel_nwm.shape[1])
    for i in range(netwin_mat.shape[1]): # all columns
        temp_label = ''
        gi = 0
        for w in sel_nwm[:, i]:
            if w > 0:
                temp_label += str(gi) + '+'
            elif w < 0:
                temp_label += str(gi) + '-'
            gi += 1
        labels.append(temp_label)
        
    # 2. Add labels to data-frame
    # 2a. get the titles
    column_names = []
    for col_title in data_gp.titles:
        try:
            col = str(col_title.value.decode())
        except AttributeError:
            col = str(col_title[1].decode())
        column_names.append(col)
    column_names.append('GP Label')
    #print(column_names)

    # 2b. add labels column to data set
    col_labels = np.array(labels, dtype=object)
    col_labels = col_labels[:, np.newaxis]
    new_data = np.concatenate([features, col_labels], axis=1)

    # 2c. create data-frame
    df = pd.DataFrame(new_data, columns=column_names)
    
    return df

In [19]:
# print(ds.gradual_items)
# print(ds.net_win_mat)
sel_gis = ds.gradual_items[::2]  # select even rows only 
sel_nwm = np.ndarray.copy(ds.net_win_mat[::2])  # select even rows/vectors only (each GI has two net-win vectors, one is a complement of the other)
print(sel_nwm)

df_1 = generate_gp_labels_v2(sel_nwm, ds)
df_1

[[ 1  1  0 -1 -1]
 [ 0  1 -1  1 -1]
 [ 1 -1 -1  1  1]
 [-1 -1  0  1  1]]


Unnamed: 0,Age,Salary,Cars,Expenses,Invalid1,Invalid2,GP Label
0,30.0,3.0,1.0,10.0,2.0,4.0,1012
1,35.0,2.0,2.0,8.0,2.0,4.0,1122
2,40.0,4.0,2.0,7.0,2.0,4.0,220
3,50.0,1.0,1.0,6.0,2.0,4.0,2111
4,52.0,7.0,1.0,2.0,2.0,4.0,2211


In [20]:
# print(ds.gradual_items)
# print(ds.net_win_mat)
sel_gis = ds.gradual_items[::2]  # select even rows only 
sel_nwm = np.ndarray.copy(ds.net_win_mat[::2])  # select even rows/vectors only (each GI has two net-win vectors, one is a complement of the other)
print(sel_nwm)

df_2 = generate_gp_labels_v1(sel_nwm, ds)
df_2

[[ 1  1  0 -1 -1]
 [ 0  1 -1  1 -1]
 [ 1 -1 -1  1  1]
 [-1 -1  0  1  1]]


Unnamed: 0,Age,Salary,Cars,Expenses,Invalid1,Invalid2,GP Label
0,30.0,3.0,1.0,10.0,2.0,4.0,0+2+3-
1,35.0,2.0,2.0,8.0,2.0,4.0,0+1+2-3-
2,40.0,4.0,2.0,7.0,2.0,4.0,1-2-
3,50.0,1.0,1.0,6.0,2.0,4.0,0-1+2+3+
4,52.0,7.0,1.0,2.0,2.0,4.0,0-1-2+3+


## Analysis
(to do) find subsets of GPs and estimate support based on the frequency of occurrence.

In [21]:
gp_labels = df_2['GP Label']#.unique()
print(df_2['GP Label'].nunique())
gp_labels

5


0      0+2+3-
1    0+1+2-3-
2        1-2-
3    0-1+2+3+
4    0-1-2+3+
Name: GP Label, dtype: object

### Algorithm for aggregating GPs
We try to estimate GPs by aggregating labels with similar GIs


In [22]:
gp_labels = df_1['GP Label']

def estimate_net_win_mat(gp_labels):
    gp_mat = [list(str(obj)) for obj in gp_labels]
    #gp_mat = []
    #for obj in gp_labels:
        # temp_arr = list(str(obj))  # [int(x) for x in str(obj)]
    #    gp_mat.append(list(str(obj)))
    gp_mat = np.array(gp_mat, dtype=int)
    gp_mat[gp_mat == 2] = -1
    return gp_mat

In [23]:
gp_mat = estimate_net_win_mat(gp_labels)
print(gp_mat)

[[ 1  0  1 -1]
 [ 1  1 -1 -1]
 [ 0 -1 -1  0]
 [-1  1  1  1]
 [-1 -1  1  1]]


In [24]:
import gc

def inv(g_item):
    if g_item[1] == '+':
        temp = tuple([g_item[0], '-'])
    else:
        temp = tuple([g_item[0], '+'])
    return temp


def remove_existing_gi(cand_gp):
    cols = []
    new_cand = set()
    for gi_obj in cand_gp:
        if not cols:
            cols.append(gi_obj[0])
            new_cand.add(gi_obj)
        elif gi_obj[0] not in cols:
            cols.append(gi_obj[0])
            new_cand.add(gi_obj)
        # else:
        #    print(str(gi_obj[0]) + ' is already in ' + str(cand_gp))
    return new_cand


def gen_apriori_candidates(lst_gi):
    res = []
    all_candidates = []
    if len(lst_gi) < 2:
        return []
    try:
        set_gi = [{x} for x in lst_gi]
    except TypeError:
        set_gi = [set(x) for x in lst_gi]

    for i in range(len(lst_gi) - 1):
        for j in range(i + 1, len(lst_gi)):
            try:
                gi_i = {lst_gi[i]}
                gi_j = {lst_gi[j]}
                gi_o = {lst_gi[0]}
            except TypeError:
                gi_i = set(lst_gi[i])
                gi_j = set(lst_gi[j])
                gi_o = set(lst_gi[0])
            gp_cand = gi_i | gi_j  # set union i.e., gi_i.union(gi_j)
            gp_cand = remove_existing_gi(gp_cand)  # remove gi from same column

            inv_gp_cand = {inv(x) for x in gp_cand}
            if (len(gp_cand) == len(gi_o) + 1) and (not (all_candidates != [] and gp_cand in all_candidates)) \
                    and (not (all_candidates != [] and inv_gp_cand in all_candidates)):
                is_valid_candidate = True
                for k in gp_cand:
                    try:
                        k_set = {k}
                    except TypeError:
                        k_set = set(k)
                    gp_cand_2 = gp_cand - k_set
                    inv_gp_cand_2 = {inv(x) for x in gp_cand_2}
                    if not gp_cand_2 in set_gi and not inv_gp_cand_2 in set_gi:
                        is_valid_candidate = False
                        break
                if is_valid_candidate:
                    # m = R[i][1] * R[j][1]
                    # t = float(np.sum(m)) / float(n * (n - 1.0) / 2.0)
                    # if t > sup:
                    #    res.append([gp_cand, m])
                    res.append(gp_cand)
                all_candidates.append(gp_cand)
                gc.collect()
    return res


def calculate_support(gp_mat, gi_obj):
    n = gp_mat.shape[0]
    cols = [x[0] for x in gi_obj]
    dirs = np.array([1 if x[1].decode() == '+' else -1 for x in gi_obj])
    mat = gp_mat[:, cols]
    match_rows = np.where(((mat[:, 0] == dirs[0]) & (mat[:, 1] == dirs[1])))
    sup = match_rows[0].size/n

    # print(gi_obj)
    # print(cols)
    # print(dirs)
    # print(mat[:])
    # print(match_rows)
    # print(match_rows[0].size)
    # print(sup)
    # print("\n")
    return sup


def estimate_gps(gp_mat, min_sup):

    patterns = []
    """:type patterns: GP list"""
    str_winner_gps = []
    # n = d_set.attr_size
    # valid_gps = d_set.valid_gps

    valid_gps = []
    n = gp_mat.shape[1]
    for a in range(n):
        pos = np.array((a, '+'), dtype='i, S1')
        neg = np.array((a, '-'), dtype='i, S1')
        valid_gps.append(pos.tolist())
        valid_gps.append(neg.tolist())

    while len(valid_gps) > 0:
    # for x in range(3):
        valid_gps = gen_apriori_candidates(valid_gps)
        print("candidates generated")
        i = 0
        while i < len(valid_gps) and valid_gps != []:
            gi_tuple = valid_gps[i]
            sup = calculate_support(gp_mat, gi_tuple)
            if sup < min_sup:
                del valid_gps[i]
            else:
                gp = sgp.GP()
                """:type gp: GP"""
                for obj in valid_gps[i]:
                    gi = sgp.GI(obj[0], obj[1].decode())
                    """:type gi: GI"""
                    gp.add_gradual_item(gi)
                gp.set_support(sup)
                patterns.append(gp)
                # str_winner_gps.append(gp.print(d_set.titles))
                i += 1
    return str_winner_gps, patterns

In [26]:
y, gps = estimate_gps(gp_mat, 0.2)
for gp in gps:
    print(str(gp.to_string()) + ': ' + str(gp.support))

candidates generated
candidates generated
candidates generated
candidates generated
['0+', '1+']: 0.2
['2+', '0+']: 0.2
['2-', '0+']: 0.2
['0+', '3-']: 0.4
['1+', '0-']: 0.2
['1-', '0-']: 0.2
['2+', '0-']: 0.4
['3+', '0-']: 0.4
['2+', '1+']: 0.2
['2-', '1+']: 0.2
['3+', '1+']: 0.2
['1+', '3-']: 0.2
['1-', '2+']: 0.2
['1-', '2-']: 0.2
['1-', '3+']: 0.2
['2+', '3+']: 0.4
['2+', '3-']: 0.2
['2-', '3-']: 0.2
['2+', '0+', '1+']: 0.2
['2-', '1+', '0+']: 0.2
['0+', '1+', '3-']: 0.2
['2+', '0+', '3-']: 0.2
['0+', '2-', '3-']: 0.2
['2+', '1+', '0-']: 0.2
['3+', '1+', '0-']: 0.2
['1-', '2+', '0-']: 0.2
['1-', '3+', '0-']: 0.2
['2+', '3+', '0-']: 0.4
['2+', '3+', '1+']: 0.4
['2+', '1+', '3-']: 0.2
['2+', '1-', '3+']: 0.2
['2-', '1+', '3-']: 0.2
['2+', '0+', '1+', '3-']: 0.2
['3-', '2-', '1+', '0+']: 0.2
['2+', '3+', '1+', '0-']: 0.4
['2+', '1-', '3+', '0-']: 0.2
