In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os

# Work from the parent of the directory the kernel started in.
# The start directory is remembered the first time this cell runs, so that
# re-running the cell always lands in the same place; a bare os.chdir('../')
# would climb one more level on every execution.
if 'NOTEBOOK_START_DIR' not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, os.pardir))

In [93]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.spatial.distance import cosine

## Setup Dataset

In [385]:
# Presumably the default RNG seed for the experiments below.
# NOTE(review): no visible cell reads this value -- the stability loop rebinds
# `seed` as its loop variable and the noise loop passes 44 literally. Confirm
# whether this assignment is still needed.
seed = 43

In [403]:
def generate_data(seed,noise=0.1,num_data_points=10000):
    """Sample a synthetic 2-D dataset with noisy binary concepts and binary labels.

    Every point lies in one of the four corner squares of the unit square:
    each coordinate is drawn uniformly from either the low band (0 to 0.25)
    or the high band (0.75 to 1.0). The corner is picked by one uniform draw
    compared against the thresholds 1/3, 1/2 and 2/3, so the corners
    (low, low), (low, high), (high, low), (high, high) are chosen with
    probability 1/3, 1/6, 1/6 and 1/3 respectively.
    NOTE(review): the unequal corner weights follow directly from those
    thresholds -- confirm they are intended.

    Concepts (before noise) are ``[x1 <= 1/4, x1 >= 3/4, x2 <= 1/4, x2 >= 3/4]``;
    each one is flipped independently with probability ``noise``.
    Labels are ``[min(x1, x2) <= 1/4, max(x1, x2) >= 3/4]`` and are never noised.

    Side effect: reseeds NumPy's *global* RNG with ``seed``. Code that later
    draws from ``np.random`` is therefore affected by this call.

    Args:
        seed: Value passed to ``np.random.seed``.
        noise: Probability of flipping each individual concept bit.
        num_data_points: Number of points to generate (default 10000).

    Returns:
        Tuple ``(x, y, c)`` of arrays with shapes ``(n, 2)``, ``(n, 2)`` and
        ``(n, 4)``: coordinates, labels and (noisy) concepts.
    """
    low_band, high_band = (0, 0.25), (0.75, 1.0)

    np.random.seed(seed)

    # All coordinates are drawn before any concept noise is drawn. Keeping
    # this order keeps the random stream, and hence the generated data,
    # the same for a given seed.
    points = []
    for _ in range(num_data_points):
        choice = np.random.random()
        if choice < 1/3:
            first_band, second_band = low_band, low_band
        elif choice < 1/2:
            first_band, second_band = low_band, high_band
        elif choice < 2/3:
            first_band, second_band = high_band, low_band
        else:
            first_band, second_band = high_band, high_band
        x1 = np.random.uniform(*first_band)
        x2 = np.random.uniform(*second_band)
        points.append([x1, x2])
    # reshape keeps the result 2-D even when no points were requested.
    x = np.array(points, dtype=float).reshape(-1, 2)

    concepts = []
    labels = []
    for x1, x2 in x:
        clean_bits = [int(x1 <= 1/4), int(x1 >= 3/4), int(x2 <= 1/4), int(x2 >= 3/4)]
        # Exactly one random draw per concept, in concept order.
        noisy_bits = [1 - bit if np.random.random() < noise else bit
                      for bit in clean_bits]
        concepts.append(noisy_bits)
        labels.append([int(min(x1, x2) <= 1/4), int(max(x1, x2) >= 3/4)])

    c = np.array(concepts, dtype=int).reshape(-1, 4)
    y = np.array(labels, dtype=int).reshape(-1, 2)
    return x,y,c


## Create Concept-Based Models

In [407]:
def find_most_similar_groups(x,y,c):
    """Pair each concept with the concept whose learned label weights are most similar.

    One ``SGDClassifier`` (hinge loss) per column of ``y`` is fit on the first
    4/5 of the rows of ``c``. Stacking the fitted coefficient vectors gives a
    ``(num_labels, num_concepts)`` matrix; each concept is described by its
    column of that matrix, and concepts are compared by cosine similarity.

    The last 1/5 of the rows is not used: held-out predictions were never
    returned or reported, so they are not computed.

    NOTE(review): the classifiers are created without ``random_state``; per
    the scikit-learn docs they presumably draw from NumPy's global RNG, so
    results are only reproducible if that RNG was seeded beforehand -- confirm.

    Args:
        x: Only ``len(x)`` is used, to locate the 4/5 split point.
        y: 2-D array of labels, one column per label.
        c: 2-D array of concepts, one column per concept.

    Returns:
        Dict mapping each concept index to a tuple
        ``(index of the most similar other concept, cosine similarity)``.
        A concept for which no other concept scores above -1 keeps the
        placeholder ``(its own index, -1)``.

    Side effect: prints the shape of the coefficient matrix.
    """
    split = int(4/5*len(x))
    c_train = c[:split]
    y_train = y[:split]

    coeffs = []
    for label in range(y_train.shape[1]):
        linear_classifier = SGDClassifier(loss='hinge', max_iter=1000, tol=1e-3,alpha=0.1)
        linear_classifier.fit(c_train, y_train[:,label])
        coeffs.append(linear_classifier.coef_[0])

    coeffs = np.array(coeffs)
    # Kept so the notebook's recorded cell output stays the same.
    print(coeffs.shape)

    num_concepts = c.shape[1]
    # Pre-filling fixes the key order (0..n-1) and the "no partner" placeholder.
    most_similar = {k: (k, -1) for k in range(num_concepts)}

    # Cosine similarity is symmetric, so each unordered pair is scored once and
    # offered to both of its members. The strict '<' means the earlier partner
    # wins on ties.
    for i in range(num_concepts):
        for j in range(i + 1, num_concepts):
            cosine_similarity = 1 - cosine(coeffs[:,i], coeffs[:,j])
            if most_similar[i][1] < cosine_similarity:
                most_similar[i] = (j,cosine_similarity)
            if most_similar[j][1] < cosine_similarity:
                most_similar[j] = (i,cosine_similarity)
    return most_similar

In [408]:
def find_most_similar_groups_bad(x,y,c):
    """Baseline grouping built from randomly drawn concept weights.

    For every label column of ``y`` the function:

    1. fits an ``SGDClassifier`` (hinge loss) on the concatenation of the
       concept columns and the raw ``x`` columns (first 4/5 of the rows),
    2. draws one ``np.random.random()`` value per concept and writes those
       values into the classifier's concept coefficients,
    3. refits the same classifier on the raw ``x`` columns only and predicts
       on the held-out 1/5 (the resulting accuracy is computed but not used),
    4. records the *random* values from step 2 as that label's concept weights.

    The recorded weights form a ``(num_labels, num_concepts)`` matrix whose
    columns are compared by cosine similarity.

    NOTE(review): only the random draws reach the returned grouping; the
    fitted models do not contribute coefficients to it. The draws come from
    NumPy's global RNG, so the result depends on that RNG's state at call
    time -- confirm this is the intended baseline.

    Args:
        x: 2-D array of raw inputs; ``len(x)`` also fixes the 4/5 split.
        y: 2-D array of labels, one column per label.
        c: 2-D array of concepts, one column per concept.

    Returns:
        Dict mapping each concept index to a tuple
        ``(index of the most similar other concept, cosine similarity)``;
        a concept with no partner scoring above -1 keeps ``(own index, -1)``.
    """
    combined = np.array([np.append(c[row], x[row]) for row in range(len(c))])
    split = int(4/5*len(x))
    combined_train, combined_test = combined[:split], combined[split:]
    labels_train, labels_test = y[:split], y[split:]

    num_concepts = c.shape[1]

    random_weights = []
    for label in range(labels_train.shape[1]):
        target = labels_train[:,label]

        model = SGDClassifier(loss='hinge', max_iter=1000, tol=1e-3,alpha=0.1)
        model.fit(combined_train, target)

        # Concept columns come first in the combined matrix.
        concept_columns = list(range(num_concepts))
        drawn = [np.random.random() for _ in range(num_concepts)]
        for column, weight in zip(concept_columns, drawn):
            model.coef_[0][column] = weight

        # Second fit sees the raw-input columns only.
        model.fit(np.delete(combined_train, concept_columns, axis=1), target)

        held_out_predictions = model.predict(np.delete(combined_test, concept_columns, axis=1))
        accuracy = accuracy_score(labels_test[:,label], held_out_predictions)
        random_weights.append(drawn)

    weight_matrix = np.array(random_weights)

    # Pre-filling fixes the key order (0..n-1) and the "no partner" placeholder.
    best_partner = {k: (k, -1) for k in range(num_concepts)}
    for first in range(num_concepts):
        for second in range(first + 1, num_concepts):
            similarity = 1 - cosine(weight_matrix[:,first], weight_matrix[:,second])
            if best_partner[first][1] < similarity:
                best_partner[first] = (second,similarity)
            if best_partner[second][1] < similarity:
                best_partner[second] = (first,similarity)
    return best_partner

## Stability Tests

In [409]:
# Run both grouping procedures on datasets generated from different seeds.
# The groupings are stored per seed so they can be compared afterwards;
# `good_groups` / `bad_groups` alone only ever hold the last seed's result.
stability_results = {}
for seed in [42,43,44]:
    x,y,c = generate_data(seed)
    good_groups = find_most_similar_groups(x,y,c)
    bad_groups = find_most_similar_groups_bad(x,y,c)
    stability_results[seed] = {'good': good_groups, 'bad': bad_groups}

(2, 4)
(2, 4)
(2, 4)


## Noise Tests

In [410]:
# Sweep the concept-flip probability while holding the data seed fixed at 44,
# and print the grouping found by each procedure at every noise level.
# Each printed dict maps a concept index to
# (index of its most similar other concept, cosine similarity).
for noise in [0,0.2,0.4,0.5]:
    x,y,c = generate_data(44,noise=noise)
    good_groups = find_most_similar_groups(x,y,c)
    bad_groups = find_most_similar_groups_bad(x,y,c)
    # First line of each pair: grouping from the fitted concept coefficients.
    print(good_groups)
    # Second line: grouping from the randomly drawn baseline weights.
    print(bad_groups)

(2, 4)
{0: (2, 0.9999999842074139), 1: (3, 0.9999999838970476), 2: (0, 0.9999999842074139), 3: (1, 0.9999999838970476)}
{0: (2, 0.9991897806714082), 1: (3, 0.9992840733680814), 2: (0, 0.9991897806714082), 3: (1, 0.9992840733680814)}
(2, 4)
{0: (2, 0.9973824798039452), 1: (3, 0.9995112074316401), 2: (0, 0.9973824798039452), 3: (1, 0.9995112074316401)}
{0: (2, 0.9991897806714082), 1: (3, 0.9992840733680814), 2: (0, 0.9991897806714082), 3: (1, 0.9992840733680814)}
(2, 4)
{0: (2, 0.9943091539198102), 1: (3, 0.9578262852211514), 2: (0, 0.9943091539198102), 3: (1, 0.9578262852211514)}
{0: (2, 0.9991897806714082), 1: (3, 0.9992840733680814), 2: (0, 0.9991897806714082), 3: (1, 0.9992840733680814)}
(2, 4)
{0: (2, 0.7999999999999892), 1: (2, 0.707106781186552), 2: (0, 0.7999999999999892), 3: (1, -0.7071067811865586)}
{0: (2, 0.9991897806714082), 1: (3, 0.9992840733680814), 2: (0, 0.9991897806714082), 3: (1, 0.9992840733680814)}
