In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import scipy.stats as stats

import seaborn as sns
import json
import openai
import random
import os
import collections
import itertools
import copy
import dataloader
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col
import netgraph
import scipy
import ast

MEDIUM_SIZE = 18
SMALL_SIZE = 0.85 * MEDIUM_SIZE
BIGGER_SIZE = 1.5 * MEDIUM_SIZE

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

with open('params.json') as f:
    params = json.load(f)

openai.api_key = params['OPENAI_API_KEY']
openai.organization = params['OPENAI_ORG']


def get_response(prompt, model='gpt-4-1106-preview', temperature=0.9, system_prompt="You are mimicking a real-life person who wants to make friends."):
    result = openai.ChatCompletion.create(
    model=model,
    temperature=temperature,
    messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
    ])

    return result.choices[0]['message']['content']


def summarize_results(filename):
    with open(filename) as f:
        lines = f.read().splitlines()

    data = []

    for line in lines:

        data.append(json.loads(line))

    reason_list = collections.defaultdict(list)

    for d in data:        
        for results in d["results"]:
            for result in results:
                if result and 'reason' in result.keys():
                    reason_list[d['temperature']].append(result['reason'])

    for k, v in sorted(reason_list.items()):    
        print(f'For temperature {k} the top 3 results are:')
        prompt = f"""
        # Task
        You are given a list of reasons and your task is to summarize them. You must identify the general pattern in the reasons and summarize them in a few sentences.
        You should avoid identifying specific persons.

        # Input
        The input is a list of reasons. The list is given below after chevrons:
        <REASONS>
        {json.dumps(random.sample(v, 50), indent=4)}
        </REASONS>
        """

        ans = get_response(prompt, temperature=0.9, system_prompt="You are a helpful assistant")

        print(ans)


def print_reasons(filename):
    with open(filename) as f:
        lines = f.read().splitlines()

    data = []

    for line in lines:

        data.append(json.loads(line))

    for d in data:
        if d["n"] == 50 and d["simulation"] == 0:
            print(f'n = {d["n"]}, simulation = {d["simulation"]}, temperature = {d["temperature"]}')
            
            for result in d["results"]:
                
                    try:
                        print('\t' + result['reason'])
                    except:
                        pass

def network_growth(G0, temperature, num_choices=1, method='llm', num_samples=-1, num_nodes_samples=-1):
    # Set seed
    random.seed(0)
    np.random.seed(0)    

    # Copy the ground truth graph
    G = G0.copy()
   
    Gs = [G.copy()]

    profiles = nx.get_node_attributes(G, 'features')

    # Edges to drop
    dropped_edges = []

    if num_nodes_samples > 0:
        nodes = random.sample(G.nodes(), min(len(G), num_nodes_samples))
    else:
        nodes = G.nodes()

    # Drop one neighbor for each node
    for v in nodes:
        dropped_v_edges = []
        for _ in range(num_choices):
            if len(list(G.neighbors(v))) > 0:
                
                while True:
                    u = random.choice(list(G.neighbors(v)))
                    if (v, u) not in dropped_edges:
                        dropped_v_edges.append((v, u))
                        G.remove_edge(v, u)
                        break

        dropped_edges.append(dropped_v_edges)

    Gs = [G.copy()]
    results = []
    candidates = []


    for i, t in enumerate(nodes):

        if method == 'llm':
            result, candidate = select_neighbor(G, t, profiles, temperature, num_choices=len(dropped_edges[i]), dropped_nodes=[u for (_, u) in dropped_edges[i]], num_samples=num_samples)

            if result:
                for r in result:
                    v = r['name']
                    r['edge'] = (t, v)
                    G.add_edge(t, v, similarity=r['similarity'])
                results.append(result)

            candidates.append(candidate)
        elif method in ['random', 'homophilous', 'heterophilous', 'ground_truth']:
            if num_samples > 0:
                choice_set = random.sample([v for v in G.nodes() if v != t], num_samples)
            else:
                choice_set = [v for v in G.nodes() if v != t]

            if method == 'random':
                new_nodes = random.sample(choice_set, len(dropped_edges[i]))
            elif method == 'homophilous':
                new_nodes = list(sorted(choice_set, key=lambda v: measure_similarity(profiles[t], profiles[v])['common_attributes'], reverse=True))[:len(dropped_edges[i])]
            elif method == 'heterophilous':
                new_nodes = list(sorted(choice_set, key=lambda v: measure_similarity(profiles[t], profiles[v])['common_attributes']))[:len(dropped_edges[i])]
            elif method == 'ground_truth':
                new_nodes = [e[1] for e in dropped_edges[i]]

            result = []

            for v in new_nodes:
                print(f'Node: {t}, Link: {v}')
                similarity = measure_similarity(profiles[t], profiles[v])
                G.add_edge(t, v, similarity=similarity, weight=similarity['common_attributes'])
            
                result.append({'name' : v, 'similarity' : similarity, 'reason' : method})

            candidate = []

            for v in choice_set:
                similarity = measure_similarity(profiles[t], profiles[v])
                candidate.append({'name' : v, 'similarity' : similarity, 'reason' : method})

            candidates.append(candidate)
            results.append(result)

        Gs.append(G.copy())

    return Gs, results, candidates

def fit_dcm(results):

    similarities = [r['similarity'] for result in results for r in result]
    similarities_df = pd.DataFrame.from_records(similarities)
    similarities_df = sm.add_constant(similarities_df)

    outcomes = np.array([r['edge'][1] for result in results for r in result])

    print(similarities_df)

    mnl_model = sm.MNLogit(outcomes, similarities_df)
    mnl_results = mnl_model.fit()

    print(mnl_results.summary())

    return mnl_results

def measure_similarity(profile1, profile2):
    similarity = {
        'common_attributes' : 0,
        'common_neighbors' : len(set(profile1['neighbors']) & set(profile2['neighbors'])),
        'degree' : profile2['degree'],
    }

    for k in profile1.keys():
        if k != 'name' and k != 'neighbors' and k in profile2.keys():
            if profile1[k] == profile2[k]:
                similarity['common_attributes'] += 1
        
    return similarity


def select_neighbor(G, t, profiles, temperature, num_choices=1, num_samples=-1, dropped_nodes=[]):

    if num_samples > 0:
        choice_set = random.sample([v for v in G.nodes() if v != t and v not in G.neighbors(t)], max(0, num_samples - len(dropped_nodes))) + dropped_nodes
    else:
        choice_set = [v for v in G.nodes() if v != t and v not in G.neighbors(t)]

    candidate_profiles = []

    for v in choice_set + [t]:
        profiles[v]['neighbors'] = list(G.neighbors(v))
        profiles[v]['degree'] = len(profiles[v]['neighbors']) 
        profiles[v]['name'] = v                 
        candidate_profiles.append(profiles[v])

    random.shuffle(candidate_profiles)

    prompt = f"""
    # Task
    Your task is to select a set of people to be friends with.

    # Profile
    Your profile is given below after chevrons:
    <PROFILE>
    {json.dumps(profiles[t])}
    </PROFILE>

    # Candidate Profiles
    The cadidate profiles to be friends with are given below after chevrons:

    <PROFILES>
    {json.dumps(candidate_profiles)}
    </PROFILES>

    # Output
    The output should be given a list of JSON objects with the following structure

    [
        {{
            "name" : name of the person you selected,
            "reason" : reason for selecting the person
        }}, ...
    ]

    # Notes
    - The output must be a list of JSON objects ranked in the order of preference.
    - You can make at most {num_choices} selection{'s' if num_choices > 1 else ''}.
    
    ```json
    """   

    for _ in range(10):
        try:
            ans = get_response(prompt, temperature=temperature)
            results = json.loads(ans.lstrip('```json').rstrip('```'))
            filtered_results = []
            for result in results:
                v = result['name']
                if v in G.nodes():
                    result['similarity'] = measure_similarity(profiles[t], profiles[v])
                    filtered_results.append(result)

                    result['dropped'] = v in dropped_nodes

            print(f'Node: {t}, Links: {filtered_results}')

            candidates = []

            for candidate_profile in candidate_profiles:
                similarity = measure_similarity(profiles[t], candidate_profile)
                candidates.append({'name' : candidate_profile['name'], 'similarity' : similarity})

            return filtered_results, candidates
        except Exception as e:
            print(e)

    return [], []


def run_network_formation_experiment(name, num_egonets, egonets_radius, num_simulations, outfile, temperatures, method, num_choices, num_samples, num_nodes_samples, sample_egonets):
    networks = dataloader.load_facebook100(input_dir='datasets/facebook100', name=name, num_egonets=num_egonets, egonets_radius=egonets_radius, sample_egonets=sample_egonets)
    
    saved_scenarios = set()

    if os.path.exists(outfile):
        with open(outfile) as f:
            lines = f.read().splitlines()

            for line in lines:
                scenario = json.loads(line)
                saved_scenarios.add((scenario['name'], scenario['ego'], scenario['simulation'], scenario['temperature'], scenario['num_samples'], scenario['num_choices']))

        exit()

    f = open(outfile, 'a+')

    for ego, G0 in networks.items():
        for i in range(num_simulations):
            for temperature in temperatures:
                if (name, ego, i, temperature, num_samples, num_choices) in saved_scenarios:
                    print(f'Skipping simulation for name={name}, ego={ego}, i={i}, temperature={temperature}, num_choices={num_choices}, num_samples={num_samples}, method={method}')
                    continue
                else:
                    print(f'Running simulation for name={name}, ego={ego}, i={i}, temperature={temperature}, num_choices={num_choices}, num_samples={num_samples}, method={method}')

                    Gs, results, candidates = network_growth(G0, temperature=temperature, method=method, num_choices=num_choices, num_samples=num_samples, num_nodes_samples=num_nodes_samples)

                    temp = {
                        'name' : name,
                        'ego' : ego,
                        'temperature' : temperature,
                        'simulation' : i,
                        'num_choices' : num_choices,
                        'num_samples' : num_samples,
                        'graphs' : [nx.to_dict_of_dicts(G) for G in Gs],
                        'results' : results,
                        'candidates' : candidates,
                    }    

                    f.write(json.dumps(temp) + '\n')            

                if method != 'llm':
                    break

    f.close()

def draw_graph(G, ax, communities=None, palette=None):

    pos = nx.spring_layout(G)

    netgraph.Graph(G, node_layout=pos, node_color='#d35400', node_size=2.5, edge_color='#34495e', edge_width=1, ax=ax)

    ax.set_axis_off()


def generate_regression_table(filename, outfile):

    palette = ['#d35400', '#34495e', '#2980b9', '#e67e22', '#f1c40f', '#7f8c8d', '#27ae60', '#16a085', '#bdc3c7', '#1abc9c', '#2ecc71', '#3498db', '#9b59b6', '#8e44ad', '#ecf0f1']

    with open(filename) as f:
        lines = f.read().splitlines()

    data = []

    for line in lines:
        data.append(json.loads(line))

    feature_names = ['degree', 'common_attributes', 'common_neighbors']

    regression_table_df = []
    
    names = set([d['name'] for d in data])

    for d in data:
        # Gs = []
        # for graph in d['graphs']:
        #     G = nx.from_dict_of_dicts(graph)
        #     Gs.append(G)

        log_likelihoods = {}

        for num_features in range(len(feature_names) + 1):
            for feature_combination in itertools.combinations(feature_names, num_features):
                feature_combination = list(feature_combination)
                theta, _, log_likelihood, standard_errors, _, _ = fit_discrete_choice_model(d['results'], d['candidates'], feature_names=feature_combination, bias=True)

                temp = {
                    'Name' : d["name"],
                    'Ego' : d["ego"],
                    'Temperature' : d["temperature"],
                    'Simulation' : d["simulation"],
                    'Number of Choices' : d["num_choices"],
                    'Number of Samples' : d["num_samples"],
                    'Independent Variable' : feature_combination,
                    'Coefficients' : theta[:-1].tolist(),
                    'Standard Errors' : standard_errors[:-1].tolist(),
                    'Log Likelihood' : log_likelihood,
                }

                log_likelihoods[tuple(sorted(feature_combination))] = log_likelihood
                p_values = np.array([1 - stats.chi2.cdf(2 * (log_likelihood - log_likelihoods[tuple(sorted(feature_combination[:i] + feature_combination[i + 1:]))]), 1) for i in range(len(feature_combination))])

                temp['P-values'] = p_values.tolist()

                regression_table_df.append(temp)


    regression_table_df = pd.DataFrame.from_records(regression_table_df)

    regression_table_df.to_excel(outfile)

        # fig, ax = plt.subplots(1, 3, figsize=(15, 5))
        # fig.suptitle(f'Individual MNL Model (name = {d["name"]}, ego = {d["ego"]}, temperature = {d["temperature"]})')

        # for i, feat_name in enumerate(feature_names):
        #     theta, relative_probabilities, log_likelihood, choices, choice_sets = fit_discrete_choice_model(d['results'], d['candidates'], feature_names=[feat_name], bias=True)
            
        #     thetas_df.append({
        #         'name' : d["name"],
        #         'ego' : d["ego"],
        #         'temperature' : d["temperature"],
        #         'simulation' : d["simulation"],
        #         'num_choices' : d["num_choices"],
        #         'num_samples' : d["num_samples"],
        #         'independent_variable' : feat_name,
        #         'feat_name' : feat_name,
        #         'theta' : theta[0],
        #     })
            
        #     choices = np.array(choices)

        #     slope, intercept, r, p_value, sterr = scipy.stats.linregress(x=choices[:, 0, 0], y=np.log(relative_probabilities), alternative='two-sided')
        #     sns.regplot(x=choices[:, 0, 0], y=np.log(relative_probabilities), ax=ax[i], ci=95, color=palette[i], scatter_kws={'s' : 10, 'alpha' : 0.5},  label = f'y = {slope:.2f}x + {intercept:.2f}, R2 = {r:.2f}, LL = {log_likelihood:.2f}, p = {p_value:.4f}')
        #     ax[i].set_xlabel(feat_name.replace('_', ' ') + ' (log)')
        #     ax[i].set_ylabel('relative probability (log)')
        #     ax[i].legend(fontsize=8)

        # fig.tight_layout()

        # fig, ax = plt.subplots(1, 3, figsize=(15, 5))
        # fig.suptitle(f'Joint MNL Model (name = {d["name"]}, ego = {d["ego"]}, temperature = {d["temperature"]})')

        # theta, relative_probabilities, log_likelihood, choices, choice_sets = fit_discrete_choice_model(d['results'], d['candidates'], feature_names=feature_names, bias=True)
        # choices = np.array(choices)

        # for i, feat_name in enumerate(feature_names):
            
        #     slope, intercept, r, p_value, sterr = scipy.stats.linregress(x=choices[:, i, 0], y=np.log(relative_probabilities), alternative='two-sided')
        #     sns.regplot(x=choices[:, i, 0], y=np.log(relative_probabilities), ax=ax[i], ci=95, color=palette[i], scatter_kws={'s' : 10, 'alpha' : 0.5},  label = f'y = {slope:.2f}x + {intercept:.2f}, R2 = {r:.2f}, LL = {log_likelihood:.2f}')
        #     ax[i].set_xlabel(feat_name.replace('_', ' ') + ' (log)')
        #     ax[i].set_ylabel('relative probability (log)')
        #     ax[i].legend(fontsize=8)

        #     thetas_df.append({
        #         'name' : d["name"],
        #         'ego' : d["ego"],
        #         'temperature' : d["temperature"],
        #         'simulation' : d["simulation"],
        #         'num_choices' : d["num_choices"],
        #         'num_samples' : d["num_samples"],
        #         'independent_variable' : 'all',
        #         'feat_name' : feat_name,
        #         'theta' : theta[i],
        #     })


        # fig.tight_layout()


    # thetas_df = pd.DataFrame.from_records(thetas_df)


    # fig, ax = plt.subplots(len(names), 3, figsize=(30, 10 * len(names)), squeeze=False)

    # fig.supxlabel('Temperature')

    # for i, name in enumerate(names):
    #     for j, feat_name in enumerate(feature_names):
            
    #         sns.lineplot(data=thetas_df.query(f'feat_name == "{feat_name}"'), x='temperature', y='theta', hue='ego', style='independent_variable', ax=ax[i, j], palette=palette[:3], errorbar=('ci', 95), legend='brief', marker='o', markersize=5, linewidth=1, markeredgewidth=1, markeredgecolor='black')

    #         if i == 0:
    #             ax[i, j].set_title(feat_name.replace('_', ' '))

    #         if j == 0:
    #             ax[i, j].set_ylabel('MNL Coefficient')
    #         else:
    #             ax[i, j].set_ylabel('')

    # fig.tight_layout()

def pretty_print_regression_table(filenames, outfile):

    if isinstance(filenames, str):
        filenames = [filenames]

    regression_table_df = pd.concat([pd.read_excel(filename) for filename in filenames])

    regression_table_df = regression_table_df.query('`Independent Variable` != "[]"')

    table_rows_df = []

    ego_row = True

    for i, row in regression_table_df.iterrows():
        temp = {}
        if row['Ego'] == -1:
            ego_row = False
        else:
            temp['Ego'] = row['Ego']
        temp['Temperature'] = row['Temperature']

        independent_variables = ast.literal_eval(row['Independent Variable'])

        if len(independent_variables) ==  3:

            p_values = ast.literal_eval(row['P-values'])
            coefficients = ast.literal_eval(row['Coefficients'])
            standard_errors = ast.literal_eval(row['Standard Errors'])

            for j, feat_name in enumerate(independent_variables):
                stars = '***' if float(p_values[j]) < 0.001 else '**' if float(p_values[j]) < 0.01 else '*' if float(p_values[j]) < 0.05 else ''
                temp[f'{feat_name}'] = f"{float(coefficients[j]):.2f}{stars} ({float(standard_errors[j]):.2f})"

            temp['Log Likelihood'] = f"{row['Log Likelihood']:.2f}"
            temp['AIC'] = f'{2 * (len(independent_variables) + 1) - 2 * row["Log Likelihood"]:.2f}'
          
            table_rows_df.append(temp)


    table_rows_df = pd.DataFrame.from_records(table_rows_df, columns=['Ego'] if ego_row else [] +  ['Temperature', 'degree', 'common_attributes', 'common_neighbors', 'Log Likelihood', 'AIC'])
    table_rows_df = table_rows_df.fillna(' ')

    table_rows_df.to_latex(outfile, index=False, escape=True, column_format='lcccccc')
            

def prepare_discrete_choice_model(results, candidates, bias=True, feature_names=['degree'], log_transform=True):

    choice_sets = []
    choices = []

    for result in results:
        num_choices = len(result)
        choice = np.ones((len(feature_names) + int(bias), num_choices))
        for i, r in enumerate(result):
            for j, feat_name in enumerate(feature_names):
                if log_transform:
                    choice[j, i] = np.log(r['similarity'][feat_name] + 1)
                else:
                    choice[j, i] = r['similarity'][feat_name]

        choices.append(choice)

    for candidate in candidates:
        choice_set = np.ones((len(feature_names) + int(bias), len(candidate)))

        for i, c in enumerate(candidate):
            for j, feat_name in enumerate(feature_names):
                if log_transform:
                    choice_set[j, i] = np.log(c['similarity'][feat_name] + 1)
                else:
                    choice_set[j, i] = c['similarity'][feat_name]

        choice_sets.append(choice_set)

    return choices, choice_sets


def fit_discrete_choice_model(results, candidates, bias=True, feature_names=['degree', 'common_attributes', 'common_neighbors'], log_transform=True):

    choices, choice_sets = prepare_discrete_choice_model(results, candidates, bias=bias, feature_names=feature_names, log_transform=log_transform)

    theta = np.zeros(len(feature_names) + int(bias))

    ll = lambda x: -discrete_choice_model_log_likelihood(x, choice_sets, choices)

    res = scipy.optimize.minimize(ll, x0=theta, method='L-BFGS-B')

    theta = res.x

    log_likelihood = -res.fun

    standard_errors = res.hess_inv.todense().diagonal() ** 0.5

    relative_probabilities = discrete_choice_model_relative_probability(theta, choice_sets, choices)

    return theta, relative_probabilities, log_likelihood, standard_errors, choices, choice_sets


def discrete_choice_model_log_likelihood(theta, choice_sets, choices):

    log_likelihood = 0

    for choice_set, choice in zip(choice_sets, choices):
        choice_set_utilities = np.dot(theta, choice_set)
        Z = np.sum(np.exp(choice_set_utilities))

        num_choices = choice.shape[1]

        for i in range(num_choices):
            choice_utility = np.dot(theta, choice[:, i])
            log_likelihood += choice_utility - np.log(Z)

    return log_likelihood

def discrete_choice_model_relative_probability(theta, choice_sets, choices):
    probabilities = []
    
    for choice_set, choice in zip(choice_sets, choices):
        choice_set_utilities = np.dot(theta, choice_set)
        Z = np.sum(np.exp(choice_set_utilities))

        num_choices = choice.shape[1]

        for i in range(num_choices):
            choice_utility = np.dot(theta, choice[:, i])
            probabilities.append(np.exp(choice_utility) / Z)

    return np.array(probabilities)

In [2]:
# LLM-based
# run_network_formation_experiment(name='Caltech36', num_egonets=3, egonets_radius=2, num_simulations=1, outfile='outputs/combined_model_facebook100_caltech36.jsonl', temperatures=[0.5, 1.0, 1.5], method='llm', num_choices=1, num_samples=20)
# run_network_formation_experiment(name='Caltech36', num_egonets=-1, egonets_radius=-1, num_simulations=1, sample_egonets=False, num_nodes_samples=3000, outfile='outputs/combined_model_facebook100_caltech36_whole.jsonl', temperatures=[0.5, 1.0, 1.5], method='llm', num_choices=1, num_samples=20)
run_network_formation_experiment(name='Swarthmore42', num_egonets=-1, egonets_radius=-1, num_simulations=1, sample_egonets=False, num_nodes_samples=3000, outfile='outputs/combined_model_facebook100_swarthmore42_whole_3.jsonl', temperatures=[1.5], method='llm', num_choices=1, num_samples=20)

Running simulation for name=Swarthmore42, ego=-1, i=0, temperature=1.5, num_choices=1, num_samples=20, method=llm


since Python 3.9 and will be removed in a subsequent version.
  nodes = random.sample(G.nodes(), min(len(G), num_nodes_samples))


Node: 788, Links: [{'name': 788, 'reason': 'We are both students of the same year and major which would help us relate on academics; also sharing many mutual neighbors indicates that we might have similar social circles and interests making a potential friendship more substantive and significant.', 'similarity': {'common_attributes': 4, 'common_neighbors': 31, 'degree': 31}, 'dropped': False}]
Node: 1552, Links: [{'name': 911, 'reason': 'This person is also a student with a commencement year of 2009, similar to my profile, which may help in creating a relatable friendship. Moreover, we share a common neighbor, 645, which could make for an easier connection. The shared context of being in the same current phase of academics should potentially yield common ground for starting conversations and bonding.', 'similarity': {'common_attributes': 2, 'common_neighbors': 1, 'degree': 43}, 'dropped': False}]
Node: 861, Links: [{'name': 1429, 'reason': 'Sharing the same faculty status, year, both a

In [None]:
generate_regression_table('outputs/combined_model_facebook100_swarthmore42_whole_3.jsonl', 'tables/combined_model_facebook100_swarthmore42_whole_3.xlsx')

In [None]:
# summarize_results('outputs/combined_model_facebook100_caltech36_whole.jsonl')

In [None]:
pretty_print_regression_table('tables/combined_model_facebook100_swarthmore42_whole_3.xlsx', 'tables/combined_model_facebook100_swarthmore42_whole_3.tex')

  table_rows_df.to_latex(outfile, index=False, escape=True, column_format='lcccccc')
