In [1]:
import numpy as np
import pandas as pd
from collections import Counter
import sys

In [2]:
data = pd.read_csv('data/sim_data_120_19Apr2020.csv')

data = data.replace('\n','', regex=True)
len(data.index)

120

In [3]:
def inverted_index(data, col_name, val_type):
    """
    Create inverted index based on pandas data
    
    Args:
        data [pd.DataFrame]: expect user tables
        col_name [str]: expect one of the cols in data.columns
        val_type [str]: 'MULT' / 'SINGLE', describe the data structure of a cell
    """
    # Assertion
    assert isinstance(data, pd.DataFrame)
    assert col_name in data.columns
    assert val_type in ['MULT', 'SINGLE']
    
    # MULTIPLE eg, language: ['Spanish' 'Cantonese'], it contains mult. values
    if val_type == 'MULT':
        ########
        # FIXME
        ########        
        # basically this will flatten the list of list in pandas settings.
        # it looks complicated because pandas kind of change the format, fuck it
        # if you have better / clean solution, welcome to fix it
        lst = sum([item.split('\' \'' or '\n ') for item in [item.strip('[]') for item in data[col_name]]], [])
        lst = [item.replace('\'', '') for item in lst]
        
        key_lst = list(Counter(lst).keys())
        result = dict()
        
        for key in key_lst:
            result[key] = set([i for i, val in enumerate([key in row_list for row_list in data[col_name]]) if val])
        
        return result
    
    # SINGLE eg, industry: Management Consulting, it is only single value
    if val_type == 'SINGLE':
        # logic is simpler since it doesn't have any column
        lst = data[col_name]
        
        key_lst = list(Counter(lst).keys())
        result = dict()
        
        for key in key_lst:
            result[key] = set([i for i, val in enumerate([key == row_item for row_item in data[col_name]]) if val])

        return result


In [4]:
def combined_inverted_index(data, ii_input):
    """
    result dict of inverted index
    Args:
        data [pd.DataFrame]: expect pandas.DataFrame
        ii_input [list of tuple]: inverted index input
            for each tuple, 
            tuple[0] is col name in data, 
            tuple[1] is val_type as specified in def:inverted_index
    """
    # Assert the first the first item in tuple belongs to data.columns
    assert np.all([item in data.columns for item in [item[0] for item in ii_input]])
    assert np.all([item in ['SINGLE', 'MULT'] for item in [item[1] for item in ii_input]])
    
    result = dict()
    for item in ii_input:
        result[item[0]] = inverted_index(data, *item)
    return result

ii_input = [
    ('school', 'MULT'),
    ('grad_yr', 'MULT'),
    ('language', 'MULT'),
    ('subject', 'SINGLE'),    
    ('zip', 'SINGLE'),
    ('industry', 'SINGLE'),
    ('title', 'SINGLE'),
    ('country', 'SINGLE'),    
]

# combined_inverted_index(data, ii_input)

In [5]:
combined_inverted_data = combined_inverted_index(data, ii_input)

In [6]:
combined_inverted_data

{'school': {'South China Normal University': {0,
   5,
   13,
   28,
   53,
   56,
   70,
   71,
   76,
   77,
   80,
   87,
   90,
   95,
   102,
   106,
   109},
  'University of California Berkeley': {1,
   9,
   15,
   27,
   40,
   48,
   67,
   82,
   94,
   98,
   100,
   101,
   104},
  'University of Chicago': {2,
   6,
   9,
   12,
   20,
   25,
   34,
   44,
   45,
   57,
   59,
   62,
   63,
   68,
   84,
   88,
   105,
   111,
   116},
  'University of Hong Kong': {3,
   15,
   23,
   24,
   26,
   29,
   31,
   61,
   64,
   69,
   72,
   73,
   77,
   80,
   83,
   84,
   91,
   109,
   110,
   114,
   119},
  'University of Michigan Ann Arbor': {4,
   18,
   55,
   62,
   64,
   107,
   112,
   113,
   116,
   117},
  'Zhejiang University': {5,
   14,
   16,
   17,
   19,
   29,
   33,
   39,
   43,
   51,
   65,
   66,
   86,
   97,
   99},
  'Southwestern University of Finance and Economics': {7,
   8,
   11,
   17,
   23,
   33,
   48,
   53,
   54,
   57,
   71,
   

In [24]:
top_level_group = "school"
min_group_size = 5
max_group_size = 7

top_level_data = combined_inverted_data[top_level_group]
ids_already_in_group = set()

# index with school removed
ii_inputs = [
    ('grad_yr', 'MULT'),
    ('language', 'MULT'),
    ('subject', 'SINGLE'),    
    ('zip', 'SINGLE'),
    ('industry', 'SINGLE'),
    ('title', 'SINGLE'),
    ('country', 'SINGLE'),    
]

# 
groups = []

def add_ids_to_taken(ids):
    ids_already_in_group.update(ids)

def create_group(ids, prefix):
    add_ids_to_taken(ids)
    unique_attributes = ", ".join(prefix)
    print("New group: [" + str(unique_attributes) + "] ids = " + str(ids))
    _dict = {}
    _dict[unique_attributes] = ids
    groups.append(_dict)
    

# Creates a inverted index based on the ids passed here and the ii_input.
def create_new_inverted_index(ids, ii_input):
    try:
        subgroup_df = pd.DataFrame()
        for id in ids:
            subgroup_df = subgroup_df.append(data.iloc[id])

        subgroup_inverted_index = combined_inverted_index(subgroup_df, ii_input)
        return subgroup_inverted_index
    except:
        e = sys.exc_info()[0]
        print("Exception: " + str(e) + "\n" + str(ii_input))
    

def get_values_not_in_group(ids):
    new_ids = []
    for id in ids:
        if id not in ids_already_in_group:
            new_ids.append(id)
    return new_ids

# Get all the inverted indexes in the top level data
for key, values in top_level_data.items():
    
    # For each top level index create secondary inverted indexes and create a group out of ids that are 
    # with in min_group_size and max_group_size
    try:
        for ii_input in ii_inputs:
            print("Generating a new Inverted Index based on %s"%(ii_input[0]))
            current_ids = get_values_not_in_group(values)
            new_inverted_index = create_new_inverted_index(current_ids, [ii_input])
            
            # This parses the new inverted index based on ii_input
            for _, sub_values in new_inverted_index.items():
                
                # This is the 2nd level data which contains the actual attributes like school name, 
                # language name. Parsing that for creating the group.
                for sub_key, sub_ids in sub_values.items():
                    # Figure out all the ids that are still not in a group for this attribute i.e sub_key.
                    get_remaning_subids = get_values_not_in_group(sub_ids)
                    
                    # If the remianings sub ids are within acceptable group size form a group else
                    # break it down into smaller groups
                    if len(get_remaning_subids) > min_group_size and len(get_remaning_subids) < max_group_size:
                        create_group(get_remaning_subids, [key, sub_key])
                    elif len(get_remaning_subids) > max_group_size:
                        while len(get_remaning_subids) < min_group_size:
                            create_group(get_remaning_subids[0:min_group_size], [key, sub_key])
                            get_remaning_subids = get_remaning_subids[min_group_size:]
                    else:
                        print("Not enough ids to form a group %s, %s; ids count left: %d"%(key, sub_key, len(get_remaning_subids)))

        
        # exhausted all ii_inputs, check how many keys are left. 
        # These will form a group with just top level group
        current_ids = get_values_not_in_group(values)
        while len(current_ids) > min_group_size:
            create_group(current_ids[0:min_group_size], [key])
            current_ids = current_ids[min_group_size:]

        # corner case when there are some ids left less then min_group_size.
        # just form a group with them.
        create_group(current_ids[0:min_group_size], [key])
        
        # Ensure there aren't any more ids.
        current_ids = get_values_not_in_group(values)
        assert len(current_ids) is 0
    except:
        e = sys.exc_info()[0]
        print("Exception in top level parsing: " + str(e))


Generating a new Inverted Index based on grad_yr
New group: [South China Normal University, 2020] ids = [0, 5, 6, 13, 14, 15]
Not enough ids to form a group South China Normal University, 2018 2019; ids count left: 1
Not enough ids to form a group South China Normal University, 2011; ids count left: 1
Not enough ids to form a group South China Normal University, 2016 2018; ids count left: 1
Not enough ids to form a group South China Normal University, 2017; ids count left: 1
Not enough ids to form a group South China Normal University, 2017 2020; ids count left: 0
Not enough ids to form a group South China Normal University, 2011 2020; ids count left: 0
Not enough ids to form a group South China Normal University, 2015; ids count left: 4
Not enough ids to form a group South China Normal University, 2015 2010; ids count left: 1
Not enough ids to form a group South China Normal University, 2012 2015; ids count left: 1
Not enough ids to form a group South China Normal University, 2016 201

Not enough ids to form a group University of California Berkeley, Japan; ids count left: 0
Not enough ids to form a group University of California Berkeley, UK; ids count left: 0
Not enough ids to form a group University of California Berkeley, France; ids count left: 0
Not enough ids to form a group University of California Berkeley, India; ids count left: 3
Not enough ids to form a group University of California Berkeley, United States; ids count left: 1
New group: [University of California Berkeley] ids = [98, 67, 100, 101, 40]
New group: [University of California Berkeley] ids = [9, 104, 48, 82, 27]
New group: [University of California Berkeley] ids = [94]
Generating a new Inverted Index based on grad_yr
Not enough ids to form a group University of Chicago, 2011 2012; ids count left: 0
Not enough ids to form a group University of Chicago, 2018 2020; ids count left: 0
Not enough ids to form a group University of Chicago, 2018 2012; ids count left: 0
Not enough ids to form a group Un

Not enough ids to form a group University of Hong Kong, 94104.0; ids count left: 0
Not enough ids to form a group University of Hong Kong, 94105.0; ids count left: 0
Not enough ids to form a group University of Hong Kong, 94110.0; ids count left: 1
Not enough ids to form a group University of Hong Kong, 94107.0; ids count left: 0
Not enough ids to form a group University of Hong Kong, 94103.0; ids count left: 0
Not enough ids to form a group University of Hong Kong, 94101.0; ids count left: 1
Not enough ids to form a group University of Hong Kong, 94109.0; ids count left: 1
Not enough ids to form a group University of Hong Kong, 94102.0; ids count left: 0
Not enough ids to form a group University of Hong Kong, 94108.0; ids count left: 0
Generating a new Inverted Index based on industry
Not enough ids to form a group University of Hong Kong, Computer Software; ids count left: 0
Not enough ids to form a group University of Hong Kong, Accounting; ids count left: 0
Not enough ids to form a

Not enough ids to form a group Zhejiang University, Higher Education; ids count left: 0
Not enough ids to form a group Zhejiang University, Banking; ids count left: 1
Not enough ids to form a group Zhejiang University, Management Consulting; ids count left: 1
Not enough ids to form a group Zhejiang University, Financial Services; ids count left: 0
Not enough ids to form a group Zhejiang University, Accounting; ids count left: 0
Not enough ids to form a group Zhejiang University, Design; ids count left: 0
Not enough ids to form a group Zhejiang University, Investment Banking/Venture; ids count left: 0
Generating a new Inverted Index based on title
Not enough ids to form a group Zhejiang University, Data Scientist; ids count left: 0
Not enough ids to form a group Zhejiang University, Researcher; ids count left: 0
Not enough ids to form a group Zhejiang University, Product Manager; ids count left: 0
Not enough ids to form a group Zhejiang University, UX Designer; ids count left: 1
Not eno

Not enough ids to form a group Stanford University, 94104.0; ids count left: 0
Not enough ids to form a group Stanford University, 94107.0; ids count left: 0
Not enough ids to form a group Stanford University, 94110.0; ids count left: 0
Not enough ids to form a group Stanford University, 94103.0; ids count left: 0
Generating a new Inverted Index based on industry
Not enough ids to form a group Stanford University, Accounting; ids count left: 0
Not enough ids to form a group Stanford University, Design; ids count left: 0
Not enough ids to form a group Stanford University, Financial Services; ids count left: 0
Not enough ids to form a group Stanford University, Computer Software; ids count left: 0
Not enough ids to form a group Stanford University, Investment Banking/Venture; ids count left: 0
Generating a new Inverted Index based on title
Not enough ids to form a group Stanford University, Product Manager; ids count left: 0
Not enough ids to form a group Stanford University, Software En

Not enough ids to form a group University of California Los Angeles, United States; ids count left: 0
New group: [University of California Los Angeles] ids = [108, 47, 49, 115, 52]
New group: [University of California Los Angeles] ids = [89]


In [25]:
groups

[{'South China Normal University, 2020': [0, 5, 6, 13, 14, 15]},
 {'South China Normal University, Spanish': [1, 2, 4, 10, 11, 12]},
 {'South China Normal University': [70, 71, 102, 106, 76]},
 {'South China Normal University': [77, 109, 80, 53, 87]},
 {'South China Normal University': [56, 90, 28, 95]},
 {'University of California Berkeley': [98, 67, 100, 101, 40]},
 {'University of California Berkeley': [9, 104, 48, 82, 27]},
 {'University of California Berkeley': [94]},
 {'University of Chicago': [34, 68, 105, 44, 45]},
 {'University of Chicago': [111, 20, 84, 116, 88]},
 {'University of Chicago': [25, 59, 63, 62, 57]},
 {'University of Hong Kong': [3, 23, 24, 26, 29]},
 {'University of Hong Kong': [31, 61, 64, 69, 72]},
 {'University of Hong Kong': [73, 83, 91, 110, 114]},
 {'University of Hong Kong': [119]},
 {'University of Michigan Ann Arbor': [107, 112, 113, 18, 117]},
 {'University of Michigan Ann Arbor': [55]},
 {'Zhejiang University': [33, 65, 66, 97, 99]},
 {'Zhejiang Unive