In [1]:
import pandas as pd

In [2]:
# Load the sample user table.  `DataFrame.from_csv` was deprecated in pandas
# 0.21 and removed in 1.0; `read_csv` with `index_col=0, parse_dates=True`
# reproduces the defaults that `from_csv` applied (first column as the index,
# date-like index values parsed).
df = pd.read_csv("sample_csv_files/sample_users_swarm_input.csv", index_col=0, parse_dates=True)

In [4]:
# List every column name of the loaded frame, one per line.
for column_name in df.columns:
    print(column_name)

block
cond
team
requested_avatar
age
gender
education
college_major
occupation
team_lead
country_reside
country_reside_years
country_longest
fluent_english
languages_primary
languages_2
languages_3
languages_4
languages_5
languages_6
languages_7
enjoy_logic_probs
enjoy_num_probs
expertise_math
expertise_quant_model
expertise_stats
expertise_prob
expertise_bayes_net
expertise_programming
expertise_exp_design
expertise_risk_analysis
expertise_forecasting
expertise_dec_theory
expertise_game_theory
expertise_sats
expertise_arg_map
expertise_inf_logic
expertise_sys_think
expertise_image_analysis
expertise_link_analysis
expertise_graphic_design
expertise_tech_writing
matrix_1
matrix_2
matrix_3
matrix_4
matrix_5
matrix_6
matrix_7
matrix_8
matrix_9
matrix_10
matrix_11
matrix_12
matrix_13
matrix_14
matrix_15
matrix_16
matrix_17
matrix_18
matrix_19
matrix_20
score_matrix
prob_reas_1
prob_reas_2
prob_reas_3
prob_reas_4
prob_reas_5
prob_reas_6
prob_reas_7
prob_reas_8
prob_reas_9
prob_reas_10
prob_

In [5]:
# Distinct values that occur in the `languages_primary` column.
df["languages_primary"].unique()

array(['French', 'Spanish', 'German', 'English'], dtype=object)

# Distance measures

- Mahalanobis-based methods
    - Intro: [Mahalanobis distance](https://en.wikipedia.org/wiki/Mahalanobis_distance) is "a dissimilarity measure between two random vectors x and y of the same distribution"
    - [A generalized Mahalanobis distance for mixed data](https://people.ucalgary.ca/~adeleon/JMVA_mahalanobis.pdf)
        - I don't think this will scale well when there are lots of discrete variables
    - [Distance functions for categorical and mixed variables](https://www-sciencedirect-com.ezp.lib.unimelb.edu.au/science/article/pii/S0167865508000524)
    - [Distance Functions for Categorical and Mixed Variables](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.139.5738&rep=rep1&type=pdf)
    - [Estimating the Mahalanobis Distance from Mixed Continuous and Discrete Data](http://onlinelibrary.wiley.com/doi/10.1111/j.0006-341X.2000.00394.x/abstract)
    - [Generalization of the Mahalanobis Distance in the Mixed Case](http://www.sciencedirect.com/science/article/pii/S0047259X85710408)
- [Chi square and categorical distance (see last few pages)](http://84.89.132.1/~michael/stanford/maeb4.pdf)
- [Improved Heterogeneous Distance Functions](https://arxiv.org/pdf/cs/9701101.pdf)
- [Gower similarity](https://stats.stackexchange.com/questions/15287/hierarchical-clustering-with-mixed-type-data-what-distance-similarity-to-use/15313#15313). See also [this explanation](https://dpmartin42.github.io/blogposts/r/cluster-mixed-types#calculating-distance).
- [Clustering mixed data](http://onlinelibrary.wiley.com.ezp.lib.unimelb.edu.au/doi/10.1002/widm.33/full)
- [Informational distances and related statistics in mixed continuous and categorical variables](https://www-sciencedirect-com.ezp.lib.unimelb.edu.au/science/article/pii/S0378375898001207)
- [Jaccard, simple matching, normalized ranks, etc.](https://stat.ethz.ch/education/semesters/ss2012/ams/slides/v4.2.pdf)

In [6]:
# Load the data dictionary.  `DataFrame.from_csv` was deprecated in pandas
# 0.21 and removed in 1.0; `read_csv` with `index_col=0, parse_dates=True`
# reproduces the defaults that `from_csv` applied, so the first CSV column
# still becomes the index.
dd = pd.read_csv("CREATE_Ind.Diffs.Pilot_Data.Dictionary.csv", index_col=0, parse_dates=True)

In [7]:
# Index labels of the data dictionary as a NumPy array; these labels are used
# further down as the column names of the synthetic frame.
dd.index.values

array(['age', 'gender', 'education', 'collegeMajor_1_TEXT',
       'collegeMajor_2_TEXT', 'collegeMinor_1_TEXT', 'collegeMinor_2_TEXT',
       'occupation', 'englishProficency', 'otherLangProficiency_1',
       'otherLangProficiency_1_TEXT', 'otherLangProficiency_2',
       'otherLangProficiency_2_TEXT', 'otherLangProficiency_3',
       'otherLangProficiency_3_TEXT', 'otherLangProficiency_4',
       'otherLangProficiency_4_TEXT', 'otherLangProficiency_5',
       'otherLangProficiency_5_TEXT', 'otherLangProficiency_6',
       'otherLangProficiency_6_TEXT', 'enjoy_logicProbs', 'enjoy_numProbs',
       'willing_teamLead', 'expertise_1', 'expertise_2', 'expertise_3',
       'expertise_4', 'expertise_5', 'expertise_6', 'expertise_7',
       'expertise_8', 'expertise_9', 'expertise_10', 'expertise_11',
       'expertise_12', 'expertise_13', 'expertise_14', 'expertise_15',
       'expertise_16', 'expertise_17', 'expertise_18', 'expertise_19',
       'probReas_1', 'probReas_2', 'probReas_3', '

In [8]:
# Show the `enjoy_logic_probs` column as a Series keyed by the frame's index.
df.loc[:, "enjoy_logic_probs"]

user_id
1001    0
1003    0
1013    0
1024    1
1042    1
1044    0
1048    1
1049    1
1050    1
1059    0
1062    0
1063    1
1065    0
1080    0
1093    1
1097    1
1099    1
1106    0
1111    1
1112    1
1113    0
1120    0
1122    1
1129    0
1130    1
1134    0
1135    1
1136    0
1137    1
1149    1
       ..
4453    0
4462    0
4465    1
4467    1
4474    1
4476    0
4477    1
4478    1
4479    1
4483    0
4484    1
4488    0
4489    0
4493    1
4496    0
4497    0
4498    1
4503    0
4504    1
4505    0
4506    0
4507    1
4508    1
4512    1
4516    1
4518    1
4522    0
4534    0
4546    1
4547    1
Name: enjoy_logic_probs, Length: 780, dtype: int64

In [9]:
# Column names of the loaded frame as a NumPy array.
df.columns.values

array(['block', 'cond', 'team', 'requested_avatar', 'age', 'gender',
       'education', 'college_major', 'occupation', 'team_lead',
       'country_reside', 'country_reside_years', 'country_longest',
       'fluent_english', 'languages_primary', 'languages_2', 'languages_3',
       'languages_4', 'languages_5', 'languages_6', 'languages_7',
       'enjoy_logic_probs', 'enjoy_num_probs', 'expertise_math',
       'expertise_quant_model', 'expertise_stats', 'expertise_prob',
       'expertise_bayes_net', 'expertise_programming',
       'expertise_exp_design', 'expertise_risk_analysis',
       'expertise_forecasting', 'expertise_dec_theory',
       'expertise_game_theory', 'expertise_sats', 'expertise_arg_map',
       'expertise_inf_logic', 'expertise_sys_think',
       'expertise_image_analysis', 'expertise_link_analysis',
       'expertise_graphic_design', 'expertise_tech_writing', 'matrix_1',
       'matrix_2', 'matrix_3', 'matrix_4', 'matrix_5', 'matrix_6',
       'matrix_7', 'matrix_

In [10]:
import pandas as pd
import numpy as np
import random  # no longer used by this cell; kept in case other cells rely on it

# Number of synthetic participants to generate (previously the literal 700
# repeated on every line).
n_rows = 700

# Seed NumPy's global generator so that Restart & Run All reproduces the same
# synthetic sample.  All draws below go through NumPy, so one seed suffices.
np.random.seed(0)

# create an empty df with one row per participant and one column per entry of
# the data dictionary's index
index = range(n_rows)
newdf = pd.DataFrame(index=index, columns=dd.index.values)

# sample gender codes 1, 2, 3 with probabilities 49% / 49% / 2%
# (male / female / other, in the order given by the original note).
# A direct categorical draw replaces the one-hot multinomial + np.where trick
# and yields the same distribution.
newdf["gender"] = np.random.choice([1, 2, 3], size=n_rows, p=[.49, .49, .02])

# sample age: Binomial(50, 0.5) plus a uniform integer offset in [7, 30].
# np.random.randint excludes its upper bound, hence 31.
newdf["age"] = np.random.binomial(50, .5, size=n_rows) + np.random.randint(7, 31, size=n_rows)

In [11]:
# 1 = high school; 2 = some college; 3 = associate's degree; 
# 4 = bachelor's degree; 5 = master's degree; 6 = professional degree or doctorate

# if age < 22, p(3) = 0, p(4)=0, p(5)=0, p(6)=0, p(1)=.4, p(2)=.6


In [12]:
# Largest sampled age in the synthetic frame.
newdf["age"].max()

63

In [13]:
# Dimensions of the synthetic frame as (rows, columns).
newdf.shape

(700, 411)

In [14]:
# Largest value anywhere in the frame: take the maximum of each column, then
# the maximum of those per-column results.
newdf.max().max()

63.0

In [83]:
# Show only the two columns that have been populated so far.
newdf.loc[:, ["age", "gender"]]

Unnamed: 0,age,gender
0,49,2
1,50,2
2,44,1
3,40,2
4,52,1
5,45,1
6,50,2
7,36,2
8,51,2
9,39,1


In [15]:

# Build a 700-row frame of independent standard-normal draws, one column per
# feature.  The original assigned "X1" twice and never filled "X2" or "X3",
# leaving those two columns entirely NaN; drawing a (700, 3) block at once
# populates all three columns.
index = range(700)
fakedf = pd.DataFrame(np.random.randn(700, 3), index=index, columns=["X1", "X2", "X3"])

In [18]:
# Covariance matrix for three unit-variance features with pairwise
# covariances 0.2, 0.3 and 0.5.
sigma = [
    [1., .2, .3],
    [.2, 1., .5],
    [.3, .5, 1.],
]

# Draw 700 zero-mean samples with that covariance and wrap them in a frame
# (columns get the default integer labels 0, 1, 2).
fakedata = np.random.multivariate_normal(mean=[0, 0, 0], cov=sigma, size=700)
fakedf = pd.DataFrame(data=fakedata)

In [23]:
# Display the simulated feature frame.
fakedf

Unnamed: 0,0,1,2
0,0.839400,-1.177166,-0.991556
1,0.811498,1.446566,2.124678
2,-0.068438,-0.714303,-0.998183
3,0.618039,0.161671,-0.959155
4,-0.726518,1.752258,1.040503
5,-1.594221,0.923515,1.913753
6,-1.894745,-0.150506,-0.568040
7,0.613229,1.006807,0.531612
8,-1.824271,-1.644277,-1.578098
9,-0.544829,-0.047636,-0.831043


In [26]:
import sklearn.metrics
# Full pairwise Euclidean distance matrix between the rows of the simulated
# feature frame.
test_dist = sklearn.metrics.euclidean_distances(X=fakedf)


In [38]:
# Instance file layout.
# The first line has the following fields separated by spaces:
# M: Integer indicating the number of elements
# G: Integer indicating the number of groups
# Group Type: The value can be "ss" or "ds" and represents "same size group" or "different size group"
# Group limits: The last numbers of the line correspond to the lower and upper limits of each group
# Every following line is "i j distance" for one pair of elements with i < j.

# Derive the element count from the distance matrix instead of hard-coding
# 700, and build the header from its parts so it cannot drift out of sync.
n_elements = test_dist.shape[0]
n_groups = 20
group_limits = " 30 37"  # lower and upper size limit, repeated once per group

# Open with 'w', not 'a': in append mode every re-run of this cell added a
# second header and a second copy of all pairs to the same file, producing an
# invalid instance.
with open('test_instance.txt', 'w') as the_file:
    the_file.write(str(n_elements) + " " + str(n_groups) + " ss" + group_limits * n_groups + "\n")
    for i in range(n_elements):
        for j in range(i + 1, n_elements):
            the_file.write(str(i) + " " + str(j) + " " + str(test_dist[i, j]) + "\n")

In [84]:
# Rebind `df` to the simulated feature frame.  This is an alias, not a copy:
# any column later added to `df` also appears on `fakedf`.
df = fakedf

In [None]:
import itertools
import subprocess

import scipy.spatial

# Build an instance file from pairwise Manhattan distances between the rows of
# `df`, run the external Java solver on it, and store the resulting group
# assignment in df["team"].

n = df.shape[0]
num_teams = 20
lims = " 30 37"  # lower and upper group-size limit, repeated once per team
instance_filename = 'test_instance_manhattan.txt'
solver_filename = "test_output_manhattan_2.txt"
solution_prefix = 'Solution: ['

# Use every column except "team" as a feature.  This cell writes df["team"]
# at the end, so without the filter a re-run would feed the previous
# assignment into the distances.
feature_columns = [col for col in df.columns if col != "team"]
features = np.asarray(df[feature_columns], dtype=float)

# Condensed pairwise Manhattan distances, ordered (0,1), (0,2), ..., (n-2,n-1).
# This replaces an O(n^2) Python loop of df.loc[i] label lookups, which was
# slow and only correct when the index labels happened to be 0..n-1.
if n > 1:
    pair_dists = scipy.spatial.distance.pdist(features, metric="cityblock")
else:
    pair_dists = []

# write distances to file
# Open with 'w', not 'a': appending on a re-run would leave two instances
# concatenated in the same file.
with open(instance_filename, 'w') as the_file:
    the_file.write(str(n) + " " + str(num_teams) + " ss" + str(lims) * num_teams + "\n")
    # itertools.combinations yields (i, j) with i < j in the same order as the
    # condensed distance vector.
    for (i, j), mdist in zip(itertools.combinations(range(n), 2), pair_dists):
        the_file.write(str(i) + " " + str(j) + " " + str(mdist) + "\n")

# run solver on instance file, write data to solver file
# An argument list avoids shell interpretation of the file names, and
# check_call raises CalledProcessError on a non-zero exit status instead of
# silently continuing to parse stale output.
# NOTE(review): the meaning of the "SO" and "60000" arguments is defined by
# the solver jar and is not visible here - confirm against its documentation.
with open(solver_filename, 'w') as solver_out:
    subprocess.check_call(
        ["java", "-jar", "mdgp_jors_2011.jar", "SO", instance_filename, "60000"],
        stdout=solver_out,
    )

# read solution from file, removing whitespace characters like `\n` at the
# end of each line
with open(solver_filename) as f:
    content = [line.strip() for line in f]

# identify line where solution is written
solution_lines = [line for line in content if line.startswith(solution_prefix)]
if not solution_lines:
    raise ValueError(
        "no line starting with %r found in %s" % (solution_prefix, solver_filename)
    )

# turn into list of ints: drop the prefix and the closing bracket, then split
solution = [int(x) for x in solution_lines[0][len(solution_prefix):-1].split(", ")]
if len(solution) != n:
    raise ValueError(
        "solver returned %d assignments for %d rows" % (len(solution), n)
    )

# add to dataframe of initial data
df["team"] = solution

In [83]:
# Display the feature frame together with the solver's "team" assignment.
df

Unnamed: 0,0,1,2,team
0,0.839400,-1.177166,-0.991556,16
1,0.811498,1.446566,2.124678,15
2,-0.068438,-0.714303,-0.998183,10
3,0.618039,0.161671,-0.959155,14
4,-0.726518,1.752258,1.040503,0
5,-1.594221,0.923515,1.913753,6
6,-1.894745,-0.150506,-0.568040,1
7,0.613229,1.006807,0.531612,15
8,-1.824271,-1.644277,-1.578098,5
9,-0.544829,-0.047636,-0.831043,7
