In [1]:
import pandas as pd
import numpy as np
import sklearn.metrics as mt
from gensim.models import Word2Vec
from scipy import spatial

model = Word2Vec.load('/Users/dakotamurray/Dropbox/SME-dropbox/Archive/Embeddings_20191209/word2vec/2008-2019_word2vec_d200_ws2_model.bin')
meta = pd.read_csv('/Users/dakotamurray/Dropbox/SME-dropbox/Data/Additional/institution_lookup_with_states.txt', sep = "\t")

meta = meta.fillna(-1)
meta = meta.astype({'cwts_org_no': int})
meta = meta.astype({'cwts_org_no': str})

# Limit the metadata to orgs that are actually in the w2v model vocabulary
vocab = list(model.wv.vocab.keys())
#meta = meta.loc[meta.cwts_org_no in vocab]
meta = meta.loc[meta.cwts_org_no.isin(vocab)]

In [17]:
# first, lets do Canada
# limit meta to orgs actually in our data
canada_meta = meta.loc[meta['country_iso_alpha'] == 'CAN']
canada_meta

quebec_meta = canada_meta.loc[canada_meta['region'] == 'Quebec']
rest_canada_meta = canada_meta.loc[canada_meta['region'] != 'Quebec']
france_meta = meta.loc[meta['country_iso_alpha'] == 'FRA']
usa_meta = meta.loc[meta['country_iso_alpha'] == 'USA']

quebec_vec = np.array([model.wv[x] for x in (quebec_meta.cwts_org_no)]).mean(axis = 0)
canada_vec = np.array([model.wv[x] for x in (canada_meta.cwts_org_no)]).mean(axis = 0)
france_vec = np.array([model.wv[x] for x in (france_meta.cwts_org_no)]).mean(axis = 0)
usa_vec = np.array([model.wv[x] for x in (usa_meta.cwts_org_no)]).mean(axis = 0)

print('Distnace between Quebec and Rest of Canada:', spatial.distance.cosine(quebec_vec, canada_vec))
print('Distance between Quebec and France:', spatial.distance.cosine(quebec_vec, france_vec))
print('Distance between Quebec and USA:', spatial.distance.cosine(quebec_vec, usa_vec))

Distnace between Quebec and Rest of Canada: 0.11811363697052002
Distance between Quebec and France: 0.349692165851593
Distance between Quebec and USA: 0.48817723989486694


In [16]:
print(canada_meta.shape)
print(rest_canada_meta.shape)
print(quebec_meta.shape)

(151, 13)
(97, 13)
(54, 13)


In [20]:
mass = meta.loc[meta['region'] == 'Massachusetts']

#umass = mass.loc['University of Mass'.isin(mass['full_name'])]

# First, get all pairwise distances and save to a file
umass = mass.loc[mass['full_name'].str.contains('University of Mass')]
umass = umass.loc[umass.cwts_org_no != '9814']
boston = mass.loc[mass.city == 'Boston']
boston = mass.loc[mass.cwts_org_no != '9814']


umass_vec = np.array([model.wv[x] for x in (umass.cwts_org_no)]).mean(axis = 0)
boston_vec = np.array([model.wv[x] for x in (boston.cwts_org_no)]).mean(axis = 0)
umassboston_vec = model.wv['9814']

print('Distnace between UMass Boston and Rest UMass System:', spatial.distance.cosine(umassboston_vec, umass_vec))
print('Distnae between UMAss Boston and rest of Boston:', spatial.distance.cosine(umassboston_vec, boston_vec))

Distnace between UMass Boston and Rest UMass System: 0.3310713768005371
Distnae between UMAss Boston and rest of Boston: 0.3796660304069519


In [19]:
boston

Unnamed: 0,cwts_org_no,wos_name,full_name,city,country_iso_num_code,latitude,longitude,org_type_code,org_type,country_iso_name,country_iso_alpha,city_country,region
186,1165,UNIV MASSACHUSETTS - AMHERST,University of Massachusetts Amherst,AMHERST,840.0,42.391100,-72.526700,U,University,United States,USA,AMHERST_USA,Massachusetts
187,18174,AMHERST COLL,Amherst College,AMHERST,840.0,42.370833,-72.516944,E,Teaching Organisation,United States,USA,AMHERST_USA,Massachusetts
647,1974,EDITH NOURSE ROGERS MEM VET HOSP,Edith Nourse Rogers Memorial Veteran Hospital,BEDFORD,840.0,42.505278,-71.270833,H,Hospital,United States,USA,BEDFORD_USA,Massachusetts
805,2057,MCLEAN HOSP,McLean Hospital,BELMONT,840.0,42.393658,-71.191075,H,Hospital,United States,USA,BELMONT_USA,Massachusetts
1046,15912,AERODYNE RES INC,Aerodyne Research Inc.,BILLERICA,840.0,42.528531,-71.268652,R,Research organisation,United States,USA,BILLERICA_USA,Massachusetts
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10138,2242,UNIV MASSACHUSETTS MEM HLTH CARE,University of Massachusetts Memorial Health Care,WORCESTER,840.0,42.276006,-71.762094,HG,Hospital Group,United States,USA,WORCESTER_USA,Massachusetts
10139,2243,UMASS MEM MED CTR,University of Massachusetts Memorial Medical C...,WORCESTER,840.0,42.285753,-71.791358,H,Hospital,United States,USA,WORCESTER_USA,Massachusetts
10140,9834,WORCESTER POLYTECH INST,Worcester Polytechnic Institute,WORCESTER,840.0,42.273489,-71.807350,U,University,United States,USA,WORCESTER_USA,Massachusetts
10142,18201,COLL HOLY CROSS,College of the Holy Cross,WORCESTER,840.0,42.239167,-71.808333,E,Teaching Organisation,United States,USA,WORCESTER_USA,Massachusetts


In [5]:
l2_vals = []
counts = []
countries = list(set(meta.country_iso_alpha))
for country in countries:
    # Get all the orgs per country
    orgs = list(meta.loc[meta.country_iso_alpha == country]['cwts_org_no'])
    # Get the mean vector
    mu_vec = np.array([model.wv[x] for x in orgs]).mean(axis = 0)
    
    l2_vals.append(np.linalg.norm(mu_vec))
    counts.append(len(orgs))
    

In [23]:
# construct a dataframe from the w2v factors
df = pd.DataFrame({
    'country': countries,
    'l2norm': l2_vals,
    'count': counts,
})

In [26]:
df.to_csv("~/Desktop/l2norm_by_country.csv")