In [63]:
import pandas as pd
# pip install python-Levenshtein for fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import re
import numpy as np
from pyvis.network import Network
from pyvis import network as net
import bokeh.io
from pyvis.network import Network
from cytoolz import concat, mapcat
from functools import partial
from itertools import combinations

In [42]:
bokeh.io.output_notebook()


In [3]:
# import the data file you generated & the languages.csv datafile from wals fork
data = pd.read_csv("compiled_data.csv")
# rename the Language column to Name for merging
data = data.rename(columns={'Language': 'Name'})

wals_data = pd.read_csv("languages.csv")


In [4]:
# join by Name column, will populate lang family information if match
merged_df = pd.merge(data, wals_data, on='Name', how='left')


# Estimating typing/spelling errors using Levenshtein distance
- Humans are prone to error and typeos can occur when writing out name. We can use the package, fuzzywuzzy, to try to identify these mistakes. 
- There are types of ratios you can estimate these differences with:


In [5]:
# simple ratio 
fuzz.ratio("this is a test", "this is a test!")


97

In [6]:
# partial ratio 
fuzz.partial_ratio("this is a test", "this is a test!") 

100

Most likely we are going to care about the surname, as there are lots of alternate ways to spell first names (and overall, first names are less identifiable). To do this, we can start by extracting the surname column and make a list of the author names that exist in the merged dataframe we made. 

In [7]:
# return a cleaned list of author names 
r = re.compile("surname")
author_headers = (filter(r.match, data.columns ))
author_list = []
for header in author_headers:
    author_list += data[header].tolist()
    
cleanedList = [x for x in author_list if str(x) != 'nan']

# stylistically clean, but includes duplicates names
len(cleanedList)

273

Now, we can create a function that will iterate through the list of unique names we have. If a unique name is at least 80 percent similar to another name in the list, an alternate spelling will be added to the relevant surname key

In [8]:
def levenshtein(cleanedList):
    unique_names = {} 
    for i in cleanedList:
        if i not in unique_names.keys(): # if it's already in the list, skip it. 
            hold = process.extract(i, unique_names.keys(), limit=5)

            # we have not found a match yet, we're just getting started. 
            found = False
            for k in range(len(hold)):
                if hold[k][1] > 85 and not found:       # also make sure it's not found-- we don't want multiple matches. 
                    unique_names[hold[k][0]].append(i)  # append to the matching location... 
                                                        
                    # we found a match!
                    found = True    
            if not found: 
                # if we didn't find a match, add it to the list. 
                unique_names[i] = [i] 
    return unique_names

authors = levenshtein(cleanedList)


In this case, we can see that setting the threshold to 85 percent captures the "Faghiri/Faghiti" typeo, however, a threshold this low captures authors that were actually two separate ones. In this case, I would probably just recommend manually correcting this one particular entry. 

In [9]:
# manual correcting of typeo
merged_df['surname'] = merged_df['surname'].replace('Faghiti', 'Faghiri')
merged_df

Unnamed: 0,surname,firstname,Institution,Name,social,ID,Macroarea,Latitude,Longitude,Glottocode,...,Family,Subfamily,Genus,GenusIcon,ISO_codes,Samples_100,Samples_200,Country_ID,Source,Parent_ID
0,Sarvasy,Hannah,Western Sydney University,Nungon,(also Alba Tuninetti and other collaborators o...,,,,,,...,,,,,,,,,,
1,Fascinetto-Zago,Karina,Benemérita Universidad Autónoma de Puebla / Le...,Venetian,,,,,,,...,,,,,,,,,,
2,Park,Kihyo,Cornell University,Korean,,kor,Eurasia,37.5,128.0,kore1280,...,Korean,,Korean,,kor,True,True,KR KP,Chang-1996 Cho-1967 Gil-1991 Kim-1972 Kim-1986...,genus-korean
3,Park,Kihyo,Cornell University,Korean,,family-korean,,,,,...,,,,,,,,,,
4,Park,Kihyo,Cornell University,Korean,,genus-korean,,,,,...,,,,cBECC51,,,,,,family-korean
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,Wong,Patrick,Chinese University of Hong Kong,,,,,,,,...,,,,,,,,,,
296,Wray,Samantha,Dartmouth,Tagalog,,tag,Papunesia,15.0,121.0,taga1270,...,Austronesian,,Greater Central Philippine,,tgl,True,True,PH,Aldridge-2004 Bloomfield-1917 De-Guzman-1978 E...,genus-greatercentralphilippine
297,Zhang,Miao,,Changsha Xiang,@Miao_Zhang_dr,,,,,,...,,,,,,,,,,
298,Zhao,Haoran,Leiden University,Chinese,,genus-chinese,,,,,...,,,,c33803F,,,,,,family-sinotibetan


In [12]:
# get clean names for 

def get_cleaner_df(df):
    cleaner_df = df.loc[:, ('surname', 'firstname', 'Name', 'Institution', 'Family', 'social')]
    cleaner_df = cleaner_df.where(pd.notnull(cleaner_df), '')
    cleaner_df = df['Researcher'] = df['firstname'] + ' ' + df['surname']

    return cleaner_df

get_cleaner_df(merged_df)

0               Hannah Sarvasy
1      Karina  Fascinetto-Zago
2                   Kihyo Park
3                   Kihyo Park
4                   Kihyo Park
                ...           
295               Patrick Wong
296              Samantha Wray
297                 Miao Zhang
298                Haoran Zhao
299                        NaN
Length: 300, dtype: object

In [15]:
def get_cleaner_df(df):
    # add the first and last name to one cell for node plotting 
    df['Researcjer'] = df['firstname'] + ' ' + df['surname']
    
    # subset only the columns you want 
    subset_df = df.loc[:, ('surname','firstname','Name','Institution', 'Family', 'social', 'Researcher')]
    subset_df = subset_df.where(pd.notnull(subset_df), '')
    return subset_df
    
researchers_clean = get_cleaner_df(merged_df)


In [60]:
subset_df = researchers_clean.rename(columns={"Institution":"weights", 
                                              "Researcher":'sources', 
                                              'Name':"targets"})

subset_df


Unnamed: 0,surname,firstname,targets,weights,Family,social,sources
0,Sarvasy,Hannah,Nungon,Western Sydney University,,(also Alba Tuninetti and other collaborators o...,Hannah Sarvasy
1,Fascinetto-Zago,Karina,Venetian,Benemérita Universidad Autónoma de Puebla / Le...,,,Karina Fascinetto-Zago
2,Park,Kihyo,Korean,Cornell University,Korean,,Kihyo Park
3,Park,Kihyo,Korean,Cornell University,,,Kihyo Park
4,Park,Kihyo,Korean,Cornell University,,,Kihyo Park
...,...,...,...,...,...,...,...
295,Wong,Patrick,,Chinese University of Hong Kong,,,Patrick Wong
296,Wray,Samantha,Tagalog,Dartmouth,Austronesian,,Samantha Wray
297,Zhang,Miao,Changsha Xiang,,,@Miao_Zhang_dr,Miao Zhang
298,Zhao,Haoran,Chinese,Leiden University,,,Haoran Zhao


In [101]:
# need to generate a df with edge information. 
# This will be to create a pairwise count of every language & researcher 
pairwise_subset = subset_df[['sources', 'targets']]


In [105]:
c = lambda x, k: combinations(x, 2)

weights = pd.DataFrame(pd.value_counts(list(concat
    (mapcat(partial(c, x), range(2, pairwise_subset.shape[1] + 1)) for x in pairwise_subset.values.tolist())
                                                 )))

weights

Unnamed: 0,0
"(, )",7
"(Azler Garcia, Basque)",3
"(Kihyo Park, Korean)",3
"(Sebastian Sauppe, Basque)",3
"(Amaia Munarriz, Basque)",3
...,...
"(Indranil Dutta, Bengali)",1
"(Karen Emmory, Sign Language)",1
"(Rabia Ergin, CTSL (village sign language in Turkey))",1
"(Olga Fedorova, Russian)",1


In [106]:
net = Network(bgcolor='white', height="750px", cdn_resources='remote',width="100%", font_color="white", notebook=True, heading='')

sources = subset_df['sources']
targets = subset_df['targets']
weights = weights[0]
edge_data = zip(sources, targets, weights)


for e in edge_data:
                src = e[0]
                dst = e[1]
                w = e[2]

                net.add_node(src, src, title=src)
                net.add_node(dst, dst, title=dst)
                net.add_edge(src, dst, value=w)
                
neigbor_map = net.get_adj_list()

# add neighbor data to node hover data
#for node in net.nodes:
#                node["title"] += " Neighbors:<br>" + "<br>".join(neighbor_map[node["id"]])
#                node["value"] = len(neighbor_map[node["id"]])


net.show("test.html")

NameError: name 'neighbor_map' is not defined