This notebook takes the raw author strings and parses them into first names, initials and last names

It then saves the results to `author_string_names.csv`.

The functions used are found in `authorUtils.py` and the regular expressions in `authorRe.py`.

In [1]:
import psycopg2
import pandas as pd

import unicodedata

import re

from authorRe import dmatch
import authorUtils as util
import sqlUtils as q

from pprint import pprint  # pretty-printery

from IPython.display import clear_output 

# import prefix list and known error list

In [2]:
def _read_first_column(csv_path):
    """Return column 0 of a header-less, ISO-8859-1 encoded CSV file as a plain list."""
    table = pd.read_csv(csv_path, header=None, encoding='ISO-8859-1')
    return list(table[0])


# Lookup lists passed to util.split_name further down the notebook.
prefix_list = _read_first_column('name_prefixes.csv')
known_error_list = _read_first_column('error_match.csv')

# split one raw name

In [3]:
# Worked example: normalise a single raw author string, split it, and show input next to output.
raw_name = 'van mala'
norm = util.normalise_name(raw_name)

# (normalised name, raw name, journal format) -- no journal format is supplied here.
intuple = (norm, raw_name, None)
outtuple = util.split_name(*intuple, prefix_list, known_error_list)

print([intuple, outtuple])

[('van mala', 'van mala', None), ('van', 'v', 'mala', 'only_names')]


# Query the database

In [4]:
## query for name, author_string_id, paper_id, journal_id, journal_name

In [5]:
# One result row per author_paper link: the author string and its id, the linked paper id,
# and the id and name of the journal that paper belongs to.
# All joins are INNER, so author strings without a linked paper (and papers without a
# journal row) do not appear in the result.
query = """
SELECT author.name, author.id, author_paper.paper_id, paper.journal_id, journal.name as journal_name
FROM author 
INNER JOIN author_paper ON (author.id = author_paper.author_id)
INNER JOIN paper ON (author_paper.paper_id = paper.id)
INNER JOIN journal ON (paper.journal_id = journal.id)
"""

# NOTE(review): q.query presumably reads the connection settings from sqlConfig.json and
# returns a pandas DataFrame (the cells below call len() and .head() on it) -- confirm in sqlUtils.py.
df = q.query(query, 'sqlConfig.json')

In [6]:

# Number of rows returned by the query (one per author_paper link).
# NOTE(review): total_rows is not referenced again in the live cells below.
total_rows = len(df)
# Preview the first rows of the query result.
df.head()


Unnamed: 0,name,id,paper_id,journal_id,journal_name
0,Jun Li,105,18,1,Journal of the American Chemical Society
1,Feng-Ming Zhang,306,55,1,Journal of the American Chemical Society
2,Ya-Qian Lan,313,55,1,Journal of the American Chemical Society
3,Zhonglin Wei,522,95,2,Journal of Organic Chemistry
4,Changhui Lu,1479,264,8,Chemical Science


# Rename author string id

In [None]:
# The query returns author.id as a bare `id` column; give it an explicit name so it cannot be
# confused with paper_id / journal_id.
# Only the column label is renamed: the row index is left untouched (passing index=str would
# cast every index label to a string), and the result is rebound rather than mutated in place
# so re-running this cell is harmless.
df = df.rename(columns={'id': 'author_string_id'})

# Normalise the names

In [None]:
# Normalise the author strings held in the 'name' column.
# NOTE(review): split_names_to_csv below reads 'norm' and 'raw_name' columns, so this helper
# presumably adds both -- confirm in authorUtils.py.
df = util.normalise_df_names(df, 'name')
# Preview the frame after normalisation.
df.head()

# Test the journals to get typical journal format

In [None]:
# Build a per-journal table from the author rows using the regex dictionary `dmatch`.
# The next cell relies on it having 'journal_id' and 'journal_format' columns.
# NOTE(review): how the format of a journal is decided lives in authorUtils.py -- not visible here.
journal_formats_df = util.get_journal_formats_df(df, dmatch)

In [None]:
# Report how many journals were inspected, then how many journals fall under each format.
journal_count = len(journal_formats_df)
journals_per_format = journal_formats_df.groupby('journal_format').journal_id.count()

print('{} journals'.format(journal_count))
print()
print(journals_per_format)



# Parse all names

In [None]:
# Lookup lists used by util.split_name: the first column of each header-less, ISO-8859-1 CSV.
# NOTE(review): these two statements repeat the load in the "import prefix list and known error
# list" cell near the top of the notebook; re-reading is only needed if the CSV files were
# edited in between.
prefix_list = list(pd.read_csv('name_prefixes.csv', header=None, encoding = 'ISO-8859-1')[0]) 
known_error_list = list(pd.read_csv('error_match.csv', header=None, encoding = 'ISO-8859-1')[0]) 

In [None]:
# Keep one row per distinct author string. drop_duplicates keeps the first occurrence, so the
# paper_id / journal_id carried along are those of the first paper listed for that author string.
# The index is then renumbered 0..n-1.
as_df = (
    df
    .drop_duplicates(subset='author_string_id')
    .reset_index(drop=True)
)


### save the parsed names to `author_string_names.csv`

In [None]:
def _write_name_batch(records, columns, out_path, first_batch):
    """Write one batch of parsed-name records to ``out_path`` and return it as a DataFrame.

    The first batch (over)writes the file including the header row; every later batch is
    appended without a header. ``columns`` pins the column order so appended batches always
    line up with the header.
    """
    batch_df = pd.DataFrame(records, columns=columns)
    if first_batch:
        batch_df.to_csv(out_path, index=False)
    else:
        batch_df.to_csv(out_path, index=False, mode='a', header=False)
    return batch_df


def split_names_to_csv(df, out_path='author_string_names.csv', batch_size=1000):
    """Split every normalised author string in ``df`` and stream the results to a CSV file.

    Each row is passed through ``util.split_name`` (using the notebook-level ``prefix_list``
    and ``known_error_list``); the parsed parts are buffered and written to ``out_path`` in
    batches so the full result never has to be held in memory.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``author_string_id``, ``journal_id``, ``paper_id``,
        ``norm`` and ``raw_name``. All values are read from this frame only.
    out_path : str, optional
        CSV file to (over)write. Defaults to ``'author_string_names.csv'``.
    batch_size : int, optional
        Number of parsed rows buffered between writes / progress messages. Defaults to 1000.

    Returns
    -------
    pandas.DataFrame
        The last batch that was written to the CSV (empty if ``df`` has no rows).
        The complete result lives in ``out_path``.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')

    output_columns = [
        'author_string_id', 'journal_id', 'paper_id',
        'norm_name', 'raw_name',
        'forenames', 'initials', 'last_names', 'journal_format',
        'first_initial', 'last_name',
    ]
    source_columns = ['author_string_id', 'journal_id', 'paper_id', 'norm', 'raw_name']

    total = len(df)
    journal_format = None  # no per-journal format hint is passed to split_name in this pass

    records = []
    header_written = False
    last_batch = pd.DataFrame(columns=output_columns)

    # Count rows by position (1-based) rather than by index label, so batching does not depend
    # on the frame having a clean 0..n-1 index.
    rows = df[source_columns].itertuples(index=False, name=None)
    for row_number, (author_string_id, journal_id, paper_id, norm, raw) in enumerate(rows, start=1):

        forenames, initials, last_names, name_format = util.split_name(
            norm, raw, journal_format, prefix_list, known_error_list)

        # First initial: fall back to 'unknown' when initials is empty or not indexable.
        try:
            first_initial = initials[0]
        except (IndexError, TypeError):
            first_initial = 'unknown'

        # Final word of the last names: fall back to 'unknown' when there is nothing to split.
        try:
            last_name = last_names.split()[-1]
        except (AttributeError, IndexError):
            last_name = 'unknown'

        records.append({
            'author_string_id': author_string_id,
            'journal_id': journal_id,
            'paper_id': paper_id,
            'norm_name': norm,
            'raw_name': raw,
            'forenames': forenames,
            'initials': initials,
            'last_names': last_names,
            # Fourth value returned by split_name, stored under the established column name.
            'journal_format': name_format,
            'first_initial': first_initial,
            'last_name': last_name,
        })

        # Flush a full batch and report progress.
        if len(records) >= batch_size:
            clear_output(wait=True)
            print('row {:,} of {:,}'.format(row_number, total))
            last_batch = _write_name_batch(records, output_columns, out_path, not header_written)
            header_written = True
            records = []

    # Flush the final partial batch; for an empty input this still writes a header-only file.
    if records or not header_written:
        last_batch = _write_name_batch(records, output_columns, out_path, not header_written)

    print('done')
    return last_batch

res_df = split_names_to_csv(as_df)
res_df.head()

# Old code (ignore)

In [None]:


# from IPython.display import clear_output     


# #get the name prefixes
# prefixes = list(pd.read_csv('name_prefixes.csv', header=None, encoding = 'ISO-8859-1')[0]) 
# error_match = list(pd.read_csv('error_match.csv', header=None, encoding = 'ISO-8859-1')[0]) 


# count = 0
# unknown_count = 0
# error_count = 0
# total = len(df)

# res_df = pd.DataFrame()

# header=True

# for index, row in df.iterrows(): 
    
   
#     # look up name format
#     journal_format = list(journal_formats_df[journal_formats_df.journal_id==row['journal_id']].journal_format)[0]
    
#     # initate the parse object with name, regex dictionary and journal format    
#     parsed = ParseName({
#         'raw_name': row['name'], 
#         'dmatch': dmatch,
#         'journal_format': journal_format,
        
#     }, prefixes, error_match)
    
#     # get the parsed name object and add the journal id and paper id
#     res = parsed.name
#     res['journal_id'] = row['journal_id']
#     res['paper_id'] = row['paper_id']
#     res['author_string_id'] = row['id']
    
    
#     # save to df
#     res_df = res_df.append(res, ignore_index=True)
    
    
#     # count the results
#     count += 1
#     if parsed.name['format'] == 'unknown':
#         unknown_count +=1
#     elif parsed.name['format'] == 'error':
#         error_count +=1 
#     else:
#         pass

#     # save the results to a csv file    
#     if (count % 1000) == 0:
#         clear_output(wait=True)
#         print('row {:,} of {:,}'.format(count, total))
#         print('unknown {:.3f}%, error {:.3f}%'.format(unknown_count/count*100., error_count/count*100.))
        
#         if header:
#             res_df.to_csv('parsed_names.csv', index=False)
#         else:
#             res_df.to_csv('parsed_names.csv', index=False, mode='a', header=False)
        
#         res_df = pd.DataFrame()
#         header = False

# print(res_df.head())
# print('Done')

In [None]:
# """
# SELECT author.name, x.paper_id, y.journal_id, journal.name as journal_name
# FROM author 
# INNER JOIN (
# 	SELECT *
# 	FROM (
# 		SELECT ROW_NUMBER() OVER (PARTITION BY author_paper.author_id ORDER BY author_paper.author_id) AS r, author_paper.*
# 		FROM author_paper
# 	) AS t1
# 	WHERE t1.r < 3
# ) AS x ON (author.id = x.author_id)
# INNER JOIN (	
# 	SELECT *
# 	FROM (
# 		SELECT ROW_NUMBER() OVER (PARTITION BY paper.journal_id ORDER BY paper.journal_id) AS r, paper.journal_id, paper.id
# 		FROM paper 
# 	) AS t2
# 	WHERE t2.r < 10	
# ) AS y ON (x.paper_id = y.id)
# INNER JOIN journal ON y.journal_id = journal.id
# ORDER BY y.journal_id  
# """

In [None]:
# class TestRe():
    
    
#     def test(self, test_name, re_string, pass_list, fail_list):
        
#         print(test_name)
#         print(re_string)
        
#         pass_flag = True
        
#         # pass 
#         for s in pass_list:
#             res = re.search(re_string, s)
#             #print(s, res)
#             if res == None:
#                 print('{} failed on pass {}'.format(test_name, s))
#                 pass_flag = False
#             else: 
#                 print('{} passed match "{}" on "{}"'.format(test_name, s, res.group(0)))
        
#         # fail
#         for s in fail_list:
#             res = re.search(re_string, s)
#             #print(s, res)
#             if res != None:
#                 print('{} failed on fail {}'.format(test_name, s))
#                 pass_flag = False
        
#         if pass_flag: print (test_name + ' PASS')
#         print()
                
     
#     def test_names_comma(self, re_string):
        
#         # should pass if lower case names followed by a comma space        
#         test_name = 'names_comma'
#         pass_list = ['abc, def', 'abc def, ', 'abc def ghi, ', "a'bc, ", 'abc-def abc, ']
#         fail_list = ['a, ', 'abc abc', 'ABC, ', 'Abc, ', 'a']
#         self.test(test_name, re_string, pass_list, fail_list)
        
#     def test_initials_no_space(self, re_string):
        
#         # should pass if block of 2-4 capital letters
#         test_name = 'initials_no_space'
#         pass_list = ['ABCDE, ABCDE AB', 'ABCDE, ABC', "A'BCDE ABC", "ABC A'BCDE", 'AB Abc']
#         fail_list = ['abcde ab', 'Abcde ab', 'ABCDE ABCDE']
#         self.test(test_name, re_string, pass_list, fail_list)
        
        
#     def test_no_lower_case(self, re_string):
        
#         # should pass if only capitals
#         test_name = 'no_lower_case'
#         pass_list = ['ABCDE, ABCDE AB', 'ABCDE, ABC', "A'BCDE ABC", "ABC A'BCDE"]
#         fail_list = ['abcde ab', 'Abcde ab']
#         self.test(test_name, re_string, pass_list, fail_list)
  

#     def test_only_names(self, re_string):
        
#         # should pass if only lower case words which are 2 or more characters long
#         test_name = 'only_names'
#         pass_list = ['abc', 'abc def', "a'bc", 'abc-def']
#         fail_list = ['ABC DEF', 'abc, def', 'abc d ef']
#         self.test(test_name, re_string, pass_list, fail_list)

        
#     def test_only_initials(self, re_string):
#         # should only pass if all single lower case letters with spaces with no spaces before or after
#         test_name = 'only_initials'
#         pass_list = ['a', 'a b c']
#         fail_list = ['abc', 'abc a', 'a abc', "a'a" , ' a a' , 'a a ']
#         self.test(test_name, re_string, pass_list, fail_list)
        
        
        
#     def test_names_initials(self, re_string):
#         # should only pass lower case names followed by lower case spaced initials
#         test_name = 'names_initials'
#         pass_list = ['abc a', 'abc abc a a', "a'abc a"]
#         fail_list = ['a abc', 'a a', 'abc abc', 'abc a abc', 'a abc a', 'abc', 'a']
#         self.test(test_name, re_string, pass_list, fail_list)        
  

#     def test_names(self, re_string):
        
#         # should pass if one or more lower case names
#         test_name = 'names'
#         pass_list = ['abc', 'abc def', "a'bc", 'abc-def', 'a abc', 'abc a', 'abc, def', 'abc d ef']
#         fail_list = ['ABC DEF', 'a', 'a a']
#         self.test(test_name, re_string, pass_list, fail_list)    
        
#     def test_initials_names(self, re_string):
        
#         # should pass if one or more initials followed by one or more names - lower case
#         test_name = 'initials_names'
#         pass_list = ['a abc', 'a b c abc', 'a abc abc']
#         fail_list = ['ab abc', 'abc a abc', 'abc a']
#         self.test(test_name, re_string, pass_list, fail_list) 
  

#     def test_names_somethings_name(self, re_string):
        
#         # should pass if one or more names followed by zero or more names or initials followed by one or more names - lower case
#         test_name = 'names_somethings_name'
#         pass_list = ['abc abc', 'abc abc abc', 'abc abc a abc abc', 'abc ab-cd abc']
#         fail_list = ['a abc', 'a abc abc', 'abc a']
#         self.test(test_name, re_string, pass_list, fail_list) 
        
        
#     def test_no_spaces(self, re_string):
        
#         # should pass if there is a lower case string of 1 or more characters with no spaces
#         test_name = 'no_spaces'
#         pass_list = ['a', 'ab']
#         fail_list = ['a b', '', ' ', ' a', 'a ']
#         self.test(test_name, re_string, pass_list, fail_list) 
        
#     def test_blank(self, re_string):
#         # should pass if there is nothing
#         test_name = 'blank'
#         pass_list = ['', ' ']
#         fail_list = ['a', 'a b']
#         self.test(test_name, re_string, pass_list, fail_list) 
        
#     def test_names_initials_names(self, re_string):
#         # should pass there are one or more names followed by one or more initials followed by one or more names - lower case
#         test_name = 'names_intitials_names'
#         pass_list = ['abc a abc', 'abc abc a abc abc', "abc a a'bc"]
#         fail_list = ['a abc', 'abc abc']
#         self.test(test_name, re_string, pass_list, fail_list)   
        
#     def test_illegal(self, re_string):
#         # should pass if contains illegal characters ie not a-z ' -
#         test_name = 'illegal'
#         pass_list = ['#', 'yfh&amp', 'afaf (dff)']
#         fail_list = ['abc', 'abc, a']
#         self.test(test_name, re_string, pass_list, fail_list)   
        
#     def test_names_comma_names(self, re_string):
#         # should pass if names followed by comma followed by names
#         test_name = 'names_comma_names'
#         pass_list = ['abc, abc', 'abc abc, abc abc']
#         fail_list = ['abc', 'abc, a', 'abc, abc, abc']
#         self.test(test_name, re_string, pass_list, fail_list)     
        
#     def test_names_comma_initials(self, re_string):
#         # should pass if names followed by comma followed by names
#         test_name = 'names_comma_initials'
#         pass_list = ['abc, a', 'abc abc, a a']
#         fail_list = ['abc', 'abc, abc a', 'abc, a abc']
#         self.test(test_name, re_string, pass_list, fail_list)  
        
#     def test_names_comma_names_initials(self, re_string):
#         # should pass if names followed by comma followed by names
#         test_name = 'names_comma_names_initials'
#         pass_list = ['abc, abc a', 'abc abc, abc abc a a']
#         fail_list = ['abc', 'abc, a abc a', 'abc, a abc']
#         self.test(test_name, re_string, pass_list, fail_list)          
        
#     def test_names_comma_initials_names(self, re_string):
#         # should pass if names followed by comma followed by names
#         test_name = 'names_comma_initials_names'
#         pass_list = ['abc, a abc', 'abc abc, a a abc abc']
#         fail_list = ['abc abc', 'abc, abc a', 'abc a abc']
#         self.test(test_name, re_string, pass_list, fail_list) 
        
        
# testRe = TestRe()


# testRe.test_names_comma_initials_names(dmatch['names_comma_initials_names'])
# testRe.test_names_comma_names_initials(dmatch['names_comma_names_initials'])
# testRe.test_names_comma_initials(dmatch['names_comma_initials'])
# testRe.test_names_comma_names(dmatch['names_comma_names'])
# testRe.test_illegal(dmatch['illegal'])
# testRe.test_names_initials_names(dmatch['names_initials_names'])
# testRe.test_blank(dmatch['blank'])
# testRe.test_no_spaces(dmatch['no_spaces'])
# testRe.test_names_somethings_name(dmatch['names_somethings_name'])
# # testRe.test_initials_names(dmatch['initials_names'])
# testRe.test_names(dmatch['names'])
# testRe.test_names_initials(dmatch['names_initials'])
# testRe.test_only_initials(dmatch['only_initials'])
# testRe.test_only_names(dmatch['only_names'])
# testRe.test_names_comma(dmatch['names_comma'])
# testRe.test_initials_no_space(dmatch['initials_no_space'])  
# testRe.test_no_lower_case(dmatch['no_lower_case'])


In [None]:
# import re

# # regex components

# something = "[a-z\s'-]"
# no_space = "[a-z'-]"

# illegal = "[^a-z,\s'-]"

# letter = "[a-z]"      

# word = "[a-z'-]{2,}"    

# # lower case initial

# somethings = '{}*'.format(something)

# initials1 = "({}\s)+".format(letter)
# initials = "({}\s)*".format(letter)
# end_initial = letter
#                              # lower case words including ' and -

# last_name_comma = "{},\s".format(word)

# names1 = "({}\s)+".format(word)
# names = "({}\s)*".format(word)
# end_name = word

# dmatch = {

#     'names_comma'                        : '^{}{}'.format(names, last_name_comma), # only for journal check
#     'names_comma_names'                  : '^{}{}{}{}$'.format(names, last_name_comma, names, end_name), 
#     'names_comma_initials'               : '^{}{}{}{}$'.format(names, last_name_comma, initials, end_initial),
#     'names_comma_names_initials'         : '^{}{}{}{}{}$'.format(names, last_name_comma, names1, initials, end_initial),
#     'names_comma_initials_names'         : '^{}{}{}{}(){}$'.format(names, last_name_comma, initials1, names, end_name),
    
#     'names'                              : '.*({}(?=(\s|$|,)))+'.format(word),
#     'only_names'                         : '^{}{}$'.format(names, end_name),
#     'only_initials'                      : '^{}{}$'.format(initials, end_initial),
#     'names_initials'                     : '^{}{}\s{}{}$'.format(names, end_name, initials, end_initial),
    
#     'initials_names'                     : '^{}{}{}$'.format(initials1, names, end_name),
    
#     'names_somethings_name'              : '^{}{}{}$'.format(names1, somethings, end_name), # only for journal check
#     'names_initials_names'               : '^{}{}{}{}$'.format(names1, initials1, names, end_name),
    
    
#     'initials_no_space'                  : '((?<=\s)[A-Z]{2,4}(?=($|\s))|(^[A-Z]{2,4}(?=\s)))', # only for journal check
#     'no_lower_case'                      : '^[^a-z]{3,}$',      # only for journal check,
    
#     'no_spaces'                          : '^{}+$'.format(no_space),
#     'blank'                              : '^\s*$',  
#     'illegal'                            : '^.*{}+.*$'.format(illegal),

# }

In [None]:
# def strip_accents(s):
#     return ''.join(c for c in unicodedata.normalize('NFD', s)
#                   if unicodedata.category(c) != 'Mn')

# def normalise_names(df):
    
#     df['raw_name'] = df.name # keep original name string
    
#     df.name = df.name.apply(lambda x: strip_accents(x))  # remove diacritics
#     df.name = df.name.str.lower()                        # make lower case
    
#     df.name = df.name.str.replace('.', ' ')              # 1) remove dots from initials ? should this only be done where character . space
#     df.name = df.name.str.replace('  ', ' ')             # 2) remove double spaces    
#     df.name = df.name.str.strip()                        # 3) strip white spacesname
    
#     return df

In [None]:


# def __get_name_format(norm, raw):    
        
#         # check for errors
        
#         if re.match(dmatch['blank'], raw): 
#             return 'error.blank'
#         elif re.match(dmatch['no_spaces'], norm): 
#             return 'error.no_spaces'
#         elif re.match(dmatch['illegal'], norm): 
#             return 'error.illegal_character'
#                 ## only initials
#         elif re.match(dmatch['only_initials'], norm): 
#             return  'error.only_initials'
#         elif len(norm) > 100: ## todo check good length
#             return 'error.too_long' 
#         elif norm in known_error_list:
#             return 'error.list'
        
#         ## todo?? double comma???
        
#         # difficult formats
        
#         elif re.match(dmatch['names_comma_initials_names'], norm):
#             return 'unknown.names_comma_initials_names'        
        
#         # check known formats
        
#         else:
#             name_formats = [
#                 'names_initials_names',
#                 'initials_names',
#                 'only_names',
# #                 '_initials_names',
#                 'names_initials',
                
#                 'names_comma_names',
#                 'names_comma_names_initials',
                
                
#             ]
            
#             for name_format in name_formats:                
#                 if re.match(dmatch[name_format], norm): return name_format
        
        
        
#         # if none of the above match
#         return 'error.unknown_format'

    
# def __find_prefix(a):
#     # returns index of first recognised prefix else return -1 
#     i = 0
#     for n in a:
#         if n in prefix_list:
#             return i
#         i+=1
#     return -1
    
# def __get_initials(forenames):
#     return ' '.join([n[0] for n in forenames.split()])
        
# def __parse_name(norm, raw, name_format):
    
#     forenames = 'unknown'
#     initials = 'unknown'
#     last_names = 'unknown'

#     if name_format == 'names_initials_names':
#         # get the initials and then parts before and after
#         initials = ' '.join(re.findall(dmatch['_initial_'], norm))
#         forenames = re.match("^.*(?=\s{})".format(initials), norm).group(0)
#         last_names = norm[len(initials + forenames) + 2:]
#         initials = __get_initials(forenames) + ' ' + initials
    
#     elif name_format == 'initials_names':
#         # get the initials and then parts before and after
#         initials = re.match(dmatch['initials_'], norm).group(0).strip()
#         last_names = norm[len(initials) + 1:]
        
#     elif name_format == 'only_names':
        
#         a = norm.split()
#         i = __find_prefix(a)
#         forenames = ' '.join(a[:i])
#         initials = __get_initials(forenames)
#         last_names = ' '.join(a[i:])
        
#     elif name_format == 'names_initials':
        
#         last_names = re.match(dmatch['names_'], norm).group(0)
#         initials = norm[len(last_names) + 1:]
        
    
#     elif name_format == 'names_comma_names':
#         # get the parts before and after the comma
#         a = norm.split(',')
#         forenames = a[1].strip()
#         initials = __get_initials(forenames)
#         last_names = a[0].strip()
        
#     elif name_format == 'names_comma_names_initials':        
#         #reveres the order and call the function with the new name_format
#         a = norm.split(',')
#         norm = a[1] + ' ' + a[0]
#         forenames, initials, last_names = __parse_name(norm, None, 'names_initials_names')
        
#     elif name_format == 'names_comma_initials_names':
#         print(norm)
    
#     return forenames, initials, last_names
    

# def split_name(norm, raw, journal_format=None):
    
#     #print(norm, raw, journal_format)
    
#     forenames = 'unknown'
#     initials = 'unknown'
#     last_names = 'unknown'
#     name_format = 'unknown'
    
#     name_format = __get_name_format(norm, raw)
    
#     # if an error return 
#     if re.match('error', name_format):
#         #print(name_format)
#         return forenames, initials, last_names, name_format
    
#     #print(name_format)
#     forenames, initials, last_names  = __parse_name(norm, raw, name_format)  
    
#     return forenames, initials, last_names, name_format

# raw_name = 'def, a abc'
# norm = util.normalise_name(raw_name)
# intuple = (norm, raw_name, None)
# #print(intuple)
# outtuple = split_name(intuple[0], intuple[1], intuple[2])
# print([intuple, outtuple])

In [None]:
# class ParseName():
    
#     def __init__(self, config, prefixes, error_match):

        
#         self.error_match_list = error_match

#         self.dmatch = config['dmatch']
        
#         self.name = {
#             'raw':  config['raw_name'], 
#             'norm': self.normalise_name(config['raw_name']),
#             'format': None,
#             'forenames': None,
#             'forenames_array': [], 
#             'initials': None,
#             'initials_array': [],
#             'last_names': None,
#             'last_names_array': [],
#             'error': None
#         }
        
#         self.journal_format = config['journal_format']
        
#         # split out initials that are presented in a block # TODO male more general        
#         if self.journal_format == '--names-initials_no_space':
#             self.split_initials_block(self.name['raw'], self.name['norm']) # TODO - format should be name_format suffix
        
        
#         # decode &amp characters TODO
        
        
#         self.get_name_format()      
        
#         self.parse()
#         self.get_component_strings()
                

#     def parse(self):
            
#         if self.name['format'] == 'last_names_comma': 
#             self.parse_last_names_comma_format()
#             return
            
#         if self.name['format'] == 'initials_names':
#             self.parse_initials_name_format()
#             return
            
#         if self.name['format'] == 'names_initials_names':
#             self.parse_names_initials_name_format()  
#             return
            
#         if self.name['format'] == 'names':
#             self.parse_names_format()
#             return
        
#     def get_component_strings(self):
        
#         self.name['forenames'] = ' '.join(self.name['forenames_array'])
#         self.name['forenames'] = self.name['forenames'].replace('  ', ' ').strip()
        
#         self.name['initials'] = ' '.join(self.name['initials_array'])
            
            
#     def get_name_format(self):
        
#         # identify errors
        
# #         ## blanks
# #         if re.match(self.dmatch['blank'], self.name['raw']) != None: 
# #             self.handle_parse_error('blank name field')
# #             self.name['format'] = 'error'
# #             return
        
# #         ## no spaces
# #         if re.match(self.dmatch['no_spaces'], self.name['norm']) != None: 
# #             self.handle_parse_error('no spaces')
# #             self.name['format'] = 'error'
# #             return
        
# #         ## error match list
# #         if self.name['norm'] in self.error_match_list:
# #             self.handle_parse_error('known error match')
# #             self.name['format'] = 'error'
# #             return
        
# #         ## illegal characters
# #         if re.match(self.dmatch['illegal'], self.name['norm']) != None: 
# #             self.handle_parse_error('illegal character')
# #             self.name['format'] = 'error'
# #             return  
        
# #         ## only initials
# #         if re.match(self.dmatch['only_initials'], self.name['norm']) != None: 
# #             self.handle_parse_error('only initials')
# #             self.name['format'] = 'error'
# #             return          
            
       
#         ## too long
#         #TODO        
        
#         # identify general names
#         if re.match(self.dmatch['names'], self.name['norm']) != None: 
                
#             # identify last names comma - leaf
#             if self.journal_format == '--names-last_names_comma': # extra check to filter out sentences
#                 if re.match(self.dmatch['names'], self.name['norm']) != None:
#                     self.name['format'] = 'last_names_comma'
                    
#             # identify names initials - leaf TODO
            
#             # identify initials names - leaf
#             elif re.match(self.dmatch['initials_names'], self.name['norm']) != None:
#                 self.name['format'] = 'initials_names'

            
#             # identify names initials names leaf
#             elif re.match(self.dmatch['names_initials_names'], self.name['norm']) != None:
#                 self.name['format'] = 'names_initials_names'

#             # identify names - leaf
#             elif re.match(self.dmatch['names'], self.name['norm']) != None:
#                 self.name['format'] = 'names'
            
#             else:
#                 self.handle_parse_error('unknown format')
#                 self.name['format'] = 'unknown'                
        
#         else:
#             self.handle_parse_error('unknown format')
#             self.name['format'] = 'unknown'

        
        
    
#     def strip_accents(self, s):
#         return ''.join(c for c in unicodedata.normalize('NFD', s)
#                       if unicodedata.category(c) != 'Mn')

#     def normalise_name(self, s):
#         s = self.strip_accents(s)     # remove diacritics
#         s= s.lower()                  # make lower case
#                                       # follow order:
#         s = s.replace('.', ' ')       # 1) remove dots from initials ? should this only be done where character . space
#         s = s.replace('  ', ' ')      # 2) remove double spaces
#         s = s.strip()                 # 3) strip white spaces
#         return s
    
    
#     def split_initials_block(self, N, n):

#         # finds a block of initials in the raw name (N) and adds spaces in the normalised name (n)
        
#         res = re.search(dmatch['initials_no_space'], N)
#         if res != None:
#             s0 = res.span(0)[0]
#             s1 = res.span(0)[1]
#             n = n[:s0] + (' '.join(list(n[s0:s1]))) + n[s1:]
        
#         return n                      
           
        
   
#     def handle_parse_error(self, error_type):
#         error = error_type #'{}: {}'.format(self.name_format, error_type)
#         self.name['error'] = error
#         #print('ERROR: {}'.format(error))

            
#     def add_last_name(self, names):
#         self.name['last_names'] = ' '.join(names)
#         self.name['last_names_array'] = names
        
    
#     def add_forenames(self, names):
#         for e in names:
#             self.name['forenames_array'].append(e)    # add name
#             self.name['initials_array'].append(e[0])  # and matching initial
            
        
#     def add_initials(self, initials):
#         for e in initials:
#             self.name['initials_array'].append(e[0])  # add initial
#             self.name['forenames_array'].append('')    # and space for name
            

#     def parse_last_names_comma_format(self):
        
#         dmatch = self.dmatch
        
#         n = self.name['norm']
        
#         # get multiple names before comma - ?assume they are all last name?
#         match = re.match(dmatch['names_comma'], n)
#         if match != None: 
#             new_name = n[:match.span(0)[1]-2].split() #???
#             self.add_last_name(new_name)
            
#             # update remaining name string
#             n = n[match.span(0)[1]:]
        
#         # if the name format does not match the journal type
#         else:
#             # TODO do we want this here
#             self.handle_parse_error('unknown format')
#             return 
        
#         # options:
#         # * names
#         # * initials
#         # * names initials
#         # * error

#         # if there are only names after the comma 
#         match = re.match(dmatch['only_names'], n)
#         if match != None:
#             new_names = n.split(' ')
#             self.add_forenames(new_names)
#             return 


#         # if there are only initials after the comma
#         match = re.match(dmatch['only_initials'], n)
#         if match != None:
#             new_initials = n.split(' ')
#             self.add_initials(new_initials)
            
#             return 

#         # if there are names then initials after the comma
#         match = re.match(dmatch['names_initials'], n)
#         if match != None:

#             #get the names
#             match = re.match(dmatch['names'], n)

#             new_names = n[:match.span(0)[1]].split(' ')
#             self.add_forenames(new_names)
            
#             # update remaining name string
#             n = n[match.span(0)[1]:].strip()

#             #then get the initials
#             new_initials = n.split(' ')
#             self.add_initials(new_initials)

#             return 

#         # if none of the above then error
#         # TODO Check these errors
#         self.handle_parse_error('unknown format')
#         return 
    
    
#     def parse_initials_name_format(self):
        
#         # Handles a normalised name made of initials followed by names.
#         # Working assumption (unconfirmed): the initials stand for first and middle names
#         # and the names that follow are the last name.
#         # NOTE(review): this method is disabled (commented out); presumably superseded by
#         # split_name in authorUtils.py - confirm before deleting.
       
#         array_n = self.name['norm'].split()

#         # add the initials: each leading single-character token, stopping at the first longer token
#         for i, e in enumerate(array_n):
#             if len(e) == 1:
#                 self.add_initials([e]) 
#             else:
#                 break
       
#         # add last names: everything from the token that stopped the loop onwards
#         # NOTE(review): if array_n is empty, i is never bound and the next line raises NameError.
#         # NOTE(review): if every token is a single character the loop never breaks, so i is the
#         # last index and the final initial is also passed to add_last_name.
#         self.add_last_name(array_n[i:])
        
        
#     def parse_names_initials_name_format(self):
        
#         # Handles a normalised name made of names, then initials, then names.
#         # Working assumption (unconfirmed): the last set of names are all last names.
#         # NOTE(review): this method is disabled (commented out); presumably superseded by
#         # split_name in authorUtils.py - confirm before deleting.
        
#         array_n = self.name['norm'].split()

#         # add first set of names: leading multi-character tokens, stopping at the first initial
#         for i, e in enumerate(array_n):
#             if len(e) > 1:
#                 self.add_forenames([e]) 
#             else:
#                 break
                
#         # add the initials
#         # NOTE(review): enumerate restarts at 0 here, so after this loop i is an offset into
#         # the slice array_n[i:], no longer an index into array_n itself.
#         for i, e in enumerate(array_n[i:]):
#             if len(e) == 1:
#                 self.add_initials([e]) 
#             else:
#                 break
       
#         # add last names
#         # NOTE(review): because i is slice-relative, array_n[i+1:] is only the right start when
#         # exactly one forename precedes the initials. Example: 'abc abc d e fgh' gives i == 2,
#         # so array_n[3:] == ['e', 'fgh'] and the initial 'e' is treated as part of the last name.
#         # Keeping the first loop's index and adding it to the offset would fix this.
#         self.add_last_name(array_n[i+1:])
        
            
#     def parse_names_format(self):
        
#         # Handles a normalised name made only of names.
#         # Working assumption (unconfirmed): every name that is not in the prefix list and is
#         # not the very last name is a forename.
#         # NOTE(review): this method is disabled (commented out); presumably superseded by
#         # split_name in authorUtils.py - confirm before deleting.
        
#         # get array of names
#         array_n = self.name['norm'].split()
        
#         # get names up to the last element or the first prefix name
#         # NOTE(review): `prefixes` is not defined in the visible code - presumably the list
#         # loaded from name_prefixes.csv; confirm.
#         # NOTE(review): the token is added as a forename BEFORE the prefix check, and index is
#         # not advanced on break, so a prefix ends up in both the forenames and the last name.
#         # The test table in this notebook expects 'abc der van def' to give forename 'abc' only.
#         index = 0
#         while index < len(array_n) - 1:
#             self.add_forenames([array_n[index]])
#             if array_n[index] in prefixes:
#                 break
                
#             index += 1  
            
#         # everything from index onwards (the prefix, if one was hit, or the final name) is the last name
#         self.add_last_name(array_n[index:])
       


In [None]:
# Disabled (commented-out) table-driven check of split_name.
# Each entry of test_array is [input_tuple, expected_tuple]; the three elements of input_tuple
# are passed positionally to split_name and the result is compared with expected_tuple.
# The expected tuple appears to be (forenames, initials, last name, format-or-error label),
# judging by the live example at the top of this notebook - confirm against authorUtils.split_name.
# NOTE(review): the `def test_split_name():` line below is commented out and the body is not
# indented under it, so un-commenting the cell as-is would run it as a top-level script.
# def test_split_name():

# test_array = [
    
#     # to check
#     [('def, a abc', 'def, a abc', None), ('unknown', 'unknown', 'unknown', 'unknown.names_comma_initials_names')],
    
    
#     [('def def a b c', 'def def a b c', None), ('unknown', 'a b c', 'def def', 'names_initials')],
#     [('abc def ghi jkl', 'abc def ghi jkl', None), ('abc def ghi', 'a d g', 'jkl', 'only_names')],
#     [('abc der van def', 'abc der van def', None), ('abc', 'a', 'der van def', 'only_names')],
#     # NOTE(review): the raw names in the next two entries start with a space, and the second
#     # expects the forename ' abc' with a leading space - possibly typos; confirm.
#     [('a b c def', ' a b c def', None), ('unknown', 'a b c', 'def', 'initials_names')],
#     [('fgh, abc d e', ' fgh, abc d e', None), (' abc', 'a d e', 'fgh', 'names_comma_names_initials')],
#     [('abc abc d e fgh', 'abc abc d e fgh', None), ('abc abc', 'a a d e', 'fgh', 'names_initials_names')],
#     [('def def, abc abc', 'Def Def, abc abc', None), ('abc abc', 'a a', 'def def', 'names_comma_names')],
#     [('abc, acb, abc', 'abc, acb, abc', None), ('unknown', 'unknown', 'unknown', 'error.unknown_format')],
#     [('', '', None), ('unknown', 'unknown', 'unknown', 'error.blank')],
#     [('', ' ', None), ('unknown', 'unknown', 'unknown', 'error.blank')],
#     [('abc', 'abc', None), ('unknown', 'unknown', 'unknown', 'error.no_spaces')],
#     [('abc:', 'abc:', None), ('unknown', 'unknown', 'unknown', 'error.illegal_character')],
#     [('et al', 'et al', None), ('unknown', 'unknown', 'unknown', 'error.list')],
#     [('a b c', 'a b c', None), ('unknown', 'unknown', 'unknown', 'error.only_initials')],
#     [('abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc', 'ABC ABC ABC ABC ABC ABC ABC ABC ABC ABC ABC ABC ABC ABC ABC ABC ABC ABC ABC ABC ABC ABC ABC ABC ABC ABC ABC ABC ABC ABC ABC ABC ABC ABC ABC ', None), ('unknown', 'unknown', 'unknown', 'error.too_long')]
# #     [('mark jones', 'Mark Jones', None), ('mark', 'm', 'jones', 'names')]
# ]

# print('TESTING split_names')

# # stays True only if every case matches its expected tuple
# pass_flag = True

# for t in test_array:
#     intuple = t[0]
#     outtuple = t[1]
#     #print([intuple, outtuple])
#     # NOTE(review): stale call - the live cell at the top of this notebook uses
#     # util.split_name(norm, raw, third_arg, prefix_list, known_error_list); this bare
#     # three-argument split_name would need updating before the cell is re-enabled.
#     restuple = split_name(intuple[0], intuple[1], intuple[2])
#     if restuple != outtuple:
#         # report the failing case: input, actual result, expected result
#         pass_flag = False
#         print('FAIL')
#         print(intuple)
#         print('gave')
#         print(restuple)
#         print('should be')
#         print(outtuple)
#         print("*************")
#     else:
# #         print("PASS")
# #         print("*************")
#         pass

# # overall summary line
# if pass_flag:
#     print('PASS split_names')
# else:
#     print('TESTING COMPLETE ERRORS')