In [1]:
import itertools
import re
from typing import *

In [2]:
# Spelled-out legal forms mapped to the abbreviations offered for them.
# Keys are looked up by _generate_all_legal_suffixes; both keys and values
# are tried as trailing suffixes by _split_legal_terms.
LEGAL_TERMS = {
    "company": ("Co", ),
    "limited partnership": ("lp", ),
    "limited liability partnership": ("llp", ),
    "limited liability limited partnership": ("lllp",),
    "limited liability company": ("llc", "lc", "ltd co", "ltd"),
    # "pllc" is the established abbreviation (the previous "pplc" was a typo).
    "professional limited liability company": ("pllc", ),
    'public limited company': ('plc', ),
    'limited': ('ltd', ),
    'corporation': ('corp', ),
    'private': ('pvt', ),
    'incorporated': ('inc', ),
    'incorporation': ('inc', ),
    'gesellschaft mit beschränkter haftung': ('gmbh', ),
    'aktiengesellschaft': ('AG', ),
}


# Non-legal trailing words that may be kept, abbreviated or dropped.
OTHER_SUFFIXES = {
    "association",
    "organization"
}

# Every abbreviation listed in LEGAL_TERMS, flattened into one set.
ALREADY_ABBREVIATIONS = set(itertools.chain.from_iterable(LEGAL_TERMS.values()))

# Words whose initial may be left out of a first-letter abbreviation.
WEAK_WORDS = {"of", "the", "for"}

# Text in round brackets. NOTE: ".+" is greedy, so when a name contains
# several bracketed groups the match runs from the first "(" to the last ")".
EXTRA_INFORMATION_REGEX = re.compile(r"\((.+)\)")

In [3]:
# Basic abbreviation: the first letter of every word, with special handling for "and" and for WEAK_WORDS
def abbreviations_as_first_letters(words: List[str]):
    """Yield every first-letter abbreviation that can be built from ``words``.

    Each word contributes its upper-cased initial. A word found in
    ``WEAK_WORDS`` may alternatively contribute nothing, and the word "and"
    may alternatively contribute nothing or "&". An empty ``words`` list
    yields a single empty string. Duplicates are not removed.
    """
    if not words:
        yield ""
        return

    head = words[0]

    # All the ways the first word can show up, in the order they are emitted.
    prefixes = []
    if head in WEAK_WORDS:
        prefixes.append("")
    if head == "and":
        prefixes.extend(("", "&"))
    prefixes.append(head[0].upper())

    # Combine each rendering of the first word with each abbreviation of the rest.
    for tail in abbreviations_as_first_letters(words[1:]):
        for prefix in prefixes:
            yield prefix + tail

In [4]:
# Popular suffixes that are sometimes abbreviated and sometimes kept in full

def _split_other_suffixes(name: str) -> Tuple[str, Sequence[str]]:
    for suffix in OTHER_SUFFIXES:
        if name.endswith(suffix):
            new_name = name[:-len(suffix)].strip()
            even_shorter_name, other_suffixes = _split_other_suffixes(new_name)
            return even_shorter_name, other_suffixes + [suffix]
    return name, []

def take_care_of_other_suffixes(name):
    """Abbreviate ``name`` while varying how its trailing non-legal suffixes are treated.

    The suffixes found by ``_split_other_suffixes`` are, from left to right,
    either folded into the abbreviation as initials, kept spelled out, or
    dropped from the end.

    Returns:
        ``(all_results, whole_results)``: every generated string, and the
        subset in which no suffix was dropped.
    """
    core_name, suffixes = _split_other_suffixes(name)
    suffix_count = len(suffixes)

    all_results = []
    whole_results = []
    for core_abbr in list(abbreviations_as_first_letters(core_name.split())):
        # `kept` = how many leading suffixes survive (the rest are dropped).
        for kept in range(suffix_count + 1):
            nothing_dropped = kept == suffix_count
            # `abbreviated` = how many of the kept suffixes become initials.
            for abbreviated in range(kept + 1):
                spelled_out = suffixes[abbreviated:kept]
                for suffix_abbr in abbreviations_as_first_letters(suffixes[:abbreviated]):
                    candidate = " ".join([core_abbr + suffix_abbr, *spelled_out]).strip()
                    all_results.append(candidate)
                    if nothing_dropped:
                        whole_results.append(candidate)

    return all_results, whole_results

In [5]:
# legal suffixes

def _split_legal_terms(name: str) -> Tuple[str, Sequence[str]]:
    possible_splits = []
    for legal_term in LEGAL_TERMS:
        if name.endswith(legal_term):
            possible_splits.append((name[:-len(legal_term)].strip(), legal_term))
    
    for legal_abbreviation in ALREADY_ABBREVIATIONS:
        if name.endswith(legal_abbreviation):
            possible_splits.append((name[:-len(legal_abbreviation)].strip(), legal_abbreviation))

    if not possible_splits:
        return name, []
    
    best_split = sorted((len(x[0]), x) for x in possible_splits)[0][1]

    main_name, new_legal_terms = _split_legal_terms(best_split[0])
    return main_name, new_legal_terms + [best_split[1]]

def _generate_all_legal_suffixes(legal_suffixes):
    """Return every spelling of a sequence of legal suffixes.

    Each suffix may stay as given or be replaced by any abbreviation that
    ``LEGAL_TERMS`` lists for it; the choices for all suffixes are combined
    and each combination is joined with single spaces. An empty
    ``legal_suffixes`` produces ``[""]``.
    """
    choices_per_suffix = []
    for suffix in legal_suffixes:
        alternatives = LEGAL_TERMS.get(suffix, ())
        choices_per_suffix.append((suffix,) + tuple(alternatives))

    spellings = []
    for combination in itertools.product(*choices_per_suffix):
        spellings.append(" ".join(combination))
    return spellings

def take_care_of_legal(name):
    """Abbreviate ``name`` taking its trailing legal terms into account.

    The legal terms are split off, the remainder is abbreviated by
    ``take_care_of_other_suffixes``, and every "whole" abbreviation is then
    paired with every spelling of the legal terms. When there is exactly one
    legal term, its upper-cased initial is also appended directly to each
    single-token abbreviation longer than one character.

    Returns:
        ``(all_results, whole_results)``. ``whole_results`` holds only the
        variants carrying the legal terms whenever any were found.
    """
    core_name, legal_suffixes = _split_legal_terms(name)
    all_results, whole_results = take_care_of_other_suffixes(core_name)

    suffix_spellings = _generate_all_legal_suffixes(legal_suffixes)

    legal_results = []
    for base in whole_results:
        for spelling in suffix_spellings:
            # An empty spelling means there were no legal terms at all.
            if spelling:
                legal_results.append((base + " " + spelling).strip())

    if len(legal_suffixes) == 1:
        for base in whole_results:
            if len(base) > 1 and " " not in base:
                # Fold the single legal term into the acronym as one more letter.
                legal_results.append(base + legal_suffixes[0][0].upper())

    combined = all_results + legal_results
    if legal_suffixes:
        return combined, legal_results
    return combined, whole_results


In [6]:
# Extra information in parentheses
def _extract_extra_information(name):
    """Split ``name`` around the last match of ``EXTRA_INFORMATION_REGEX``.

    Returns:
        ``(left, bracketed, right)``, each stripped, where ``bracketed`` is the
        matched text including its brackets; ``(name, None, None)`` when the
        pattern does not match.
    """
    last_match = None
    for last_match in EXTRA_INFORMATION_REGEX.finditer(name):
        pass  # only the final match is of interest

    if last_match is None:
        return name, None, None

    start, end = last_match.span()
    return name[:start].strip(), name[start:end].strip(), name[end:].strip()

def take_care_of_extra(name):
    """Abbreviate ``name``, treating bracketed text as optional extra information.

    Without a bracketed group this is just ``take_care_of_legal(name)``.
    Otherwise the text left of the group is processed recursively, the text
    right of it by ``take_care_of_legal``, and each left/right pairing is
    emitted twice: once with the bracketed text in between and once without.

    Returns:
        ``(all_results, whole_results)``. Both lists use only the "whole"
        results of the left part; they differ in whether the right part
        contributes all of its results or only its "whole" ones.
    """
    left_part, extra_information_group, right_part = _extract_extra_information(name)
    if not extra_information_group:
        return take_care_of_legal(name)

    left_whole = take_care_of_extra(left_part)[1]
    right_all, right_whole = take_care_of_legal(right_part)

    def _pair_with(right_options):
        # Every left/right pairing, first with and then without the bracketed text.
        paired = []
        for left_abbr, right_abbr in itertools.product(left_whole, right_options):
            paired.append(" ".join((left_abbr, extra_information_group, right_abbr)).strip())
            paired.append(" ".join((left_abbr, right_abbr)).strip())
        return paired

    return _pair_with(right_all), _pair_with(right_whole)
    

In [7]:

   
def generate(name: str) -> Generator[str, None, None]:
    """Yield lower-cased candidate abbreviations for an organisation name.

    The name is lower-cased, commas become spaces and full stops are removed
    before abbreviations are generated. Candidates shorter than two
    characters, or whose first token is shorter than two characters, are
    skipped. A candidate made of one repeated character is additionally
    yielded in counted form (e.g. "hhhh" is followed by "4h"). Duplicates are
    not removed.
    """
    normalized = name.lower().replace(",", " ").replace(".", "")
    candidates, _ = take_care_of_extra(normalized)

    for candidate in candidates:
        if len(candidate) >= 2 and len(candidate.split()[0]) >= 2:
            yield candidate.lower()

            distinct_characters = set(candidate)
            if len(distinct_characters) == 1:
                # Same character throughout: also offer "<count><char>".
                yield str(len(candidate)) + candidate[0].lower()
        
    

In [8]:
# Full organisation names mapped to the abbreviation they are known by.
# Expected values are compared case-insensitively against generate()'s output.
test_cases = {
    "Government Employees Insurance Company": "GEICO",  # maybe a special case?
    "Head, Heart, Hands, Health": "4H",
    "American Consultants League": "ACL",
    "Amyotrophic Lateral Sclerosis Association": "ALS Association",
    "Council of Actions United for Service Efforts": "CAUSE",
    "Conference of Minority Public Administrators": "COMPA",
    "The Minnesota Mining and Manufacturing Company": "3M",
    "Creative Information Technology, Inc.": "CITI", # source: https://pl.linkedin.com/company/creative-information-techology-inc
    "International Business Machines Corporation": "IBM",
    "British Overseas Airways Corporation": "BOAC"
}

# Names whose usual short form is not a plain first-letter abbreviation.
special_cases = {
    "International Crime Police Organization": "INTERPOL",
    "United Nations Children’s Fund": "UNICEF",
    "Henry and Richard Block": "H&R Block",
    "Shoulder of Pork and Ham": "SPAM",
    "Transport for Elderly and Disabled Persons": "TRANSED",
}

In [9]:
# When True every test case is printed; when False only those whose expected
# abbreviation is missing from the generated set.
ALL = True

for test, expected in test_cases.items():
    possibilities = set(generate(test))
    should_be = expected.lower()
    if not ALL and should_be in possibilities:
        continue
    print(test.ljust(60), ",".join(possibilities))

Government Employees Insurance Company                       geic,gei co,gei company,gei
Head, Heart, Hands, Health                                   hhhh,4h
American Consultants League                                  acl
Amyotrophic Lateral Sclerosis Association                    als association,als,alsa
Council of Actions United for Service Efforts                coause,coaufse,caufse,cause
Conference of Minority Public Administrators                 compa,cmpa
The Minnesota Mining and Manufacturing Company               mmamc,3m,mmam company,mmm co,tmm&m company,mm&m co,tmmm,tmmmc,mmam,tmmam,tmmamc,tmmm company,mmm,mmmc,tmmam co,mm&m company,mm&m,tmm&m,mmm company,mm&mc,tmmm co,tmm&mc,mmam co,tmmam company,tmm&m co
Creative Information Technology, Inc.                        citi,cit,cit inc
International Business Machines Corporation                  ibm corporation,ibm,ibm corp,ibmc
British Overseas Airways Corporation                         boa,boac,boa corp,boa corporation


In [10]:
# Print each special case next to its expected abbreviation and what was generated.
for test, expected in special_cases.items():
    possibilities = set(generate(test))
    should_be = expected.lower()
    print(test.ljust(60), should_be.ljust(20), ",".join(possibilities))

International Crime Police Organization                      interpol             icp,icp organization,icpo
United Nations Children’s Fund                               unicef               uncf
Henry and Richard Block                                      h&r block            hrb,harb,h&rb
Shoulder of Pork and Ham                                     spam                 sopah,sop&h,sph,soph,spah,sp&h
Transport for Elderly and Disabled Persons                   transed              tfe&dp,tedp,tfeadp,te&dp,tfedp,teadp


In [11]:
# These names must not produce any abbreviation. Each input is evaluated
# once so the failure message shows exactly the value that was asserted on.
for sample in ("GOOGLE", "nordia, inc."):
    produced = list(generate(sample))
    assert produced == [], produced

In [12]:
# Example: a name with a bracketed qualifier followed by two legal terms.
set(generate("TOYOTA MOTOR FINANCE (CHINA) COMPANY LIMITED"))

{'tmf',
 'tmf (china)',
 'tmf (china) co limited',
 'tmf (china) co ltd',
 'tmf (china) company limited',
 'tmf (china) company ltd',
 'tmf co limited',
 'tmf co ltd',
 'tmf company limited',
 'tmf company ltd'}