# ACRONYM: A CRitical Overview of word-play in europeaN research over 30 Years tiMe

In this notebook we tackle the highly important task of assessing the quality of project acronyms in Europe.

## Preamble

In [None]:
%run ../notebook_preamble.ipy

from sdg_mapping.cordis import load_cordis_projects, load_cordis_project_sdgs
from sdg_mapping.cordis.cordis_utils import FRAMEWORK_PROGRAMMES
from sdg_mapping.utils.sdg_utils import sdg_hex_color_codes, sdg_names
from sdg_mapping.sdg_index.sdg_index_utils import load_sdg_index

import os
import re
from itertools import chain
from collections import Counter

import tensorflow as tf
from umap import UMAP
import tensorflow_hub as hub
from scipy.spatial.distance import cosine
from sklearn.metrics import pairwise_distances, euclidean_distances
import hashlib
import seaborn as sns

from fuzzywuzzy import process
from spacy.tokens import Doc
import spacy
from string import punctuation
import wordninja

nlp = spacy.load('en_core_web_sm')

fig_dir = os.path.join(project_dir, 'reports', 'analysis_cordis_sdg_index')
os.environ["TFHUB_CACHE_DIR"] = '/tmp/tfhub'

In [None]:
projects = []

for fp in FRAMEWORK_PROGRAMMES:
    projects.append(load_cordis_projects(fp).set_index('rcn'))
    
projects = pd.concat(projects)

In [None]:
project_h2020_df = projects

In [None]:
print('Total projects:', projects.shape[0])

In [None]:
# project_h2020_df = load_cordis_projects('h2020').set_index('rcn')
project_h2020_df.head()

## True Acronyms Only

### Must have an acronym

In [None]:
project_h2020_df = project_h2020_df.dropna(subset=['acronym'])

In [None]:
print('Total projects:', project_h2020_df.shape[0])

### Short Acronyms

We don't count acronyms with fewer than 3 characters.

In [None]:
fig, ax = plt.subplots()
ax.hist(project_h2020_df['acronym'].str.len(), bins=40)
ax.set_xlabel('Acronym Length')
ax.set_ylabel('Frequency');
# ax.axvline(2, color='C3', linestyle='--')
# ax.axvline(12, color='C3', linestyle='--');

In [None]:
project_h2020_df = project_h2020_df[project_h2020_df['acronym'].str.len() > 2]

### Removing Regular Names

We can see that many of the so-called project acronyms are in fact just regular names. To count as a true acronym the token must have been supplied in full upper case by the applicant. No other casing is permitted.

In [None]:
# project_h2020_df = project_h2020_df[project_h2020_df['acronym'].str.isupper()]

In [None]:
def percent_upper(acronym):
    """Return the fraction of characters in ``acronym`` that are upper case.

    Args:
        acronym (str): Non-empty string to inspect.

    Returns:
        float: Upper-case character count divided by the total length.

    Raises:
        ZeroDivisionError: If ``acronym`` is empty.
    """
    n_upper = sum(1 for char in acronym if char.isupper())
    return n_upper / len(acronym)

In [None]:
project_h2020_df['percent_upper'] = project_h2020_df['acronym'].apply(percent_upper)

In [None]:
fig, ax = plt.subplots()
# `density` is a boolean flag; the previous 'normed' string only worked because it is truthy.
project_h2020_df['percent_upper'].plot.hist(cumulative=True, bins=100, density=True, histtype='step', ax=ax)
ax.set_xlim(0, 1)
ax.set_xlabel('Upper Case Fraction')
ax.set_ylabel('Cumulative Frequency (Norm)')
plt.tight_layout();

In [None]:
project_h2020_df['percent_upper'].describe()

In [None]:
print('Total projects with >50% upper case:', project_h2020_df[project_h2020_df['percent_upper'] > 0.5].shape[0])

### True Acronyms

Another criterion for acronyms is that all of the letters in the acronym are present in upper case in the title. This rules out instances where the apparent acronym is in fact simply a word from the title. For example the project with the acronym _STRUCTURALISM_ and title _The Roots of Mathematical Structuralism_ does not count as a true acronym because the acronym itself appears as a complete token in the title. On the other hand the title of the _SLATE_ project, *Submarine LAndslides and Their impact on Europe*, contains the letters of the acronym spread across multiple tokens. We consider this to satisfy the condition.

To check whether the acronym is indeed a true acronym of the title text, we check that the title contains sufficient upper case characters to form the acronym.

In [None]:
def contains_acronym(acronym, title):
    """Check whether the letters of ``acronym`` appear, in order, in ``title``.

    Punctuation is stripped from both strings and digits are stripped from the
    acronym. Any verbatim (case-sensitive) occurrence of the cleaned acronym is
    then removed from the title, so a title that merely restates the acronym
    (e.g. ``"ABC: A Big Cat"``) has to supply the letters elsewhere. The final
    comparison is case-insensitive.

    Args:
        acronym (str): Project acronym.
        title (str): Project title.

    Returns:
        bool: True if the cleaned acronym is a subsequence of the cleaned
        title; False otherwise, including when no characters of the acronym
        survive cleaning.
    """
    title = ''.join(t for t in title if t not in punctuation)
    acronym = ''.join(a for a in acronym if a not in punctuation and not a.isdigit())
    # An acronym made only of digits/punctuation would otherwise match every title.
    if not acronym:
        return False
    # Single left-to-right pass: each `in` test consumes the iterator up to and
    # including the matched character, which enforces the ordering. This is linear
    # in the title length and, unlike a '.*'-joined regex, also spans line breaks.
    remaining = iter(title.replace(acronym, '').upper())
    return all(letter in remaining for letter in acronym.upper())

In [None]:
def percentage_upper_case_match(acronym, title):
    """Fraction of the acronym's upper-case letters that the title can supply.

    Only the upper-case characters of ``acronym`` are considered. For each
    distinct letter, the number of matches is capped by how often that exact
    (upper-case) character occurs in ``title``.

    Args:
        acronym (str): Project acronym.
        title (str): Project title.

    Returns:
        Matched letter count divided by the number of upper-case letters in
        the acronym.
    """
    capitals = ''.join(char for char in acronym if char.isupper())
    needed = Counter(capitals)
    available = Counter(title)
    # A letter contributes at most as many matches as the title contains.
    matched = [min(available[letter], required) for letter, required in needed.items()]
    return np.sum(matched) / len(capitals)

In [None]:
u = (project_h2020_df.apply(lambda row: percentage_upper_case_match(row['acronym'], row['title']), axis=1))

In [None]:
plt.hist(u, bins=20, cumulative=True);

In [None]:
# def contains_acronym(acronym, title, min_ratio=80):
#     title = ''.join([t for t in title if t not in punctuation])
#     acronym = ''.join([a for a in acronym if t not in punctuation])
#     title = title.replace(acronym, '')
#     title_caps = ''.join([t for t in title if (t.isupper()) & (t in acronym)])
#     ratio = fuzz.ratio(acronym, title_caps)
#     if ratio >= min_ratio:
#         return True
#     else:
#         return False

In [None]:
project_h2020_df['contains_acronym'] = (project_h2020_df
                                  .apply(lambda row: contains_acronym(row['acronym'], row['title']), axis=1))
project_h2020_df = project_h2020_df[project_h2020_df['contains_acronym']]

In [None]:
project_h2020_df.shape[0]

### Substring Cheating

While we want the title to contain the requisite letters to form the supposed acronym, we also do not want the acronym (or substantial parts of it) to appear wholesale in the title.

**This is hard because many projects contain the acronym in the title as a clarification.** It's a bit of an edge case, so maybe we can leave it?

In [None]:
def is_honest(acronym, title, max_fuzz=80):
    """Flag titles that do not contain the acronym (or a near match) as a word.

    Common "clarification" patterns such as ``"ACRO: ..."`` or ``"(ACRO)"``
    are stripped from the title first. The remaining title is tokenised with
    the module-level spaCy pipeline ``nlp`` and every token longer than two
    characters is fuzzy-matched against the lower-cased acronym.

    Args:
        acronym (str): Project acronym.
        title (str): Project title.
        max_fuzz (int): Highest fuzzy-match score a title token may reach
            before the acronym is considered to appear in the title.

    Returns:
        bool: False if any title token scores above ``max_fuzz``, else True.
    """
    for stub in ['{}:', '{} -', '({})', '( {} )', '{}-', '{} :', '- {}', ': {}', '[{}]']:
        # str.replace returns a new string; it must be re-bound for the
        # clarification patterns to actually be removed.
        title = title.replace(stub.format(acronym), '')
    title_words = [t.lower_ for t in nlp(title) if len(t) > 2]
    fuzzes = process.extract(acronym.lower(), title_words)
    # The second element of each returned match is compared against the threshold.
    return not any(f[1] > max_fuzz for f in fuzzes)

In [None]:
project_h2020_df['is_honest'] = (project_h2020_df
                                  .apply(lambda row: is_honest(row['acronym'], row['title']), axis=1))
project_h2020_df = project_h2020_df[project_h2020_df['is_honest']]

### Title Cheaters

There are many titles which are in all or mostly upper case. These people are clearly trying their luck, hoping that whatever acronym they have chosen will fortuitously arise from the characters in their title. A true acronym must be created with intention, so these are to be removed.

In [None]:
def percent_upper(text):
    """Return the fraction of characters in ``text`` that are upper case.

    NOTE: this rebinds the ``percent_upper`` defined earlier in the notebook.
    """
    upper_flags = [char.isupper() for char in text]
    return np.sum(upper_flags) / len(text)

In [None]:
title_percent_upper = project_h2020_df['title'].apply(percent_upper)

In [None]:
fig, ax = plt.subplots()
ax.hist(title_percent_upper, bins=50)
ax.set_xlabel('Fraction Upper Case')
ax.set_ylabel('Frequency');

In [None]:
max_thresh = .6
project_h2020_df = project_h2020_df[title_percent_upper <= max_thresh]

In [None]:
project_h2020_df.shape[0]

## Text Cleaning

It's important that we are picking up indirect semantic relationships rather than simply assessing whether the acronym is present in the text itself. We will remove any substring of the objective text that is equivalent to the acronym.

In [None]:
# def remove_acronym(acronym, text):
#     return (text
#             .replace(acronym, '')
#             .replace(acronym.title(), '')
#             .replace(acronym.upper(), '')
#             .replace(acronym.lower(), '')
#            )

def remove_acronym(acronym, text, threshold=80):
    """Drop the tokens of ``text`` that fuzzily match ``acronym``.

    The text is split with the tokenizer of the module-level spaCy pipeline
    ``nlp``. Up to ten of the best fuzzy matches for the lower-cased acronym
    are looked up among the distinct lower-cased tokens; those scoring at
    least ``threshold`` are removed.

    Args:
        acronym (str): Project acronym.
        text (str): Text to clean.
        threshold (int): Minimum fuzzy-match score for a token to be removed.

    Returns:
        str: Text of a new ``Doc`` built from the surviving tokens.
    """
    tokenized = nlp.tokenizer(text)
    vocabulary = {token.lower_ for token in tokenized}
    matches = process.extract(acronym.lower(), vocabulary, limit=10)
    blocked = [match[0] for match in matches if match[1] >= threshold]
    kept = [token.text for token in tokenized if token.lower_ not in blocked]
    return Doc(tokenized.vocab, words=kept).text

In [None]:
project_h2020_df['text_mod'] = (project_h2020_df
                                .apply(lambda row: remove_acronym(row['acronym'], row['objective']), axis=1))

In [None]:
# project_h2020_df = project_h2020_df[~project_h2020_df['acronym'].str.contains(' ')]

In [None]:
# module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
# path = hashlib.sha1(module_url.encode("utf8")).hexdigest()
model = hub.load('/Users/grichardson/models/universal-sentence-encoder_4')

In [None]:
def chunks(lst, n):
    """Lazily yield consecutive slices of ``lst`` holding at most ``n`` items each."""
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

        
objective_embeddings = []
for chunk in chunks(project_h2020_df['text_mod'], 1000):
    objective_embeddings.append(model(chunk).numpy())
    
objective_embeddings = np.concatenate(objective_embeddings, axis=0)

In [None]:
acronym_embeddings = model(project_h2020_df['acronym'])
acronym_embeddings = acronym_embeddings.numpy()

In [None]:
dists = []
for ac, ob in zip(acronym_embeddings, objective_embeddings):
    dists.append(cosine(ac, ob))

In [None]:
project_h2020_df['dists'] = dists
project_h2020_df['sim'] = -1 * (project_h2020_df['dists'] -1)

In [None]:
fig, ax = plt.subplots()
ax.hist(dists, bins=50)
ax.set_xlabel('Cosine Distance')
ax.set_ylabel('Frequency');

In [None]:
fig, ax = plt.subplots()
ax.hist(project_h2020_df['sim'], bins=50)
ax.set_xlabel('Cosine Similarity')
ax.set_ylabel('Frequency');

## Analysis

### Best Acronyms

Sorting by distance we can see the acronyms that are most similar to the text of the project objective (without themselves appearing in the text).

Some good examples include:

- TECTONIC: The physics of Earthquake faulting: learning from laboratory earthquake prediCTiON to Improve forecasts of the spectrum of tectoniC failure modes
- ORCA: Optimizing Research tools for Cetaceans in Archaeology
- GATTACA: Genetics of Alternative Transcript Abundance upon immune Cellular Activation
- MAGMA: Melting And Geodynamic Models of Ascent

In [None]:
pd.set_option('display.max_colwidth', 100)

In [None]:
project_h2020_df.sort_values('dists')[['acronym', 'title']][:20]

### Worst Acronyms

At the other end of the spectrum, we have acronyms that have little semantic relationship to the objectives of the project. These include acronyms that aren't real words (or common acronyms for phrases) and common words that are found generally across many topics.

- IMPRESS: IMproving Preparedness and Response of HEalth Services in major crises
- SMOOTH: SMart rObOTs for fire-figHting
- HMCS: Handheld Molecular Contaminant Screener
- AWESOME: Advanced Wind Energy Systems Operation and Maintenance Expertise

In [None]:
(project_h2020_df
 .sort_values('dists', ascending=False)[['acronym', 'title']][:20])

### Acronym-Objective Semantic Overlap

In [None]:
# Fit one projection on both sets of vectors so that acronyms and objectives share
# a single 2D coordinate system. Two independent fits produce unrelated layouts,
# which cannot be meaningfully overlaid in the same scatter plot.
umap = UMAP()
n_acronyms = len(acronym_embeddings)
umap_vecs = umap.fit_transform(np.concatenate([acronym_embeddings, objective_embeddings], axis=0))
umap_ac_vecs = umap_vecs[:n_acronyms]
umap_ob_vecs = umap_vecs[n_acronyms:]

In [None]:
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(umap_ac_vecs[:, 1], umap_ac_vecs[:, 0], alpha=.02)
ax.scatter(umap_ob_vecs[:, 1], umap_ob_vecs[:, 0], alpha=.02)
ax.axis('off');

### Acronym Length and Similarity

In [None]:
project_h2020_df['acronym_length'] = project_h2020_df['acronym'].str.len()
project_h2020_df = project_h2020_df[project_h2020_df['acronym_length'] < 12]

In [None]:
fig, axs = plt.subplots(ncols=2, figsize=(10, 3.5))
sns.boxplot(data=project_h2020_df, x='acronym_length', y='sim', color='C0', ax=axs[0])
mean_length_sims = project_h2020_df.groupby('acronym_length')['sim'].mean()
axs[0].scatter(range(len(mean_length_sims)), mean_length_sims, zorder=5, color='white')
axs[0].set_xlabel('Acronym Length')
axs[0].set_ylabel('Acronym-Objective Similarity')
axs[1].hist(project_h2020_df['acronym_length'], bins=8)
axs[1].set_xlabel('Acronym Length')
axs[1].set_ylabel('Frequency')
plt.tight_layout();

### Start Share

The best acronyms are made exclusively from letters that appear only as the first character of words in the title. We calculate the fraction of letters in each acronym that meet this criterion as another metric of acronym quality.

In [None]:
def start_share(acronym, text):
    """Fraction of acronym letters that start a word of ``text``, in order.

    The text is split on single spaces. Each acronym character is matched
    (case-sensitively) against the first remaining word that starts with it;
    on a match, that word and all words before it are discarded before the
    next character is considered. Unmatched characters leave the word list
    untouched.

    Args:
        acronym (str): Project acronym.
        text (str): Project title.

    Returns:
        Number of matched characters divided by the acronym length.
    """
    remaining = text.split(' ')
    hits = []
    for letter in acronym:
        position = next(
            (idx for idx, word in enumerate(remaining) if word.startswith(letter)),
            None,
        )
        if position is not None:
            hits.append(1)
            remaining = remaining[position + 1:]
    return np.sum(hits) / len(acronym)

In [None]:
start_share_score = project_h2020_df.apply(lambda row: start_share(row['acronym'], row['title']), axis=1)
project_h2020_df['start_share'] = start_share_score

In [None]:
fig, ax = plt.subplots()
# `density` is a boolean flag; the previous 'normed' string only worked because it is truthy.
ax.hist(project_h2020_df['start_share'], bins=100, cumulative=True, histtype='step', linewidth=2, density=True)
ax.set_xlabel('Title Acronymity')
ax.set_ylabel('Cumulative Frequency (Norm)')
ax.set_xlim(0, 1);

Surprisingly we see that over 70% of projects have an acronym that satisfies the condition.

In [None]:
fig, ax = plt.subplots()
ax.scatter(project_h2020_df['start_share'], project_h2020_df['sim'], alpha=.1)
ax.set_xlabel('Title Acronymity')
ax.set_ylabel('Acronym-Objective Similarity');

We see virtually no trend in the relationship between title acronymity and the text similarity. However it does permit us to define a new selection criterion for the best acronyms - those which maximise both metrics.

### Acronym-Objective Similarity Over Time

In [None]:
project_h2020_df = project_h2020_df[(project_h2020_df['start_date'].dt.year <= 2020) 
                                    & (project_h2020_df['start_date'].dt.year >= 1990)]

In [None]:
grouper = pd.Grouper(freq='Y', key='start_date')

In [None]:
fig, ax = plt.subplots()
m = project_h2020_df.groupby(grouper)['sim'].mean().rolling(3).mean()
m.plot(linewidth=4)
s = project_h2020_df.groupby(grouper)['sim'].std().rolling(3).mean()
(s + m).plot(color='C0')
(m - s).plot(color='C0')
ax.set_xlabel('Start Year')
ax.set_ylabel('Acronym-Objective Similarity');

In [None]:
lens = []
years = []
for year, group in project_h2020_df.groupby(grouper)['acronym']:
    lens.append(group.str.len().mean())
    years.append(year)
    
length_time = pd.Series(data=lens, index=years)

### Acronym Lengths

In [None]:
fig, ax = plt.subplots()
m = length_time.rolling(3).mean()
# Rolling spread of the yearly mean lengths (previously computed with .mean() by mistake);
# only needed if the error bars below are re-enabled.
std = length_time.rolling(3).std()
# ax.errorbar(m.index, m, yerr=std)
ax.plot(m)
ax.set_xlabel('Start Year')
ax.set_ylabel('Mean Acronym Length')
plt.tight_layout();

### Fraction of Projects with Acronyms

In [None]:
acronym_counts = project_h2020_df.groupby(grouper)['dists'].count()
projects = projects[(projects['start_date'].dt.year <= 2020) 
                    & (projects['start_date'].dt.year >= 1990)]
project_counts = projects.groupby(grouper)['title'].count()

In [None]:
acronym_frac_time = (acronym_counts / project_counts) * 100

In [None]:
fig, ax = plt.subplots()
acronym_frac_time.rolling(3).mean().plot(ax=ax)
ax.set_xlabel('Start Year')
ax.set_ylabel('Projects with Acronym (%)')
plt.tight_layout();

### Acronyms by Country

In [None]:
def generate_eu_country_codes():
    """Return the sorted alpha-2 codes of countries listed in the 'EU' regional bloc.

    Reads ``countries_restcountries_api.json`` from ``{data_path}/raw/countries``
    and collects the ``alpha2Code`` of every row whose ``regionalBlocs`` entries
    include one with acronym ``'EU'``. 'GB' is reported as 'UK'.

    Returns:
        list: Sorted country codes.
    """
    countries = pd.read_json(f'{data_path}/raw/countries/countries_restcountries_api.json')
    eu_codes = [
        code
        for code, blocs in zip(countries['alpha2Code'], countries['regionalBlocs'])
        for bloc in blocs
        if bloc['acronym'] == 'EU'
    ]
    # Britain called 'UK' in CORDIS
    return sorted('UK' if code == 'GB' else code for code in eu_codes)

In [None]:
europe = generate_eu_country_codes()

In [None]:
fig, ax = plt.subplots(figsize=(8, 8))
project_h2020_df.groupby('coordinator_country')['sim'].mean().reindex(europe).dropna().sort_values().plot.barh(ax=ax)

In [None]:
fig, ax = plt.subplots(figsize=(8, 8))
# NOTE(review): this cell previously divided an undefined name `c` by the per-country
# project counts, which fails on a fresh kernel. The numerator is assumed to be the
# number of retained acronym projects per coordinator country - confirm.
acronym_country_counts = project_h2020_df.groupby('coordinator_country')['acronym'].count()
project_country_counts = projects.groupby('coordinator_country')['title'].count().reindex(europe).dropna()
# Countries outside `europe` have no denominator after alignment and are dropped.
(acronym_country_counts / project_country_counts * 100).dropna().sort_values().plot.barh(ax=ax)
ax.set_xlabel('Projects with Acronym (%)');

## Funding

Does the amount of funding correspond to the quality of the acronym?

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
ss = StandardScaler()

In [None]:
amounts = []
ids = []
for call, group in project_h2020_df.groupby('call'):
    if group.shape[0] >= 25:
        std = group['ec_max_contribution'].std()
        mean = group['ec_max_contribution'].mean()
        if std > 0:
            a = (group['ec_max_contribution'] - mean) / std
            ids.extend(group.index.values)
            amounts.extend(a)

# acro_fund_df = pd.DataFrame(data={'en': eng, 'non_en': non_eng})

In [None]:
amounts_normed = pd.Series(amounts, index=ids)

In [None]:
# x/y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.regplot(x=project_h2020_df.loc[amounts_normed.index, 'sim'], y=amounts_normed);

### Does English Score Higher?

In [None]:
from scipy.stats import ttest_ind, ttest_rel

In [None]:
# Spell-check dictionaries used to decide whether an acronym is a real word.
# NOTE(review): `enchant` is not imported in the visible cells - presumably it is
# provided by the preamble; confirm.
# NOTE(review): British English is normally tagged "en_GB" in enchant - confirm
# that "en_UK" resolves to a dictionary on the target machine.
en_uk = enchant.Dict("en_UK")
en_us = enchant.Dict("en_US")
fr = enchant.Dict("fr")  # not referenced in the visible part of this notebook

In [None]:
def is_english(acronym):
    """Return True if any part of ``acronym`` is a British or American English word.

    The acronym is split on spaces if it contains one, otherwise on hyphens.
    Each non-empty part is checked against the module-level ``en_uk`` and
    ``en_us`` enchant dictionaries.

    Args:
        acronym (str): Project acronym.

    Returns:
        bool: True as soon as one part is accepted by either dictionary,
        False if none is.
    """
    separator = ' ' if ' ' in acronym else '-'
    for part in acronym.split(separator):
        # Doubled or leading/trailing separators produce empty parts, which
        # enchant refuses to spell-check.
        if not part:
            continue
        # The second check previously repeated `en_uk`, so US spellings were never tried.
        if en_uk.check(part) or en_us.check(part):
            return True
    return False

In [None]:
project_h2020_df['is_english'] = project_h2020_df['acronym'].apply(is_english)

In [None]:
eng = []
non_eng = []
for call, group in project_h2020_df.groupby('call'):
    if group.shape[0] >= 25:
        std = group['ec_max_contribution'].std()
        mean = group['ec_max_contribution'].mean()
        if std > 0:
            amounts = (group['ec_max_contribution'] - mean) / std
            eng.extend(amounts[group['is_english']])
            non_eng.extend(amounts[~group['is_english']])

eng = np.array(eng)
eng = eng[~pd.isnull(eng)]
non_eng = np.array(non_eng)
non_eng = non_eng[~pd.isnull(non_eng)]

In [None]:
fig, ax = plt.subplots()
# `density` is a boolean flag; the previous 'normed' string only worked because it is truthy.
ax.hist(eng, bins=200, cumulative=True, density=True, histtype='step')
ax.hist(non_eng, bins=200, cumulative=True, density=True, histtype='step')
ax.set_xlim(-3, 3);

In [None]:
ttest_ind(eng, non_eng)

## Inspiration Index

1. Query google word2vec vectors for 30 most similar terms to each acronym
2. Use queries to create edgelist and build a network
3. Maybe threshold edges
4. Community detection
5. Use tf-hub model to calculate vector for the community
6. Look at distance between abstract text and community of acronym

In [None]:
a = 'DOG'
s = 'Doing Odd Gurns'

In [None]:
r = '.*'.join(a)
m = re.match(r, s)

In [None]:
m.pos

In [None]:
from gensim.models import Word2Vec, KeyedVectors

In [None]:
w2v = KeyedVectors.load_word2vec_format('/Users/grichardson/nesta/manifesto/data/external/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [None]:
def get_vector(t):
    """Look up the word2vec vector for token ``t`` in the module-level ``w2v`` model.

    Args:
        t (str): Token to look up.

    Returns:
        The vector returned by ``w2v.get_vector``, or None when the token is
        not in the vocabulary.
    """
    try:
        return w2v.get_vector(t)
    except KeyError:
        # Only out-of-vocabulary lookups are expected here; anything else
        # (including KeyboardInterrupt) should propagate instead of being hidden.
        return None

In [None]:
from itertools import chain

In [None]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [None]:
def find_possible_acronyms(title, search_topn, return_topn=None, max_fuzz=80):
    """Suggest words that are related to ``title`` and can be spelt from its letters.

    The title is tokenised with ``nlp``; the word2vec vectors of tokens longer
    than two characters are averaged and the ``search_topn`` nearest vocabulary
    entries are retrieved. Entries are split on underscores and parts longer
    than three characters become candidates. A candidate is kept when it
    contains no '#', does not fuzzy-match any title token above ``max_fuzz``,
    has its characters appearing in order in the lower-cased title, and has a
    vector of its own.

    Args:
        title (str): Project title.
        search_topn (int): Number of nearest neighbours to retrieve.
        return_topn (int, optional): Maximum number of suggestions returned;
            None returns all of them.
        max_fuzz (int): Highest fuzzy-match score allowed between a candidate
            and any title token.

    Returns:
        list: ``(candidate, similarity)`` tuples sorted by decreasing
        similarity to the mean title vector, the similarity being formatted
        as a two-decimal string. Empty when no title token has a vector.
    """
    title_doc = nlp(title)
    title_tokens = [t.lower_ for t in title_doc]
    title_words = [t.lower_ for t in title_doc if len(t) > 2]

    title_vecs = [vec for vec in map(get_vector, title_words) if vec is not None]
    if not title_vecs:
        # Averaging nothing would give a NaN query vector for the similarity search.
        return []
    doc_vec = np.mean(np.array(title_vecs), axis=0)

    close_matches = w2v.similar_by_vector(doc_vec, topn=search_topn)
    close_matches = set(chain(*[[t.lower() for t in m[0].split('_') if len(t) > 3] for m in close_matches]))

    title_lower = title.lower()
    acronyms = []
    sims = []
    for candidate in close_matches:
        if '#' in candidate:
            continue
        fuzzes = process.extract(candidate, title_tokens)
        if any(f[1] > max_fuzz for f in fuzzes):
            continue

        # Vocabulary entries may contain regex metacharacters ('.', '+', ...),
        # so every character is escaped before building the in-order pattern.
        pattern = '.*'.join(re.escape(char) for char in candidate)
        if re.search(pattern, title_lower) is None:
            continue

        candidate_vec = get_vector(candidate)
        if candidate_vec is not None:
            acronyms.append(candidate)
            sims.append(1 - cosine(candidate_vec, doc_vec))
    acronyms = [(a, f'{s:.2f}') for s, a in sorted(zip(sims, acronyms), reverse=True)]

    return acronyms[:return_topn]

In [None]:
titles = project_h2020_df.sample(10).title.values

In [None]:
for title in titles:
    print(title)
    for result in find_possible_acronyms(title, 400, 10):
        print(result)
    print('===')