# Word Embeddings for World Bank ICR Reports

In [None]:
import re
import glob
import os
import pickle
from collections import Counter

from matplotlib import pyplot as plt
import nltk
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import plotly.graph_objects as go
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.model_selection import RepeatedKFold, cross_val_score, cross_validate
from numpy import mean
from numpy import std

import spacy
from tqdm import tqdm

# from utils import FILES, FILE2ID, FILE2SECTOR, read_file
nltk.download('punkt')
# nlp = spacy.load("en_core_web_sm",  disable=["tagger", "ner", "parser"])
# nlp.add_pipe(nlp.create_pipe('sentencizer'))

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# copy reports
!cp /content/drive/MyDrive/WorldBank/* .

# unzip reports
!unzip -q icr_text_docs.zip -d icr
!mv icr/documents/* icr/ && rm -rf icr/documents *.zip

In [None]:
!ls

In [None]:
# Create lookup dicts
# glob returns paths in arbitrary, filesystem-dependent order; sorting makes the
# row order of every DataFrame built from FILES (and therefore the
# cross-validation splits) reproducible between runs.
FILES = sorted(glob.glob("icr/P0*_0*-*"))

def file2id(filename):
    """Return the 7-character project id that prefixes a report file name.

    Args:
        filename: path to a report located directly inside an ``icr``
            directory, e.g. ``icr/P012345_0123-report.txt``.

    Returns:
        The first seven characters of the file's base name (``P012345``).

    Raises:
        ValueError: if the file is not directly inside a directory named ``icr``.
    """
    directory, basename = os.path.split(filename)
    # Validate explicitly (asserts vanish under -O) and take the id from the
    # base name so prefixes such as './icr/' do not shift the slice.
    if os.path.basename(directory) != 'icr':
        raise ValueError(f"expected a path inside 'icr/', got {filename!r}")
    return basename[:7]

# Report path -> project id.
FILE2ID = dict(zip(FILES, map(file2id, FILES)))

# Load the project/sector table; a missing parent sector becomes the literal
# string 'None' so every project carries a string label.
sector_df = pd.read_csv('clean_dli_pdo_embeds_sector.csv')
sector_df['parent_sector_name'] = sector_df['parent_sector_name'].fillna('None')

# Project id -> parent sector name (on duplicate ids the last row wins).
ID2SECTOR = {
    projectid: sector_name
    for projectid, sector_name in sector_df[['id', 'parent_sector_name']].values
}

# Report path -> parent sector name; a project id missing from the table
# raises KeyError here.
FILE2SECTOR = {path: ID2SECTOR[project_id] for path, project_id in FILE2ID.items()}

def file2words(file):
    """Return the filtered, lemmatized, stop-word-free tokens of a report file.

    The file is decoded with ``read_file`` (UTF-8 first, ISO-8859-15 as a
    fallback). Previously the decoding loop never stopped after a successful
    read, so every file ended up re-decoded as ISO-8859-15.

    Args:
        file: path of the text file to tokenize.

    Returns:
        A list of lower-cased tokens that are longer than two characters and
        present in ``WORDS``, lemmatized, with members of ``STOPWORDS`` removed.
    """
    # NOTE(review): WORDS, lemmatizer and STOPWORDS are not defined in the
    # visible part of this notebook - confirm they exist before calling this.
    text = read_file(file)

    lowered = ((w, w.lower()) for w in nltk.wordpunct_tokenize(text))
    valid = [low for w, low in lowered if low in WORDS and len(w) > 2]

    # lemmatize
    valid = [lemmatizer.lemmatize(w) for w in valid]

    # remove stop words
    return [w for w in valid if w not in STOPWORDS]

def get_most_common(words):
    """Return ``(word, count)`` pairs sorted from most to least frequent.

    Tokens made up only of digits are ignored. Ties keep first-occurrence
    order. A single ``Counter`` replaces the earlier FreqDist -> dict ->
    Counter round trip and yields the same result.

    Args:
        words: iterable of string tokens.

    Returns:
        A list of ``(word, count)`` tuples.
    """
    return Counter(word for word in words if not word.isdigit()).most_common()

    
def read_file(file, encodings=('utf-8', 'iso-8859-15')):
    """Read a text file, trying each encoding in turn.

    Args:
        file: path of the file to read.
        encodings: encodings to try, in order. The first one that decodes the
            whole file wins.

    Returns:
        The decoded file contents as a string.

    Raises:
        UnicodeDecodeError: the error from the last encoding tried, if none of
            them can decode the file. (A bare ``raise UnicodeDecodeError``
            would itself fail with TypeError because the exception requires
            constructor arguments.)
        ValueError: if ``encodings`` is empty.
    """
    last_error = None
    for encoding in encodings:
        try:
            with open(file, 'r', encoding=encoding) as f:
                return f.read()
        except UnicodeDecodeError as err:
            last_error = err
    if last_error is None:
        raise ValueError("read_file needs at least one encoding to try")
    raise last_error

def project(embeddings, dims=2):
    """Project embeddings onto their leading principal components.

    Args:
        embeddings: 2-D array-like accepted by ``PCA.fit_transform``.
        dims: number of principal components to keep. Any value PCA accepts
            works; previously only 2 and 3 returned a result and every other
            value silently returned ``None``.

    Returns:
        A dict mapping ``'PC1'``, ``'PC2'``, ... to 1-D arrays holding the
        corresponding component for every input row.
    """
    pca = PCA(n_components=dims)
    projections = pca.fit_transform(embeddings)
    # One key per fitted component; copy so each value is an independent
    # contiguous 1-D array, as with the earlier hsplit/flatten version.
    return {
        f'PC{idx + 1}': projections[:, idx].copy()
        for idx in range(projections.shape[1])
    }

def clean_sentences(text):
  """Split text into sentences and strip each down to its plain words.

  Every sentence is word-tokenized; only purely alphabetic tokens longer than
  two characters are kept and re-joined with single spaces. A sentence with no
  surviving tokens becomes an empty string (it is not dropped).
  """
  def _is_word(token):
    # alphabetic only, and longer than two characters
    return token.isalpha() and len(token) > 2

  return [
      " ".join(filter(_is_word, nltk.tokenize.word_tokenize(sentence)))
      for sentence in nltk.tokenize.sent_tokenize(text)
  ]

## TF-IDF

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# Download the NLTK resources named 'punkt' and 'stopwords'.
nltk.download('punkt')
nltk.download('stopwords')
# English stop words as a set, used for membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))
# Translation table that deletes every character in string.punctuation.
table = str.maketrans('', '', string.punctuation)

dfs = []
for file in FILES:
  text = read_file(file)
  dfs.append({'text': text, 'file': file})
df = pd.DataFrame(dfs)
df['sector'] = df.file.apply(lambda x: FILE2SECTOR[x])
df.sector = df.sector.astype('category')

# clean sectors column
sectors = [x for x in df.sector.unique() if not x.startswith('(H)')]
df = df[df.sector.isin(sectors)]

# get PCs
pcs = pd.DataFrame({'sector':df.sector, 'project': df.file.apply(lambda x: FILE2ID[x]), **project(X, dims=3)})

## 1) Lower
df["text"] = df["text"].str.lower()
## 2) Remove tags
df["text"] = df.apply(lambda x: re.sub("<[^>]*>", "", x["text"]), axis=1)
## 3) Tokenize
df["text_proc"] = df.apply(lambda x: word_tokenize(x["text"]), axis=1)
## 4) Remove punctuation
df["text_proc"] = df.apply(lambda x: [w.translate(table) for w in x["text_proc"]], axis=1)
## 5) Remove non-alpha
df["text_proc"] = df.apply(lambda x: [w for w in x["text_proc"] if w.isalpha()], axis=1)
## 6) Remove stop-words  

df["text_proc"] = df.apply(lambda x: [w for w in x["text_proc"] if not w in stop_words], axis=1)
## 7) Reformat to have a single text. 
df["text_proc_res"] = df.apply(lambda x: ' '.join(x["text_proc"]), axis=1)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

vec = TfidfVectorizer(stop_words='english', max_df = 0.95, min_df=2, max_features=1000)
x = vec.fit_transform(df["text_proc_res"])

print(x.shape)
# reduce dimensionality
svd = TruncatedSVD(n_components=100)
res = svd.fit_transform(x)

res.shape

In [None]:
from sklearn import svm
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from numpy import mean
from numpy import std
# Cross-validate a linear SVM that predicts the sector from the reduced
# TF-IDF features (10 folds, repeated 3 times).
df.sector = df.sector.astype('category')
y = df["sector"].values
X = res
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
model = svm.SVC(kernel='linear', C=1, decision_function_shape='ovo')

metrics = cross_validate(model, X, y, scoring=['precision_macro', 'recall_macro'], cv=cv, n_jobs=-1)

# Report mean (std) over all folds. The recall std used to be printed negated.
print('Precision: %.3f (%.3f)' % (mean(metrics["test_precision_macro"]), std(metrics["test_precision_macro"])))
print('Recall: %.3f (%.3f)' % (mean(metrics["test_recall_macro"]), std(metrics["test_recall_macro"])))

In [None]:
# 3-D scatter of the TF-IDF principal components, one trace per sector.
fig = plotly.subplots.make_subplots(rows=1, cols=1)

sectors = sorted(name for name in pcs.sector.unique() if not name.startswith('(H)'))
focus_sectors = ['Education', 'Health', 'Water/Sanit/Waste']

for sector in sectors:
    sector_df = pcs[pcs['sector'] == sector]
    if len(sector_df) == 0:
        print(f"Skipping {sector}, no matches found")
        continue

    # Hovering a marker shows the project id.
    trace = go.Scatter3d(
        mode='markers',
        x=sector_df.PC1,
        y=sector_df.PC2,
        z=sector_df.PC3,
        text=sector_df.project,
        marker=dict(size=10),
        name=sector_df.sector.values[0],
        hovertemplate='%{text}',
    )
    fig.add_trace(trace)


fig.update_layout(
    height=800,
    title_text='World Bank ICR Reviews Term Frequency-Inverse Document Frequency',
)
fig.update_traces(textposition='top center')

In [None]:
# 2-D scatter (PC1 vs PC2) of the TF-IDF projections for the focus sectors.
fig = plotly.subplots.make_subplots(rows=1, cols=1)

sectors = sorted([x for x in pcs.sector.unique() if not x.startswith('(H)')])
focus_sectors = ['Education', 'Health' ,'Water/Sanit/Waste', ]

for sector in focus_sectors:
    sector_df = pcs[pcs['sector'] == sector]
    if sector_df.empty:
        print(f"Skipping {sector}, no matches found")
        continue

    fig.add_trace(
        go.Scatter(mode='markers',
                   x=sector_df.PC1, y=sector_df.PC2,
                   text=sector_df.project,
                   marker=dict(
                           size=10,
                   ),
                   name = sector_df.sector.values[0],
                   hovertemplate = '%{text}',
                    )
        )


fig.update_layout(
    height=800,
    # Plotly renders '<br>' as a line break in titles; a '\n' is not shown as
    # one. This matches the '<br>' titles used by the later plots.
    title_text='World Bank ICR Reviews<br>Term Frequency-Inverse Document Frequency Embeddings'
)
fig.update_traces(textposition='top center')

## Topic Modeling

In [None]:
from time import time
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

# The 20-newsgroups corpus is not fetched here: the topic models below are
# fitted on the ICR reports in `df`, and `data` is assigned from
# df.text_proc_res before its first use, so the download was wasted work.
n_features = 1000  # max_features for the vectorizers
n_components = 10  # number of topics
n_top_words = 20  # words shown per topic
n_samples = len(df)  # number of documents; only used in progress messages

def plot_top_words(model, feature_names, n_top_words, title):
    """Draw one horizontal bar chart of the top-weighted words per topic.

    Args:
        model: fitted decomposition model exposing ``components_`` (one row
            of term weights per topic).
        feature_names: sequence mapping a column index to its term.
        n_top_words: number of highest-weighted terms to show per topic.
        title: figure title.

    The grid has up to five columns and as many rows as the topic count needs,
    so it no longer fails for more than ten topics or leaves empty panels for
    fewer. Ten topics still give the original 2x5 grid at 30x15 inches.
    """
    n_topics = len(model.components_)
    ncols = max(1, min(5, n_topics))
    nrows = max(1, -(-n_topics // 5))  # ceiling division
    fig, axes = plt.subplots(nrows, ncols, figsize=(6 * ncols, 7.5 * nrows),
                             sharex=True, squeeze=False)
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        # indices of the n_top_words largest weights, largest first
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f'Topic {topic_idx +1}',
                     fontdict={'fontsize': 30})
        ax.invert_yaxis()
        ax.tick_params(axis='both', which='major', labelsize=20)
        for i in 'top right left'.split():
            ax.spines[i].set_visible(False)

    # hide panels of a partially filled last row
    for ax in axes[n_topics:]:
        ax.set_visible(False)
    fig.suptitle(title, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()


# Vectorize the preprocessed ICR report texts. A few heuristics filter out
# uninformative terms early on: common English words, words occurring in only
# one document and words occurring in more than 95% of the documents are
# removed.


def _feature_names(vectorizer):
    """Return the vocabulary terms of a fitted vectorizer in column order."""
    # Newer scikit-learn releases replaced get_feature_names() with
    # get_feature_names_out(); prefer the new name, fall back to the old one.
    if hasattr(vectorizer, 'get_feature_names_out'):
        return vectorizer.get_feature_names_out()
    return vectorizer.get_feature_names()


t0 = time()
data = df.text_proc_res.values
# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   stop_words='english')

tfidf = tfidf_vectorizer.fit_transform(data)
print("done in %0.3fs." % (time() - t0))

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data)
print("done in %0.3fs." % (time() - t0))
print()

# Fit the NMF model
# NOTE(review): the `alpha` keyword of NMF appears to be deprecated in recent
# scikit-learn releases in favour of alpha_W / alpha_H, whose scaling differs -
# confirm against the installed version before upgrading.
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))


tfidf_feature_names = _feature_names(tfidf_vectorizer)
plot_top_words(nmf, tfidf_feature_names, n_top_words,
               'Topics in NMF model (Frobenius norm)')

# Fit the NMF model
print('\n' * 2, "Fitting the NMF model (generalized Kullback-Leibler "
      "divergence) with tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

tfidf_feature_names = _feature_names(tfidf_vectorizer)
plot_top_words(nmf, tfidf_feature_names, n_top_words,
               'Topics in NMF model (generalized Kullback-Leibler divergence)')

print('\n' * 2, "Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

tf_feature_names = _feature_names(tf_vectorizer)
plot_top_words(lda, tf_feature_names, n_top_words, 'Topics in LDA model')

In [None]:
from time import time
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

# Settings for this second topic-modelling run.
n_features = 1000  # passed as max_features to the vectorizers below
n_components = 4  # number of topics passed to NMF / LDA
n_top_words = 20  # words shown per topic by plot_top_words
n_samples = len(df)  # number of documents; only used in the progress messages

def plot_top_words(model, feature_names, n_top_words, title):
    """Draw one horizontal bar chart of the top-weighted words per topic.

    Args:
        model: fitted decomposition model exposing ``components_`` (one row
            of term weights per topic).
        feature_names: sequence mapping a column index to its term.
        n_top_words: number of highest-weighted terms to show per topic.
        title: figure title.

    The grid has up to five columns and as many rows as the topic count needs.
    This cell fits four topics, which the fixed 2x5 grid drew with six empty
    panels. Ten topics still give a 2x5 grid at 30x15 inches.
    """
    n_topics = len(model.components_)
    ncols = max(1, min(5, n_topics))
    nrows = max(1, -(-n_topics // 5))  # ceiling division
    fig, axes = plt.subplots(nrows, ncols, figsize=(6 * ncols, 7.5 * nrows),
                             sharex=True, squeeze=False)
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        # indices of the n_top_words largest weights, largest first
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f'Topic {topic_idx +1}',
                     fontdict={'fontsize': 30})
        ax.invert_yaxis()
        ax.tick_params(axis='both', which='major', labelsize=20)
        for i in 'top right left'.split():
            ax.spines[i].set_visible(False)

    # hide panels of a partially filled last row
    for ax in axes[n_topics:]:
        ax.set_visible(False)
    fig.suptitle(title, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()


# Vectorize the preprocessed ICR report texts. A few heuristics filter out
# uninformative terms early on: common English words, words occurring in only
# one document and words occurring in more than 95% of the documents are
# removed.


def _feature_names(vectorizer):
    """Return the vocabulary terms of a fitted vectorizer in column order."""
    # Newer scikit-learn releases replaced get_feature_names() with
    # get_feature_names_out(); prefer the new name, fall back to the old one.
    if hasattr(vectorizer, 'get_feature_names_out'):
        return vectorizer.get_feature_names_out()
    return vectorizer.get_feature_names()


t0 = time()
data = df.text_proc_res.values
# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   stop_words='english')

tfidf = tfidf_vectorizer.fit_transform(data)
print("done in %0.3fs." % (time() - t0))

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data)
print("done in %0.3fs." % (time() - t0))
print()

# Fit the NMF model
# NOTE(review): the `alpha` keyword of NMF appears to be deprecated in recent
# scikit-learn releases in favour of alpha_W / alpha_H, whose scaling differs -
# confirm against the installed version before upgrading.
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))


tfidf_feature_names = _feature_names(tfidf_vectorizer)
plot_top_words(nmf, tfidf_feature_names, n_top_words,
               'Topics in NMF model (Frobenius norm)')

# Fit the NMF model
print('\n' * 2, "Fitting the NMF model (generalized Kullback-Leibler "
      "divergence) with tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

tfidf_feature_names = _feature_names(tfidf_vectorizer)
plot_top_words(nmf, tfidf_feature_names, n_top_words,
               'Topics in NMF model (generalized Kullback-Leibler divergence)')

print('\n' * 2, "Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

tf_feature_names = _feature_names(tf_vectorizer)
plot_top_words(lda, tf_feature_names, n_top_words, 'Topics in LDA model')

## Siamese Sentence Encoding

In [None]:
EMBS_PATH = 'siamese_bert_report_albert_sent+embs.pk'
icr_sentences = {}
if not os.path.exists(EMBS_PATH):  
    print("Generating embeddings")
    model = SentenceTransformer('sentence-transformers/paraphrase-albert-small-v2')
    report_embs = []
    for file in tqdm(FILES):
        text = read_file(file)   
        # document = Doc(text)
        cleaned_sentences = [x for x in clean_sentences(text) if len(x)]
        icr_sentences[file] = cleaned_sentences
        # sentences = [x.sent.text for x in a if len(x) > 15] # remove stubs
        #Sentences are encoded by calling model.encode()
        embeddings = model.encode([x for x in cleaned_sentences], batch_size=128)
    #     PCs = project(embeddings, dims=3)
    #     file_vecs = pd.DataFrame({'sentence': sentences, 'file': file, 'embedding': embeddings})
        data = [{'file': file, 'embedding': embeddings[idx], 'sentence': sent} for idx, sent in enumerate(cleaned_sentences)]    
        report_embs.extend(data)
    pickle.dump(report_embs, open(EMBS_PATH, 'wb'))
    !cp $EMBS_PATH /content/drive/MyDrive/WorldBank/$EMBS_PATH
else:
    report_embs = pickle.load(open(EMBS_PATH, 'rb'))

# df = pd.concat(sent_vecs)
# df.file = df.file.astype('category')
# df.to_csv('siamese_bert_sent_vecs_pca.csv')
# df = pd.read_csv('sent_vecs_all.csv')

In [None]:
# One row per sentence: file, embedding, sentence text and sector label.
df = pd.DataFrame(report_embs)
df['sector'] = df.file.apply(lambda path: FILE2SECTOR[path])

df.file = df.file.astype('category')
df.sector = df.sector.astype('category')

# Stack the per-sentence vectors into a matrix and append its first three
# principal components as columns PC1..PC3.
all_embeddings = np.vstack(df.embedding.values)
pc_columns = pd.DataFrame(project(all_embeddings, dims=3))
df = pd.concat([df, pc_columns], axis=1)
df['project'] = df.file.apply(lambda path: FILE2ID[path])

# Remove every row whose sector label starts with '(H)'.
drop_sectors = [s for s in df.sector.unique() if s.startswith('(H)')]
rows_to_drop = df[df.sector.isin(drop_sectors)].index
df.drop(rows_to_drop, inplace=True)

### Report Sentence Embeddings
One data point per sentence

In [None]:
# 3-D scatter of sentence embeddings (one marker per sentence) for the focus
# sectors; hovering a marker shows the sentence text.
sectors = ['Education', 'Health', 'Water/Sanit/Waste']
fig = plotly.subplots.make_subplots(rows=1, cols=1)

for sector in tqdm(sorted(sectors)):
    sector_df = df[df['sector'] == sector]
    if len(sector_df) == 0:
        print(f"Skipping {sector}, no matches found")
        continue

    trace = go.Scatter3d(
        mode='markers',
        x=sector_df.PC1,
        y=sector_df.PC2,
        z=sector_df.PC3,
        text=sector_df.sentence,
        marker=dict(size=10),
        name=sector_df.sector.values[0],
        hovertemplate='%{text}',
    )
    fig.add_trace(trace)


fig.update_layout(
    height=800,
    title_text='World Bank ICR Reviews',
)
fig.update_traces(textposition='top center')

### Report Mean Embeddings
One embedding per report

In [None]:
# Collapse the sentence-level frame to one row per report: the element-wise
# mean of its sentence embeddings plus the report's sector and project id.
# Empty groups are skipped.
report_mean_embeddings = [
    {
        'file': file,
        'mean_embedding': group.embedding.values.mean(0),
        'sector': group.sector.values[0],
        'project': group.project.values[0],
    }
    for file, group in df.groupby('file')
    if not group.empty
]

df = pd.DataFrame(report_mean_embeddings)

# Append the first three principal components of the report-level vectors.
all_embeddings = np.vstack(df.mean_embedding.values)
pc_columns = pd.DataFrame(project(all_embeddings, dims=3))
df = pd.concat([df, pc_columns], axis=1)

# Remove reports whose sector label starts with '(H)'.
drop_sectors = [s for s in df.sector.unique() if s.startswith('(H)')]
rows_to_drop = df[df.sector.isin(drop_sectors)].index
df.drop(rows_to_drop, inplace=True)

df.file = df.file.astype('category')
df.sector = df.sector.astype('category')

### Focus sectors only

In [None]:
# 3-D scatter of report-level mean embeddings for the focus sectors; hovering
# a marker shows the project id.
sectors = ['Education', 'Health', 'Water/Sanit/Waste']
fig = plotly.subplots.make_subplots(rows=1, cols=1)

for sector in sorted(sectors):
    group = df[df.sector == sector]
    trace = go.Scatter3d(
        mode='markers',
        x=group.PC1,
        y=group.PC2,
        z=group.PC3,
        text=group.project,
        marker=dict(size=10),
        name=sector,
        hovertemplate='%{text}',
    )
    fig.add_trace(trace)

fig.update_layout(
    height=800,
    title_text='World Bank ICR Reviews',
)
fig.update_traces(textposition='top center')

In [None]:
# Sector -> array of that sector's report embeddings (empty groups skipped).
sector_embs = {
    sector: g.mean_embedding.values
    for sector, g in df.groupby('sector', as_index=False)
    if not g.empty
}

# Sector -> centroid, i.e. the element-wise mean of its report embeddings.
mean_sector_embs = {
    sector: np.vstack(sector_embs[sector]).mean(axis=0)
    for sector in df.sector.unique()
}

# Give every report row the centroid of its own sector.
df['sector_mean'] = [mean_sector_embs[sector] for sector in df.sector]

In [None]:
def dist_from_centroid(mean_embedding, sector_mean):
  """Return the Euclidean distance between each embedding and its centroid.

  The function previously ignored both arguments and read the global ``df``.

  Args:
    mean_embedding: sequence (e.g. a pandas Series) of equal-length 1-D vectors.
    sector_mean: sequence of centroid vectors, one per entry of
      ``mean_embedding`` and of the same dimensionality.

  Returns:
    1-D numpy array with one distance per input row.

  Raises:
    ValueError: if the two inputs do not stack to the same shape.
  """
  points = np.vstack(list(mean_embedding))
  centroids = np.vstack(list(sector_mean))
  if points.shape != centroids.shape:
    raise ValueError(
        f"shape mismatch: embeddings {points.shape} vs centroids {centroids.shape}")
  return np.linalg.norm(points - centroids, axis=1)

# Store the result of dist_from_centroid for the mean-embedding and
# sector-mean columns as a new column.
df['dist_from_centroid'] = dist_from_centroid(df['mean_embedding'], df['sector_mean'])

In [None]:
# Import the submodule explicitly: a plain `import scipy` is not guaranteed to
# load scipy.stats on every SciPy version. This still binds the name `scipy`.
import scipy.stats

# Sector -> absolute z-scores of that sector's distances from its centroid.
sector_z = {
    sector: np.abs(scipy.stats.zscore(df[df.sector == sector].dist_from_centroid))
    for sector in df.sector.unique()
}

In [None]:

def get_sector_zscore(grp):
  """Return absolute z-scores of ``dist_from_centroid`` within one sector group.

  The z-score was previously computed over the whole group frame, which
  includes the non-numeric ``sector`` column; it is now restricted to the
  distance column, matching how the ``zscore`` column is computed elsewhere
  in this notebook.

  Args:
    grp: DataFrame holding the rows of a single sector, with ``sector`` and
      ``dist_from_centroid`` columns.

  Returns:
    Absolute z-score for every row of ``grp``, in row order.
  """
  from scipy import stats  # local import: does not rely on an earlier cell

  # progress output: which sector is being processed
  print(grp['sector'].values[0])
  return np.abs(stats.zscore(grp['dist_from_centroid']))

In [None]:
# Absolute z-score of each report's distance from its sector centroid,
# computed within the report's own sector.
# transform() returns values aligned to df's existing index, so the frame
# keeps its flat index; groupby().apply() can nest the group key into a
# MultiIndex depending on the pandas version. observed=True skips unused
# sector categories.
df['zscore'] = df.groupby('sector', observed=True)['dist_from_centroid'].transform(
    lambda dists: np.abs(scipy.stats.zscore(dists))
)

In [None]:
def _sector_mad(distances):
  """Return the normal-consistent median absolute deviation of ``distances``."""
  # median_absolute_deviation was deprecated and later removed from SciPy.
  # median_abs_deviation(scale='normal') is its replacement with the same
  # default scaling (about 1.4826); fall back to the old name on older SciPy.
  if hasattr(scipy.stats, 'median_abs_deviation'):
    return scipy.stats.median_abs_deviation(distances, scale='normal')
  return scipy.stats.median_absolute_deviation(distances)


# Sector -> MAD of its reports' distances from the sector centroid.
sector_mads = {sector: _sector_mad(df[df.sector == sector].dist_from_centroid) for sector in df.sector.unique()}
print("{:>20} {:<10}".format("Sector", "MAD"))
for sector in sector_mads:
  print(f'{sector :>20} {sector_mads[sector]:<10.2f}')

In [None]:
# df.dist_from_centroid.plot.density()
import seaborn as sns
from matplotlib.cbook import boxplot_stats

# Horizontal box plot of the distances from the sector centroid. Passing the
# series as `x=` keeps the orientation explicit; newer seaborn versions treat
# a positional argument as `data` instead.
ax = sns.boxplot(x=df.dist_from_centroid)

# boxplot_stats returns one stats dict per dataset; 'fliers' holds the points
# lying beyond the whiskers. Compute it once and reuse it.
fliers = boxplot_stats(df.dist_from_centroid)[0]['fliers']
outliers = list(fliers)

# xtick, label, and title
plt.xticks(fontsize=14)
plt.xlabel('distance from sector centroid', fontsize=14)
plt.title('Distribution of distances', fontsize=20)

In [None]:
# 3-D scatter of report embeddings for the focus sectors, coloured by each
# report's within-sector z-score.
sectors = ['Education', 'Health', 'Water/Sanit/Waste']
fig = plotly.subplots.make_subplots(rows=1, cols=1)

for sector in sorted(sectors):
    group = df[df.sector==sector]
    fig.add_trace(
        go.Scatter3d(mode='markers',
                   x=group.PC1, y=group.PC2,
                   z=group.PC3,
                   # The hover label shows the same z-score that drives the
                   # colour; it used to print the raw distance under the
                   # "Z-score" label.
                   text=[f'Project: {project}<br>Sector Z-score: {zscore:.3f}'
                         for project, zscore in zip(group.project, group.zscore)],
                   marker=dict(
                           size=10,
                           color=group.zscore,
                           cmin=0,
                           cmax=df.zscore.max(),  # shared colour range across traces
                           colorbar=dict(
                             title="Z-Score"
                           ),
                           colorscale='jet',
                   ),
                   name = sector,
                   hovertemplate = "%{text}",
                    )
        )

fig.update_layout(
    height=800,
    title_text='World Bank ICR Review Embeddings<br>Color by sector Z-Score to detect outliers',
    # place the legend top-left
    legend=dict(
      yanchor="top",
      y=0.99,
      xanchor="left",
      x=0.01
    )
)
fig.update_traces(textposition='top center')

In [None]:
# 3-D scatter of report embeddings for every sector present in df, coloured by
# each report's within-sector z-score. (The focus-sector list that used to be
# assigned here was never read by this cell.)
fig = plotly.subplots.make_subplots(rows=1, cols=1)

for sector in df.sector.unique():
    group = df[df.sector==sector]
    fig.add_trace(
        go.Scatter3d(mode='markers',
                   x=group.PC1, y=group.PC2,
                   z=group.PC3,
                   # The hover label shows the same z-score that drives the
                   # colour; it used to print the raw distance under the
                   # "Z-score" label.
                   text=[f'Project: {project}<br>Sector Z-score: {zscore:.3f}'
                         for project, zscore in zip(group.project, group.zscore)],
                   marker=dict(
                           size=10,
                           color=group.zscore,
                           cmin=0,
                           cmax=df.zscore.max(),  # shared colour range across traces
                           colorbar=dict(
                             title="Z-Score"
                           ),
                           colorscale='jet',
                   ),
                   name = sector,
                   hovertemplate = "%{text}",
                    )
        )

fig.update_layout(
    height=800,
    title_text='World Bank ICR Review Embeddings<br>Color by sector Z-Score to detect outliers',
    # place the legend top-left
    legend=dict(
      yanchor="top",
      y=0.99,
      xanchor="left",
      x=0.01
    )
)
fig.update_traces(textposition='top center')

In [None]:
# 3-D scatter of report embeddings for the focus sectors; the hover label
# shows the project id and its distance from the sector centroid.
sectors = ['Education', 'Health', 'Water/Sanit/Waste']
fig = plotly.subplots.make_subplots(rows=1, cols=1)

for sector in sorted(sectors):
    group = df[df.sector==sector]
    fig.add_trace(
        go.Scatter3d(mode='markers',
                   x=group.PC1, y=group.PC2,
                   z=group.PC3,
                   # '<br>' is the line break Plotly renders in hover text; the
                   # previous label had a stray '%' and a '\n', and called
                   # .mean() on what is already a single value per row.
                   text=[f'Project: {project}<br>Distance from centroid: {dist:.3f}'
                         for project, dist in zip(group.project, group.dist_from_centroid)],
                   marker=dict(
                           size=10,
                   ),
                   name = sector,
                   hovertemplate = "%{text}",
                    )
        )

fig.update_layout(
    height=800,
    title_text='World Bank ICR Reviews'
)
fig.update_traces(textposition='top center')

### All sectors

In [None]:
# 3-D scatter of report-level embeddings for every sector whose label does not
# start with '(H)'; hovering a marker shows the project id.
sectors = [name for name in df.sector.unique() if not name.startswith('(H)')]
fig = plotly.subplots.make_subplots(rows=1, cols=1)

for sector in sorted(sectors):
    group = df[df.sector == sector]
    trace = go.Scatter3d(
        mode='markers',
        x=group.PC1,
        y=group.PC2,
        z=group.PC3,
        text=group.project,
        marker=dict(size=10),
        name=sector,
        hovertemplate='%{text}',
    )
    fig.add_trace(trace)

fig.update_layout(
    height=800,
    title_text='World Bank ICR Reviews',
)
fig.update_traces(textposition='top center')

In [None]:
# Cross-validate a linear SVM that predicts the sector from the report-level
# mean embeddings (10 folds, repeated 3 times).
df.sector = df.sector.astype('category')
y = df["sector"].values
X = np.vstack(df.mean_embedding.values)
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
model = svm.SVC(kernel='linear', C=1, decision_function_shape='ovo')

metrics = cross_validate(model, X, y, scoring=['precision_macro', 'recall_macro'], cv=cv, n_jobs=-1)

# Report mean (std) over all folds. The recall std used to be printed negated.
print('Precision: %.3f (%.3f)' % (mean(metrics["test_precision_macro"]), std(metrics["test_precision_macro"])))
print('Recall: %.3f (%.3f)' % (mean(metrics["test_recall_macro"]), std(metrics["test_recall_macro"])))

In [None]:
# ## for bag-of-words
# from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing

# dtf = df.copy()

# ## split dataset
# dtf_train, dtf_test = model_selection.train_test_split(dtf, test_size=0.3, random_state=1)

# X_train = np.vstack(dtf_train.mean_embedding.values)
# X_test = np.vstack(dtf_test.mean_embedding.values)

# ## get target
# y_train = dtf_train["sector"].values
# y_test = dtf_test["sector"].values

# #Create a svm Classifier
# clf = svm.SVC(kernel='linear') # Linear Kernel

# #Train the model using the training sets
# clf.fit(X_train, y_train)

# #Predict the response for test dataset
# y_pred = clf.predict(X_test)

# # Model Accuracy: how often is the classifier correct?
# print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

# # Model Precision: what percentage of positive tuples are labeled as such?
# print("Precision:",metrics.precision_score(y_test, y_pred, average='weighted'))

# # Model Recall: what percentage of positive tuples are labelled as such?
# print("Recall:",metrics.recall_score(y_test, y_pred, average='weighted'))

In [None]:
# ## Accuracy, Precision, Recall
# accuracy = metrics.accuracy_score(y_test, predicted)
# auc = metrics.roc_auc_score(y_test, predicted_prob, 
#                             multi_class="ovr")
# print("Accuracy:",  round(accuracy,2))
# print("Auc:", round(auc,2))
# print("Detail:")
# print(metrics.classification_report(y_test, predicted))
    
# ## Plot confusion matrix
# cm = metrics.confusion_matrix(y_test, predicted)
# fig, ax = plt.subplots()
# sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap=plt.cm.Blues, 
#             cbar=False)
# ax.set(xlabel="Pred", ylabel="True", xticklabels=classes, 
#        yticklabels=classes, title="Confusion matrix")
# plt.yticks(rotation=0)

# fig, ax = plt.subplots(nrows=1, ncols=2)
# ## Plot roc
# for i in range(len(classes)):
#     fpr, tpr, thresholds = metrics.roc_curve(y_test_array[:,i],  
#                            predicted_prob[:,i])
#     ax[0].plot(fpr, tpr, lw=3, 
#               label='{0} (area={1:0.2f})'.format(classes[i], 
#                               metrics.auc(fpr, tpr))
#                )
# ax[0].plot([0,1], [0,1], color='navy', lw=3, linestyle='--')
# ax[0].set(xlim=[-0.05,1.0], ylim=[0.0,1.05], 
#           xlabel='False Positive Rate', 
#           ylabel="True Positive Rate (Recall)", 
#           title="Receiver operating characteristic")
# ax[0].legend(loc="lower right")
# ax[0].grid(True)
    
# ## Plot precision-recall curve
# for i in range(len(classes)):
#     precision, recall, thresholds = metrics.precision_recall_curve(
#                  y_test_array[:,i], predicted_prob[:,i])
#     ax[1].plot(recall, precision, lw=3, 
#                label='{0} (area={1:0.2f})'.format(classes[i], 
#                                   metrics.auc(recall, precision))
#               )
# ax[1].set(xlim=[0.0,1.05], ylim=[0.0,1.05], xlabel='Recall', 
#           ylabel="Precision", title="Precision-Recall curve")
# ax[1].legend(loc="best")
# ax[1].grid(True)
# plt.show()

In [None]:
# One row per report: all of its cleaned sentences joined into a single text.
dtf = pd.DataFrame(report_embs).groupby(['file'], as_index=False).agg({'sentence': ' '.join})

dtf['sector'] = dtf.file.apply(lambda x: FILE2SECTOR[x])
# Remove reports whose sector label starts with '(H)'. The mask belongs to
# dtf, so it must index dtf; it was previously applied to the unrelated frame
# `df`, which has a different index.
drop_sectors = [s for s in dtf.sector.unique() if s.startswith('(H)')]
dtf.drop(dtf[dtf.sector.isin(drop_sectors)].index, inplace=True)
dtf.sector = dtf.sector.astype('category')

# Column names expected by the classification cells: text and target y.
dtf.rename(columns={'sentence': 'text','sector':'y'},inplace=True)

In [None]:
import seaborn as sns
# feature_extraction and model_selection are used by the train/test split and
# vectorizer cell further down but were only imported in a commented-out line.
from sklearn import feature_extraction, feature_selection, metrics, model_selection
nltk.download('stopwords')
nltk.download('wordnet')

def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    '''
    Clean one text string and return it as a space-joined string of tokens.

    The input is converted with str(), lower-cased and stripped; every
    character that is neither a word character nor whitespace is removed;
    the result is split on whitespace. Stop words are then dropped and the
    remaining tokens are optionally stemmed and/or lemmatised.
    :parameter
        :param text: the text to clean (any value; str() is applied first)
        :param flg_stemm: bool - whether Porter stemming is to be applied
        :param flg_lemm: bool - whether WordNet lemmatisation is to be applied
        :param lst_stopwords: collection of words to remove, or None to keep all
    :return
        cleaned text (str)
    '''
    ## clean (convert to lowercase, strip, then remove punctuation and symbols)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())

    ## Tokenize (convert from string to list)
    lst_text = text.split()

    ## remove Stopwords
    if lst_stopwords is not None:
        ## a set gives constant-time membership tests instead of scanning
        ## the whole stop-word list once per token
        stopwords = set(lst_stopwords)
        lst_text = [word for word in lst_text if word not in stopwords]

    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]

    ## Lemmatisation (convert the word into root word)
    if flg_lemm:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]

    ## back to string from list
    return " ".join(lst_text)
    
## English stop words shipped with the nltk 'stopwords' corpus
lst_stopwords = nltk.corpus.stopwords.words("english")

## cleaned copy of every document: stop words removed, lemmatised, not stemmed
dtf["text_clean"] = dtf["text"].apply(
    utils_preprocess_text,
    flg_stemm=False,
    flg_lemm=True,
    lst_stopwords=lst_stopwords,
)

In [None]:
## NOTE(review): `model_selection` and `feature_extraction` are not imported in
## this cell -- confirm an earlier cell imports them from sklearn.

## split dataset (stratified on the target so both parts keep the class mix)
dtf_train, dtf_test = model_selection.train_test_split(dtf, test_size=0.2, stratify=dtf.y)
## get target
y_train = dtf_train["y"].values
y_test = dtf_test["y"].values

## Count (classic BoW)
# vectorizer = feature_extraction.text.CountVectorizer(max_features=10000, ngram_range=(1,2))

## Tf-Idf (advanced variant of BoW) on unigrams and bigrams
vectorizer = feature_extraction.text.TfidfVectorizer(max_features=10000, ngram_range=(1,2))

corpus = dtf_train["text_clean"]
X_train = vectorizer.fit_transform(corpus)
dic_vocabulary = vectorizer.vocabulary_

## Chi-square feature selection: for every class, score each feature against
## the one-vs-rest indicator and keep the features with 1 - p above the limit
y = dtf_train["y"]
try:
    X_names = vectorizer.get_feature_names_out()
except AttributeError:
    ## older scikit-learn releases only provide get_feature_names()
    X_names = vectorizer.get_feature_names()
p_value_limit = 0.95
lst_features = []
for cat in np.unique(y):
    chi2, p = feature_selection.chi2(X_train, y==cat)
    lst_features.append(pd.DataFrame(
        {"feature":X_names, "score":1-p, "y":cat}))
## concatenate once (DataFrame.append was removed in pandas 2.0), then sort
## and filter a single time instead of on every loop iteration
dtf_features = pd.concat(lst_features)
dtf_features = dtf_features.sort_values(["y","score"],
                ascending=[True,False])
dtf_features = dtf_features[dtf_features["score"]>p_value_limit]
X_names = dtf_features["feature"].unique().tolist()

## re-fit the vectorizer restricted to the selected vocabulary
vectorizer = feature_extraction.text.TfidfVectorizer(vocabulary=X_names)
X_train = vectorizer.fit_transform(corpus)
dic_vocabulary = vectorizer.vocabulary_

## Multinomial Naive Bayes on top of the tf-idf features
classifier = naive_bayes.MultinomialNB()

## pipeline: cleaned text -> tf-idf vectorizer -> classifier
model = pipeline.Pipeline(steps=[
    ("vectorizer", vectorizer),
    ("classifier", classifier),
])
## train classifier: only the final step is fitted here, on the already
## vectorised training matrix (the vectorizer was fitted beforehand)
classifier.fit(X_train, y_train)

## test: the pipeline vectorises the cleaned test texts before predicting
X_test = dtf_test["text_clean"].values
predicted_prob = model.predict_proba(X_test)
predicted = model.predict(X_test)

## class labels present in the test set and the one-hot encoded test target
classes = np.unique(y_test)
y_test_array = pd.get_dummies(y_test, drop_first=False).values

## Accuracy, one-vs-rest AUC and the per-class precision/recall report
accuracy = metrics.accuracy_score(y_test, predicted)
auc = metrics.roc_auc_score(y_test, predicted_prob, multi_class="ovr")
print("Accuracy:", round(accuracy,2))
print("Auc:", round(auc,2))
print("Detail:")
print(metrics.classification_report(y_test, predicted))
    
## Plot confusion matrix (rows: true class, columns: predicted class)
cm = metrics.confusion_matrix(y_test, predicted)
fig, ax = plt.subplots(figsize=(18,7))

sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap=plt.cm.Blues, cbar=False)
ax.set(title="Confusion matrix", xlabel="Pred", ylabel="True",
       xticklabels=classes, yticklabels=classes)
plt.yticks(rotation=0)

## second figure: ROC curves on the left, precision-recall curves on the right
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(18,8))
roc_ax, pr_ax = ax[0], ax[1]

## Plot roc: one curve per class, using its one-hot column and probability column
for i, class_name in enumerate(classes):
    fpr, tpr, thresholds = metrics.roc_curve(y_test_array[:,i], predicted_prob[:,i])
    roc_label = '{0} (area={1:0.2f})'.format(class_name, metrics.auc(fpr, tpr))
    roc_ax.plot(fpr, tpr, lw=3, label=roc_label)
## diagonal reference line
roc_ax.plot([0,1], [0,1], color='navy', lw=3, linestyle='--')
roc_ax.set(title="Receiver operating characteristic",
           xlabel='False Positive Rate',
           ylabel="True Positive Rate (Recall)",
           xlim=[-0.05,1.0], ylim=[0.0,1.05])
roc_ax.legend(loc="lower right")
roc_ax.grid(True)

## Plot precision-recall curve, again one curve per class
for i, class_name in enumerate(classes):
    precision, recall, thresholds = metrics.precision_recall_curve(
        y_test_array[:,i], predicted_prob[:,i])
    pr_label = '{0} (area={1:0.2f})'.format(class_name, metrics.auc(recall, precision))
    pr_ax.plot(recall, precision, lw=3, label=pr_label)
pr_ax.set(title="Precision-Recall curve", xlabel='Recall', ylabel="Precision",
          xlim=[0.0,1.05], ylim=[0.0,1.05])
pr_ax.legend(loc="best")
pr_ax.grid(True)
plt.show()

In [None]:
!pip install -q lime
from lime import lime_text

## pick the test observation to explain
i = 0
txt_instance = dtf_test["text"].iloc[i]

## true label, predicted label and the highest predicted class probability
top_prob = round(np.max(predicted_prob[i]),2)
print("True:", y_test[i], "--> Pred:", predicted[i], "| Prob:", top_prob)

## explain the prediction with the 3 most influential words
class_names = np.unique(y_train)
explainer = lime_text.LimeTextExplainer(class_names=class_names)
explained = explainer.explain_instance(
    txt_instance,
    model.predict_proba,
    num_features=3,
)
explained.show_in_notebook(text=txt_instance, predict_proba=False)

## Word2Vec Visualization

In [None]:
## for word embedding
import gensim
import gensim.downloader as gensim_api

# nlp = gensim_api.load("word2vec-google-news-300") # going to train own model instead

corpus = dtf_train["text_clean"]

## one list of unigram tokens per training document
lst_corpus = []
for string in corpus:
    lst_words = string.split()
    lst_grams = list(lst_words)
    lst_corpus.append(lst_grams)

## fit the bigram detector on the unigram lists, then the trigram detector
## on the bigram-transformed lists; each is frozen into a Phraser
phrases = gensim.models.phrases
bigrams_detector = phrases.Phraser(
    phrases.Phrases(lst_corpus, delimiter=" ".encode(),
                    min_count=5, threshold=10))
trigrams_detector = phrases.Phraser(
    phrases.Phrases(bigrams_detector[lst_corpus], delimiter=" ".encode(),
                    min_count=5, threshold=10))

## fit w2v (skip-gram, 300 dimensions) directly on the unigram lists;
## the phrase detectors above are not applied to this training input
nlp = gensim.models.word2vec.Word2Vec(
    lst_corpus,
    size=300,
    window=8,
    min_count=1,
    sg=1,
    iter=30,
)

word = "health"
fig = plt.figure()
## word embedding: the query word followed by its 20 most similar words
tot_words = [word] + [tupla[0] for tupla in
                 nlp.most_similar(word, topn=20)]
X = nlp[tot_words]
## t-SNE (initialised with PCA) to reduce dimensionality from 300 to 3.
## The variable keeps its historical name `pca`. Perplexity has to stay below
## the number of embedded words, so it is capped instead of hard-coded to 40.
pca = manifold.TSNE(perplexity=min(40, len(tot_words) - 1),
                    n_components=3, init='pca')
X = pca.fit_transform(X)
## create dtf: one row per word, flag the query word (first row) with input=1
dtf_ = pd.DataFrame(X, index=tot_words, columns=["x","y","z"])
dtf_["input"] = 0
## positional assignment on the frame itself; the chained form
## dtf_["input"].iloc[0:1] = 1 may write to a temporary copy
dtf_.iloc[0, dtf_.columns.get_loc("input")] = 1
## plot 3d
## kept: older matplotlib versions need this import to register the '3d' projection
from mpl_toolkits.mplot3d import Axes3D
ax = fig.add_subplot(111, projection='3d')
neighbour_rows = dtf_[dtf_["input"]==0]
query_rows = dtf_[dtf_["input"]==1]
ax.scatter(neighbour_rows['x'], neighbour_rows['y'], neighbour_rows['z'], c="black")
ax.scatter(query_rows['x'], query_rows['y'], query_rows['z'], c="red")
ax.set(xlabel=None, ylabel=None, zlabel=None, xticklabels=[],
       yticklabels=[], zticklabels=[])
## annotate every point with its word
for label, row in dtf_[["x","y","z"]].iterrows():
    x, y, z = row
    ax.text(x, y, z, s=label)

In [None]:
from tensorflow.keras import models, layers, preprocessing as kprocessing
from tensorflow.keras import backend as K

## tokenize text: fit the word -> integer id vocabulary on the token lists
tokenizer = kprocessing.text.Tokenizer(
    lower=True,
    split=' ',
    oov_token="NaN",
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
)
tokenizer.fit_on_texts(lst_corpus)
dic_vocabulary = tokenizer.word_index
## create sequence (token lists -> lists of ids)
lst_text2seq = tokenizer.texts_to_sequences(lst_corpus)
## padding sequence: every row is cut or zero-padded at the end to 15 ids
X_train = kprocessing.sequence.pad_sequences(
    lst_text2seq, maxlen=15, padding="post", truncating="post")

## visualise which cells of the feature matrix are padding (id 0)
sns.heatmap(X_train==0, vmin=0, vmax=1, cbar=False)
plt.show()

## sanity check on one training document
i = 0
sample_text = dtf_train["text_clean"].iloc[i]
sample_words = sample_text.split()

## list of text: ["I like this", ...]
len_txt = len(sample_words)
print("from: ", sample_text, "| len:", len_txt)

## sequence of token ids: [[1, 2, 3], ...]
len_tokens = len(X_train[i])
print("to: ", X_train[i], "| len:", len_tokens)

## vocabulary: {"I":1, "like":2, "this":3, ...}
first_word = sample_words[0]
print("check: ", first_word,
      " -- idx in vocabulary -->",
      dic_vocabulary[first_word])

print("vocabulary: ", dict(list(dic_vocabulary.items())[0:5]), "... (padding element, 0)")

corpus = dtf_test["text_clean"]

## one list of unigram tokens per test document
lst_corpus = []
for string in corpus:
    lst_words = string.split()
    lst_grams = list(lst_words)
    lst_corpus.append(lst_grams)

## merge common bigrams, then trigrams, using the fitted detectors
## NOTE(review): the tokenizer appears to have been fitted on token lists that
## did not go through these detectors -- confirm merged phrases are meant to be
## looked up in that vocabulary
lst_corpus = list(trigrams_detector[list(bigrams_detector[lst_corpus])])

## text to sequence with the fitted tokenizer
lst_text2seq = tokenizer.texts_to_sequences(lst_corpus)

## padding sequence: same length and padding/truncation side as the training matrix
X_test = kprocessing.sequence.pad_sequences(
    lst_text2seq, maxlen=15, padding="post", truncating="post")

## start the matrix (length of vocabulary + 1 x vector size) with all 0s;
## the extra row is index 0, which the padded sequences use for padding
embeddings = np.zeros((len(dic_vocabulary)+1, 300))
for word, idx in dic_vocabulary.items():
    ## copy the word2vec vector of this vocabulary entry into its row
    try:
        embeddings[idx] = nlp[word]
    ## only a missing word is expected here: its row stays all 0s.
    ## Any other error is no longer silently swallowed.
    except KeyError:
        continue

## code attention layer
def attention_layer(inputs, neurons):
    """Scale `inputs` element-wise by softmax weights learned from the inputs.

    The two non-batch axes are swapped, a softmax Dense layer with `neurons`
    units is applied, the axes are swapped back (this layer is named
    "attention") and the result is multiplied with `inputs`.

    NOTE(review): presumably `inputs` is (batch, steps, features) and
    `neurons` equals the number of steps so the shapes match -- confirm.
    """
    swapped = layers.Permute((2,1))(inputs)
    scores = layers.Dense(neurons, activation="softmax")(swapped)
    weights = layers.Permute((2,1), name="attention")(scores)
    return layers.multiply([inputs, weights])

## number of target classes: taken from the training labels so the output
## layer matches the label encoding instead of a hard-coded 3
n_classes = len(np.unique(y_train))

## input: sequences of 15 token ids (the maxlen used when padding)
x_in = layers.Input(shape=(15,))
## embedding: frozen lookup table initialised with the word2vec matrix
x = layers.Embedding(input_dim=embeddings.shape[0],
                     output_dim=embeddings.shape[1],
                     weights=[embeddings],
                     input_length=15, trainable=False)(x_in)
## apply attention
x = attention_layer(x, neurons=15)
## 2 layers of bidirectional lstm
x = layers.Bidirectional(layers.LSTM(units=15, dropout=0.2,
                         return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(units=15, dropout=0.2))(x)
## final dense layers: one softmax unit per class
x = layers.Dense(64, activation='relu')(x)
y_out = layers.Dense(n_classes, activation='softmax')(x)
## compile (sparse loss: the targets are integer class ids)
model = models.Model(x_in, y_out)
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])

model.summary()

In [None]:
## encode y: integer id <-> label, ids follow the sorted unique labels
dic_y_mapping = dict(enumerate(np.unique(y_train)))
inverse_dic = {label: n for n, label in dic_y_mapping.items()}
y_train = np.array([inverse_dic[y] for y in y_train])
## train (30% of the training rows are held out for validation)
training = model.fit(x=X_train, y=y_train, batch_size=256,
                     epochs=10, shuffle=True, verbose=0,
                     validation_split=0.3)
## plot loss and accuracy
## history keys that are neither a loss nor a validation value; named
## `history_metrics` so the imported sklearn `metrics` module is not rebound
history_metrics = [k for k in training.history.keys()
                   if ("loss" not in k) and ("val" not in k)]
fig, ax = plt.subplots(nrows=1, ncols=2, sharey=True)
## left panel: training loss (left axis) and metrics (right axis)
ax[0].set(title="Training")
ax11 = ax[0].twinx()
ax[0].plot(training.history['loss'], color='black')
ax[0].set_xlabel('Epochs')
ax[0].set_ylabel('Loss', color='black')
for metric in history_metrics:
    ax11.plot(training.history[metric], label=metric)
ax11.set_ylabel("Score", color='steelblue')
ax11.legend()
## right panel: the same curves for the validation split
ax[1].set(title="Validation")
ax22 = ax[1].twinx()
ax[1].plot(training.history['val_loss'], color='black')
ax[1].set_xlabel('Epochs')
ax[1].set_ylabel('Loss', color='black')
for metric in history_metrics:
    ax22.plot(training.history['val_'+metric], label=metric)
ax22.set_ylabel("Score", color="steelblue")
plt.show()

In [None]:
## test: class probabilities for every padded test sequence
predicted_prob = model.predict(X_test)
## index of the most probable class per row, mapped back to its label
predicted = [dic_y_mapping[class_id]
             for class_id in np.argmax(predicted_prob, axis=1)]

## Compare DLI and ICR Embeddings

In [None]:
# One mean embedding per report: stack the report's embeddings and average
# over rows. The list is rebuilt from scratch on every run; previously its
# initialisation was commented out, so the loop either failed on an undefined
# name or appended duplicates to a list left over from an earlier run.
report_mean_embeddings = [
    {'file': report['file'],
     'embedding': np.vstack(report['embedding']).mean(0)}
    for report in report_embs
]

df = pd.DataFrame(report_mean_embeddings)
df['sector'] = df.file.apply(lambda x: FILE2SECTOR[x])

# Project all mean embeddings to 3 dimensions and append the result as columns.
all_embeddings = np.stack(df.embedding.values)
df = pd.concat([df, pd.DataFrame(project(all_embeddings, dims=3))], axis=1)
df['project'] = df.file.apply(lambda x: FILE2ID[x])

# Drop every report whose sector name starts with '(H)'.
drop_sectors = [s for s in df.sector.unique() if s.startswith('(H)')]
df.drop(df[df.sector.isin(drop_sectors)].index, inplace=True)

In [None]:
# Load the pickled DLI embeddings; the context manager closes the file handle,
# which the previous bare open() call left open.
# NOTE: unpickling can execute arbitrary code -- only load this file from a trusted source.
with open('dli_embeddings_reduced.pkl', 'rb') as f:
    dli_embeddings_reduced = pickle.load(f)

In [None]:
# Project ids and embedded DLIs stored in the loaded pickle.
project_ids = dli_embeddings_reduced['project_ids']
dli_embs = dli_embeddings_reduced['embedded_dlis']
# One row per DLI embedding, indexed by project id.
dli_df = pd.DataFrame({'dli_embs': list(dli_embs)}, index=project_ids)

In [None]:
# ICR mean embeddings indexed by project id.
icr_df = df[['project','embedding']].set_index('project')
icr_embs = np.vstack(icr_df.embedding.values)
# Stack DLI rows first, then ICR rows; ids and corpus labels follow the same order.
stacked_embs = np.vstack((dli_embs, icr_embs))
stacked_project_ids = dli_df.index.tolist() + icr_df.index.tolist()
corpus = ['DLI'] * len(dli_embs) + ['ICR'] * len(icr_embs)
# project into same PC space
# (the stray `**` before the call was a syntax error; the result is unpacked
# into columns on the next line, so it is presumably a mapping of column
# name -> values -- confirm against `project`)
projected_embs = project(stacked_embs, dims=3)
df = pd.DataFrame({'project': stacked_project_ids, **projected_embs, 'corpus': corpus})
df['sector'] = df.project.apply(lambda x: ID2SECTOR[x])

In [None]:
# dli_embeddings_reduced = pickle.load(open('dli_embeddings_reduced.pkl','rb'))

# project_ids = dli_embeddings_reduced['project_ids']
# dlis = dli_embeddings_reduced['embedded_dlis']

# PCs = project(dlis, dims=3)

# dli_df = pd.DataFrame({'project': project_ids, **PCs})
# dli_df['sector'] = dli_df.project.apply(lambda x: ID2SECTOR[x])

# drop_sectors = [s for s in dli_df.sector.unique() if s.startswith('(H)')]
# dli_df.drop(dli_df[dli_df.sector.isin(drop_sectors)].index, inplace=True)

# df.set_index('project', inplace=True)
# dli_df.set_index('project', inplace=True)

# df_ = df.join(dli_df[['PC1','PC2','PC3']], lsuffix='_icr', rsuffix='_dli')
# df_.reset_index(inplace=True)

In [None]:
# 2-D histogram of the first two principal components, first for the ICR
# embeddings and then, on a new figure, for the DLI embeddings.
# A separate PCA is fitted for each corpus.
for position, X in enumerate((icr_embs, dli_embs)):
    if position > 0:
        plt.figure()
    pca = PCA(n_components=2)
    projections = pca.fit_transform(X)
    _ = plt.hist2d(projections[:, 0], projections[:, 1])

In [None]:
# Overlaid histograms of the three principal components, one figure per corpus.
for corpus, group in df.groupby('corpus'):
    # Draw on the axes of the figure created here; without `ax=` pandas opens
    # its own figure and the one from plt.figure() stays empty.
    hist_ax = plt.figure().add_subplot(111)
    group[['PC1','PC2','PC3']].plot.hist(
        bins=20, alpha=0.5, title=corpus + ' embeddings', ax=hist_ax)

In [None]:
# Sectors to show; every other sector is skipped.
sector_focus = ['Education', 'Health', 'Water/Sanit/Waste']
fig = plotly.subplots.make_subplots(rows=1, cols=1)

# One scatter trace per (sector, corpus) pair in the PC1/PC2 plane.
for (sector, corpus), group in df.groupby(['sector', 'corpus']):
    if sector not in sector_focus:
        continue

    fig.add_trace(
        go.Scatter(
            mode='markers',
            x=group.PC1,
            y=group.PC2,
            text=group.project,
            marker=dict(
                # compare by value: `is` tests object identity, which is not
                # guaranteed to hold for equal strings
                symbol='diamond' if corpus == 'DLI' else 'circle',
                size=10,
            ),
            name=sector + '_' + corpus,
            # hover shows the project id carried in `text`
            hovertemplate='%{text}',
        )
    )

fig.update_layout(
    height=800,
    title_text='World Bank Project DLI vs ICR Embeddings',
)
fig.update_traces(textposition='top center')