In [1]:
import pickle
import pandas as pd
from sentence_transformers import SentenceTransformer

In [2]:
# Pretrained sentence-embedding checkpoint used by the encode() calls in this notebook.
model_name = 'paraphrase-mpnet-base-v2'
model = SentenceTransformer(model_name)

In [3]:
def load_prior_embeds(path="./data/project_text_embeddings.pkl"):
    """Load a previously pickled embeddings object from disk.

    Args:
        path: Location of the pickle file. Defaults to the cache file this
            notebook writes its project text embeddings to.

    Returns:
        The object stored in the pickle file.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    # SECURITY: unpickling can execute arbitrary code, so only point this at
    # cache files produced by this project.
    with open(path, "rb") as fin:
        return pickle.load(fin)

In [5]:
def save_embeds(embeddings, path="./data/project_text_embeddings.pkl"):
    """Pickle the embeddings object to disk, overwriting any existing file.

    Args:
        embeddings: The object to persist (any picklable value).
        path: Destination file. Defaults to the cache file this notebook
            reads its project text embeddings from.
    """
    with open(path, "wb") as f_out:
        pickle.dump(embeddings, f_out, protocol=pickle.HIGHEST_PROTOCOL)

In [6]:
# Load the project table, derive helper columns, and order rows by numeric id.
proj = (
    pd.read_csv('./data/all_wb_projects.csv')
    .assign(
        # Drop the first character of the id (e.g. "P000001" -> 1).
        numeric_id=lambda df: df['id'].str[1:].astype(int),
        # Text to embed: project name followed by the PDO ('' when the PDO is missing).
        name_plus_pdo=lambda df: df['project_name'] + ' ' + df['pdo'].fillna(''),
    )
    .sort_values(by=['numeric_id'])
)

In [7]:
# Set to True to export the cleaned project table as CSV.
save_clean = False

if save_clean:
    clean_csv_path = 'clean_wb_proj_all.csv'
    proj.to_csv(clean_csv_path)

In [8]:
# Preview the first rows of the project table after sorting by numeric_id.
proj.head()

Unnamed: 0,id,regionname,countryname,projectstatusdisplay,project_name,pdo,impagency,cons_serv_reqd_ind,url,boardapprovaldate,...,lendinginstr,envassesmentcategorycode,esrc_ovrl_risk_rate,sector1,sector2,sector3,theme1,theme2,numeric_id,name_plus_pdo
11626,P000001,Africa,Africa,Closed,West Africa Pilot Community-based Natural Reso...,,,,http://projects.worldbank.org/P000001/west-afr...,1995-09-14T00:00:00Z,...,Specific Investment Loan,B,,Sub-National Government,Social Protection,"Other Agriculture, Fishing and Forestry",Rural services and infrastructure,Biodiversity,1,West Africa Pilot Community-based Natural Reso...
10945,P000003,Africa,Africa,Closed,REIMP(CEN.ENV.INFO),,,,http://projects.worldbank.org/P000003/reimpcen...,1997-12-18T00:00:00Z,...,Specific Investment Loan,C,,Other Public Administration,Other Information and Communications Technologies,Social Protection,Land administration and management,Environmental policies and institutions,3,REIMP(CEN.ENV.INFO)
13154,P000010,Africa,Africa,Closed,Regional Development Project (03),,,,http://projects.worldbank.org/P000010/null?lan...,1990-02-01T00:00:00Z,...,Financial Intermediary Loan,B,,Banking Institutions,"Other Agriculture, Fishing and Forestry",Other Transportation,Environmental policies and institutions,Other financial and private sector development,10,Regional Development Project (03)
12588,P000017,Africa,Africa,Closed,Engineering and Technical Assistance Project,,,,http://projects.worldbank.org/P000017/engineer...,1992-05-19T00:00:00Z,...,Technical Assistance Loan,C,,Power,,,Legal institutions for a market economy,Regional integration,17,Engineering and Technical Assistance Project
18015,P000019,Africa,Africa,Dropped,UEMOA Regional Financial Sector Project,,,,http://projects.worldbank.org/P000019/null?lan...,,...,Specific Investment Loan,C,,,,,,,19,UEMOA Regional Financial Sector Project


In [9]:
# Either reuse the cached embeddings pickle, or start a fresh container that
# holds only the project ids and the raw texts to be embedded.
restore_embeddings = True

if restore_embeddings:
    embeddings = load_prior_embeds()
else:
    embeddings = dict(
        project_ids=proj.id.to_numpy(),
        title_plus_pdo_text=proj.name_plus_pdo.to_numpy(),
    )

# Expose ids and texts under the same names on both paths so later cells do
# not depend on which branch ran.
project_ids = embeddings['project_ids']
title_plus_pdo = embeddings['title_plus_pdo_text']

In [19]:
# Set to True to recompute the title+PDO embeddings and overwrite the cache.
regen_embeddings = False

if regen_embeddings:
    # Take the texts from the embeddings dict so this cell works whether the
    # dict was restored from disk or freshly built.
    encoded_sentences = model.encode(embeddings['title_plus_pdo_text'], show_progress_bar=True)
    embeddings['title_plus_pdo_embed'] = encoded_sentences
    save_embeds(embeddings)

In [18]:
# List which entries the embeddings dict currently holds.
embeddings.keys()

dict_keys(['project_ids', 'title_plus_pdo_text', 'title_plus_pdo_embed'])

In [26]:
# NOTE: the file is named "pdo.csv" but, per the original author's note, it
# actually contains the DLIs (one indicator per row).
ind_df = pd.read_csv('./data/pdo.csv')

In [27]:
# Preview the raw indicator rows.
ind_df.head()

Unnamed: 0,proj_id,ind_code,ind_name,ind_uom_id,uom_name,baseline_date,baseline_val_text,progress_date,progress_val_text,tgt_date,...,proj_ind_usage_type_code,proj_short_name,proj_stat_name,apprvl_fy,est_cls_fy,lndng_instr_type_name,cntry_code,cntry_short_name,rgn_name,rgn_code
0,P000216,IND0028585,Number of scholarships for medical studies in ...,UOM0000004,,1996-02-09,47,2005-10-21,40,2005-12-31,...,CI,BI Health & Population II,Closed,1995,2009,IPF,BI,Burundi,Africa East,AFE
1,P000216,IND0028586,Number of hospitals with functioning managemen...,UOM0000004,,1996-02-09,0,2005-10-21,11,2005-12-31,...,CI,BI Health & Population II,Closed,1995,2009,IPF,BI,Burundi,Africa East,AFE
2,P000216,IND0028584,% children receiving polio vaccination.,UOM0000004,,1996-02-09,73%,2005-10-21,76%,2005-12-31,...,CI,BI Health & Population II,Closed,1995,2009,IPF,BI,Burundi,Africa East,AFE
3,P000306,IN00665008,Access to safe water in urban cities covered b...,UOM0000004,,2001-03-20,Level of access to piped water in 1994: 57% of t,2007-12-10,74% of the urban population have access to piped,2007-12-31,...,CI,BF-Ouaga Water Suply (FY01),Closed,2001,2008,IPF,BF,Burkina Faso,Africa West,AFW
4,P000306,IN00665006,Population connected to the water distribution...,UOM0000004,,2001-03-20,"300,000 inhabitants",2007-12-10,"1,042,000 inhabitants have direct access to wate",2007-12-31,...,CI,BF-Ouaga Water Suply (FY01),Closed,2001,2008,IPF,BF,Burkina Faso,Africa West,AFW


In [28]:
# List the available columns of the indicator table.
ind_df.columns

Index(['proj_id', 'ind_code', 'ind_name', 'ind_uom_id', 'uom_name',
       'baseline_date', 'baseline_val_text', 'progress_date',
       'progress_val_text', 'tgt_date', 'tgt_val_text',
       'proj_ind_usage_type_code', 'proj_short_name', 'proj_stat_name',
       'apprvl_fy', 'est_cls_fy', 'lndng_instr_type_name', 'cntry_code',
       'cntry_short_name', 'rgn_name', 'rgn_code'],
      dtype='object')

In [29]:
# Number of distinct indicator codes; compare with len(ind_df) to see how
# many rows share a code (almost none do).
ind_df.ind_code.nunique()

14643

In [30]:
# Total number of indicator rows, for comparison with the unique-code count.
len(ind_df)

14659

In [31]:
# Keep only the columns used to build the per-project DLI text. The explicit
# .copy() makes ind_df an independent frame, so adding or overwriting columns
# on it afterwards does not trigger pandas' SettingWithCopyWarning.
ind_df = ind_df[["proj_id", "ind_name", "uom_name", "baseline_val_text", "tgt_val_text", "proj_short_name"]].copy()

In [32]:
# Replace missing units of measure with empty strings so they vanish when
# interpolated into text.
uom_filled = ind_df["uom_name"].fillna("")
ind_df["uom_name"] = uom_filled

In [33]:
# Preview the narrowed indicator table.
ind_df.head()

Unnamed: 0,proj_id,ind_name,uom_name,baseline_val_text,tgt_val_text,proj_short_name
0,P000216,Number of scholarships for medical studies in ...,,47,60,BI Health & Population II
1,P000216,Number of hospitals with functioning managemen...,,0,11,BI Health & Population II
2,P000216,% children receiving polio vaccination.,,73%,85%,BI Health & Population II
3,P000306,Access to safe water in urban cities covered b...,,Level of access to piped water in 1994: 57% of t,75% of the urban population having access to to,BF-Ouaga Water Suply (FY01)
4,P000306,Population connected to the water distribution...,,"300,000 inhabitants","800,000 inhabitants.",BF-Ouaga Water Suply (FY01)


In [34]:
# Frequency of each unit of measure; the blank label is the bucket of
# missing units that were filled with "".
ind_df.uom_name.value_counts()

                5772
Number          3978
Percentage      3624
Yes/No           344
Amount(USD)      194
Hectare(Ha)      183
Kilometers       152
Metric ton       140
Days              79
Tones/year        74
Megawatt          66
Hours             52
Microgram/m3       1
Name: uom_name, dtype: int64

In [39]:
# Build one descriptive string per indicator: name, baseline and target,
# each value followed by its unit of measure.
# NOTE(review): missing values are interpolated as the literal text "nan" —
# confirm that is acceptable for the embedding input.
ind_df["dli_comb_text"] = [
    f"IND: {name}, BASE: {base} {uom}, TGT: {tgt} {uom}"
    for name, base, tgt, uom in zip(
        ind_df["ind_name"],
        ind_df["baseline_val_text"],
        ind_df["tgt_val_text"],
        ind_df["uom_name"],
    )
]

In [45]:
# Whitespace-delimited word count of each indicator name.
ind_df['ind_name_length'] = ind_df['ind_name'].str.split().str.len()

In [46]:
# Preview the indicator table with the combined text and word-count columns.
ind_df.head()

Unnamed: 0,proj_id,ind_name,uom_name,baseline_val_text,tgt_val_text,proj_short_name,dli_comb_text,ind_name_length
0,P000216,Number of scholarships for medical studies in ...,,47,60,BI Health & Population II,IND: Number of scholarships for medical studie...,16.0
1,P000216,Number of hospitals with functioning managemen...,,0,11,BI Health & Population II,IND: Number of hospitals with functioning mana...,9.0
2,P000216,% children receiving polio vaccination.,,73%,85%,BI Health & Population II,"IND: % children receiving polio vaccination., ...",5.0
3,P000306,Access to safe water in urban cities covered b...,,Level of access to piped water in 1994: 57% of t,75% of the urban population having access to to,BF-Ouaga Water Suply (FY01),IND: Access to safe water in urban cities cove...,10.0
4,P000306,Population connected to the water distribution...,,"300,000 inhabitants","800,000 inhabitants.",BF-Ouaga Water Suply (FY01),IND: Population connected to the water distrib...,9.0


In [47]:
# Collapse the indicator rows to one record per project:
#   num_dlis        - number of non-null indicator names
#   combined_text   - the per-indicator texts joined with '<SEP>'
#   combined_length - total word count over the indicator names
per_project = ind_df.groupby(['proj_id'], as_index=False)
proj_ind_df = per_project.agg(
    num_dlis=pd.NamedAgg(column='ind_name', aggfunc='count'),
    combined_text=pd.NamedAgg(column='dli_comb_text', aggfunc='<SEP>'.join),
    combined_length=pd.NamedAgg(column='ind_name_length', aggfunc='sum'),
)

In [48]:
# Preview the per-project DLI aggregates.
proj_ind_df.head()

Unnamed: 0,proj_id,num_dlis,combined_text,combined_length
0,P000216,3,IND: Number of scholarships for medical studie...,30.0
1,P000306,4,IND: Access to safe water in urban cities cove...,41.0
2,P000309,4,"IND: Repetition ratio, BASE: N= 13.3% , TGT: N...",28.0
3,P000527,7,IND: Primary level Gross Enrollment Rate (GER)...,78.0
4,P000756,12,IND: Farmers adopting improved agricultural te...,80.0


In [None]:
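# NOTE: disabled duplicate of the body of the `regen_embeddings` cell; kept
# for reference only. Flip `regen_embeddings` to True instead of uncommenting.
# encoded_sentences = model.encode(title_plus_pdo, show_progress_bar=True)
# embeddings['title_plus_pdo_embed'] = encoded_sentences
# save_embeds(embeddings)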

In [62]:
# Extract ids and texts from the same frame so position i in each array
# refers to the same project.
ind_proj_ids, combined_text_arr = (
    proj_ind_df[column].to_numpy() for column in ('proj_id', 'combined_text')
)

In [63]:
# Embed each project's combined DLI text (one vector per project).
# NOTE(review): the combined texts can be long; presumably the model
# truncates inputs beyond its maximum sequence length — confirm how much of
# each text is actually encoded.
encoded_combined_dlis = model.encode(combined_text_arr, show_progress_bar=True)

Batches:   0%|          | 0/82 [00:00<?, ?it/s]

In [64]:
# Sanity check: expect one embedding row per project in proj_ind_df.
encoded_combined_dlis.shape

(2611, 768)

In [65]:
# Number of projects that have DLIs; should match the embedding row count.
len(proj_ind_df)

2611

In [66]:
# Shape of the title+PDO embeddings, for comparison with the DLI embeddings.
embeddings['title_plus_pdo_embed'].shape

(21062, 768)

In [56]:
# Left-join the per-project DLI aggregates onto the project table.
# Columns left behind by an earlier run of this cell are dropped first, so
# re-running it does not pile up suffixed duplicates (`num_dlis_x`, ...).
stale_dli_cols = [col for col in proj_ind_df.columns if col in proj.columns]
proj = proj.drop(columns=stale_dli_cols).merge(
    proj_ind_df,
    left_on='id',
    right_on='proj_id',
    how='left',
    # Each project must match at most one aggregate row; fail loudly otherwise.
    validate='many_to_one',
)

In [67]:
# Bundle the project ids with their DLI embedding matrix.
dli_embeddings = dict(
    project_ids=ind_proj_ids,
    embedded_dlis=encoded_combined_dlis,
)

In [68]:
# Persist the DLI embeddings bundle as a pickle.
dli_pickle_path = "./data/dli_embeddings.pkl"
with open(dli_pickle_path, "wb") as pkl_file:
    pickle.dump(dli_embeddings, pkl_file, pickle.HIGHEST_PROTOCOL)

In [69]:
# Write the per-project DLI aggregates (ids, counts, combined text) to CSV
# without the row index.
proj_ind_df.to_csv('./data/dli_metadata.csv', index=False)