<a href="https://colab.research.google.com/github/oghosa/WorldBankIdeasForAction/blob/master/ideaForAction_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import io
import codecs
from sklearn import feature_extraction
from google.colab import files
#import mpld3

In [4]:
# Ask the user to upload the input file through Colab's file helper; the
# saved output below shows PastWinningIdeas.csv being stored in the working directory.
uploaded = files.upload()

Saving PastWinningIdeas.csv to PastWinningIdeas.csv


In [0]:
# Load the uploaded spreadsheet. The file is decoded as ISO-8859-1 (Latin-1)
# instead of pandas' default UTF-8.
ideas = pd.read_csv('PastWinningIdeas.csv', encoding = "ISO-8859-1")

In [24]:
# Peek at the abstract text column (at most the first 50 rows).
ideas['Full Abstract'].head(50)

0     Remittances represent significant financial fl...
1     The goal of the proposal is to provide a pract...
2     Although mining gives governments in many coun...
3     Millions of people in developing countries use...
4     Impact.PH seeks to drive strategic philanthrop...
5     Realizing new modalities for funding developme...
6     Our proposal was first sent to the contest of ...
7     We propose the application of Development Impa...
8     We identify return migrants as an optimal grou...
9     The post-2015 development agenda will be adopt...
10    Industrial energy provision from the power gri...
11    Roughly 10 percent of all research and develop...
12    Our vision: We believe that people are born eq...
13    Global remittances exceed the size of global d...
14    Efforts to support innovation and entrepreneur...
15    Surya addresses the prevalence of diesel-power...
16    There is a growing despair syndrome in Africa ...
17    The project involves the consolidation of 

In [6]:
# Keep only the first 49 rows and the first 13 columns of the raw sheet.
# .copy() makes this an independent frame: a later cell adds a 'Cluster'
# column to it, and writing into a bare iloc slice triggers pandas'
# SettingWithCopyWarning (and makes it ambiguous whether `ideas` is touched).
ideas_compact = ideas.iloc[:49, :13].copy()
ideas_compact.shape

(49, 13)

In [7]:
# Preview the first five rows of the trimmed frame.
ideas_compact.head()

Unnamed: 0,Project ID,Year,Year Project Number,Project Name,Document Link,Legitimacy,Website Link,Country,Topic Area,Notes,Full Abstract,Rank,Tech-Solution
0,2015-1,2015.0,1.0,Creating New Microinsurance Products for Remit...,https://openknowledge.worldbank.org/bitstream/...,,,India,"Remittance, \nMicroinsurance",,Remittances represent significant financial fl...,1,
1,2015-2,2015.0,2.0,Innovative PPP (public-private partnership) Mo...,,,,Nigeria,Agricluture,,The goal of the proposal is to provide a pract...,2,
2,2015-3,2015.0,3.0,Decreasing Poverty in the Mining Communities o...,,,,Peru,Mining,,Although mining gives governments in many coun...,3,
3,2015-4,2015.0,4.0,Development Impact BondsThe Power of Particip...,,,,,"Development Impact Bonds (DIBs), \nCooking",- Broad scope to all developing nations that u...,Millions of people in developing countries use...,4,
4,2015-5,2015.0,5.0,Impact.PH: An Initiative to Enhance and Transf...,,,,Philippine,"Data, Infomation","- "" drive strategic philanthropy and become th...",Impact.PH seeks to drive strategic philanthrop...,5,Yes - Online Database


In [8]:
# Fetch the NLTK resources used below: the stop-word corpus and the Punkt
# sentence-tokenizer models.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# NLTK's English stop-word list; show the first ten entries as a sanity check.
stopwords = nltk.corpus.stopwords.words('english')
stopwords[:10]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [0]:
# Load NLTK's SnowballStemmer and bind an English-language instance to `stemmer`.
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [0]:
# here I define a tokenizer and stemmer which returns the list of stems in the text that it is passed

def tokenize_and_stem(text):
    """Split ``text`` into word tokens and return their stems as a list.

    The text is split into sentences first and each sentence into words, so
    punctuation ends up as separate tokens. Tokens without any ASCII letter
    (numbers, bare punctuation) are dropped; the rest are stemmed with the
    module-level ``stemmer``, in their original order.
    """
    has_letter = re.compile('[a-zA-Z]').search
    stems = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            if has_letter(word):
                stems.append(stemmer.stem(word))
    return stems


def tokenize_only(text):
    """Split ``text`` into lower-cased word tokens, without stemming.

    Uses the same sentence-then-word tokenization and the same letter filter
    as ``tokenize_and_stem``, so for a given text both functions return lists
    of equal length whose entries line up one-to-one.
    """
    lowered = (
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    )
    # keep only tokens containing at least one letter (drops numbers and raw punctuation)
    return [word for word in lowered if re.search('[a-zA-Z]', word)]

In [0]:
# Pull out the two text columns used below: project titles (for reporting)
# and full abstracts (the documents that get tokenized and vectorized).
titles = ideas_compact['Project Name']
abstracts = ideas_compact['Full Abstract']

In [12]:
# Spot-check the first few project titles
# (use abstracts.head() instead to spot-check the abstract text).
titles.head()

0    Creating New Microinsurance Products for Remit...
1    Innovative PPP (public-private partnership) Mo...
2    Decreasing Poverty in the Mining Communities o...
3    Development Impact BondsThe Power of Particip...
4    Impact.PH: An Initiative to Enhance and Transf...
Name: Project Name, dtype: object

In [0]:
# Build two parallel vocabularies over all abstracts: position k of
# totalvocab_stemmed holds the stem of the token at position k of
# totalvocab_tokenized.
totalvocab_stemmed, totalvocab_tokenized = [], []
for i in abstracts:
    allwords_stemmed = tokenize_and_stem(i)   # stems of this abstract's tokens
    allwords_tokenized = tokenize_only(i)     # the same tokens, lower-cased and unstemmed
    totalvocab_stemmed += allwords_stemmed
    totalvocab_tokenized += allwords_tokenized

In [14]:
# Lookup table from stem (index) to an original surface word ('words' column).
# One row per token occurrence, so index values repeat: a stem used many
# times, or shared by several word forms, appears on many rows.
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
print('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')

there are 12005 items in vocab_frame


In [15]:
# Preview the stem -> word lookup table.
vocab_frame.head()

Unnamed: 0,words
remitt,remittances
repres,represent
signific,significant
financi,financial
flow,flows


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(abstracts) #fit the vectorizer to synopses

print(tfidf_matrix.shape)

CPU times: user 370 ms, sys: 11.2 ms, total: 381 ms
Wall time: 388 ms
(49, 85)


In [17]:
# `terms` is the vocabulary of the tf-idf matrix: terms[j] is the stemmed
# n-gram behind column j.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() where it exists and fall back on older installs.
# .tolist() keeps `terms` a plain list of str, as before.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    terms = tfidf_vectorizer.get_feature_names()
terms

['access',
 'activ',
 'address',
 'aim',
 'avail',
 'becaus',
 'becom',
 'busi',
 'challeng',
 'communiti',
 'cost',
 'countri',
 'creat',
 'current',
 'custom',
 'develop goal',
 'econom',
 'effect',
 'enabl',
 'encourag',
 'exist',
 'face',
 'financ',
 'financi',
 'focus',
 'fund',
 'global',
 'goal',
 'govern',
 'growth',
 'health',
 'help',
 'high',
 'howev',
 'implement',
 'improv',
 'includ',
 'increas',
 'ing',
 'initi',
 'innov',
 'institut',
 'lack',
 'local',
 'make',
 'mani',
 'market',
 'model',
 'nation',
 'need',
 'new',
 'onli',
 'opportun',
 'organ',
 'past',
 'peopl',
 'percent',
 'platform',
 'popul',
 'potenti',
 'problem',
 'process',
 'product',
 'program',
 'project',
 'propos',
 'provid',
 'region',
 'resourc',
 'sector',
 'secur',
 'servic',
 'social',
 'solut',
 'sourc',
 'support',
 'sustain',
 'sustain develop',
 'technolog',
 'tion',
 'unit',
 'use',
 'work',
 'world',
 'year']

In [0]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine *distance* between abstracts: one minus the cosine
# similarity of their tf-idf rows.
dist = 1 - cosine_similarity(tfidf_matrix)


# Clustering
Here I cluster the projects into 5 groups based on their abstracts using k-means clustering for the grouping and tf-idf for the document similarity scoring.

Here, I define term frequency-inverse document frequency (tf-idf) vectorizer parameters and then convert the abstracts list into a tf-idf matrix.

To get a Tf-idf matrix, first count word occurrences by document. This is transformed into a document-term matrix (dtm). This is also just called a term frequency matrix. An example of a dtm is below.

![document-term matrix (dtm)](http://www.jiem.org/index.php/jiem/article/viewFile/293/252/2402)

Then apply the term frequency-inverse document frequency weighting: words that occur frequently within a document but not frequently within the corpus receive a higher weighting as these words are assumed to contain more meaning in relation to the document.

A couple things to note about the parameters I define below:

* **max_df:** this is the maximum frequency within the documents a given feature can have to be used in the tf-idf matrix. If the term is in greater than 80% of the documents it probably carries little meaning (in the context of project abstracts)
* **min_df:** this could be an integer (e.g. 5) and the term would have to be in at least 5 of the documents to be considered. Here I pass 0.2; the term must be in at least 20% of the documents. I found that if I allowed a lower min_df I ended up basing clustering on names--for example "Michael" or "Tom" are names found in several of the movies and the synopses use these names frequently, but the names carry no real meaning.
* **ngram_range:** this just means I'll look at unigrams, bigrams and trigrams. See [n-grams](https://en.wikipedia.org/wiki/N-gram)

In [19]:
from sklearn.cluster import KMeans

num_clusters = 5

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

CPU times: user 68.7 ms, sys: 0 ns, total: 68.7 ms
Wall time: 69.1 ms


In [0]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; joblib is a standalone package. Fall back to the vendored copy only
# on old installs where the top-level package is missing.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted model so the clustering can be reused without refitting.
joblib.dump(km,  'doc_cluster.pkl')

# To reuse a previously saved model instead of the one fitted above,
# uncomment the next line. Only load pickles you created yourself:
# unpickling can execute arbitrary code.
#km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()

In [21]:
# Show the cluster id assigned to each abstract, in row order.
clusters

[4,
 4,
 1,
 4,
 2,
 1,
 1,
 1,
 3,
 2,
 1,
 3,
 4,
 1,
 2,
 1,
 1,
 2,
 3,
 0,
 3,
 3,
 1,
 2,
 4,
 1,
 3,
 1,
 4,
 4,
 4,
 2,
 4,
 2,
 3,
 1,
 4,
 0,
 0,
 4,
 1,
 4,
 0,
 4,
 2,
 3,
 4,
 3,
 4]

In [22]:
# Attach each project's cluster id. assign() returns a new frame instead of
# writing a column into what may be a slice of `ideas`, which avoids pandas'
# SettingWithCopyWarning and keeps this cell safe to re-run.
ideas_compact = ideas_compact.assign(Cluster=clusters)
ideas_compact


Unnamed: 0,Project ID,Year,Year Project Number,Project Name,Document Link,Legitimacy,Website Link,Country,Topic Area,Notes,Full Abstract,Rank,Tech-Solution,Cluster
0,2015-1,2015.0,1.0,Creating New Microinsurance Products for Remit...,https://openknowledge.worldbank.org/bitstream/...,,,India,"Remittance, \nMicroinsurance",,Remittances represent significant financial fl...,1,,4
1,2015-2,2015.0,2.0,Innovative PPP (public-private partnership) Mo...,,,,Nigeria,Agricluture,,The goal of the proposal is to provide a pract...,2,,4
2,2015-3,2015.0,3.0,Decreasing Poverty in the Mining Communities o...,,,,Peru,Mining,,Although mining gives governments in many coun...,3,,1
3,2015-4,2015.0,4.0,Development Impact BondsThe Power of Particip...,,,,,"Development Impact Bonds (DIBs), \nCooking",- Broad scope to all developing nations that u...,Millions of people in developing countries use...,4,,4
4,2015-5,2015.0,5.0,Impact.PH: An Initiative to Enhance and Transf...,,,,Philippine,"Data, Infomation","- "" drive strategic philanthropy and become th...",Impact.PH seeks to drive strategic philanthrop...,5,Yes - Online Database,2
5,2015-6,2015.0,6.0,Introducing Internet-Based Funding Mechanisms ...,,,,,"World Bank,\nCrowdfunding","- ""This proposal outlines how the World Bank c...",Realizing new modalities for funding developme...,6,Yes - Crowdfunding,1
6,2015-7,2015.0,7.0,Biodiversity Ambassadors: Strategies for Bette...,,Functional Company,http://www.ecoexperiencias.com/,Mexico,"Biodiversity, \nGovernace",,Our proposal was first sent to the contest of ...,NA - Honourable mention,,1
7,2015-8,2015.0,8.0,Development Impact Bonds: Financing the Treatm...,,,,Nigeria,"Development Impact Bonds (DIBs), \nTropical Di...",,We propose the application of Development Impa...,NA - Honourable mention,,1
8,2015-9,2015.0,9.0,Diaspora Bonds for Small-Business Promotion: I...,,,,,"Return Migrants, Skills Development",,We identify return migrants as an optimal grou...,NA - Honourable mention,,3
9,2015-10,2015.0,10.0,Ensuring Effective Implementation of Domestic ...,,,,,Domestic Resource Mobilization (DRM),- Focus on Domestic Resource Mobilization (DRM),The post-2015 development agenda will be adopt...,NA - Honourable mention,,2


In [23]:
# Number of projects in each cluster (cluster ids run from 0 to 4).
ideas_compact['Cluster'].value_counts()

4    15
1    13
3     9
2     8
0     4
Name: Cluster, dtype: int64

In [24]:
# Group the within-year project number by cluster id so it can be aggregated.
grouped = ideas_compact.groupby('Cluster')['Year Project Number']

# Average project number per cluster.
grouped.mean()

Cluster
0    4.500000
1    8.923077
2    8.875000
3    7.000000
4    5.266667
Name: Year Project Number, dtype: float64

In [57]:
from __future__ import print_function

print("Top terms per cluster:")
print()
# For every centroid, rank the term (column) indices from the largest to the
# smallest tf-idf weight in that centroid.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')

    for ind in order_centroids[i, :6]:  # the six highest-weighted terms of this centroid
        # A feature may be an n-gram of several stems separated by spaces
        # (e.g. 'sustain develop'). Map every stem to the first surface word
        # recorded for it and print the whole phrase; previously only the
        # first stem's word was shown, so n-grams looked like unigrams.
        surface_words = [vocab_frame.loc[[stem], 'words'].iloc[0]
                         for stem in terms[ind].split(' ')]
        print(' %s' % ' '.join(surface_words), end=',')
    print() #add whitespace

    print("Cluster %d titles:" % i)
    for name in ideas_compact.loc[ideas_compact['Cluster'] == i, 'Project Name'].tolist():
        print(' %s' % name)
    print() #add whitespace
    print() #add whitespace

print()
print()

Top terms per cluster:

Cluster 0 words: secure, technology, goal, addressed, communities, sustainable,
Cluster 0 titles:
 OINCS: Using a Collaborative and Community Driven Approach to Citizen Mobility and Security
 Wicked Problems, Simple Solutions, and a New Generation of Changemakers
 The Implementation of Advanced Water Treatment Technology in Aquaculture to Ensure Sustainable Development for World Food Security

 Ensuring Food Safety in China and Beyond through Blockchain Technology


Cluster 1 words: proposal, financing, government, local, projects, using,
Cluster 1 titles:
 Decreasing Poverty in the Mining Communities of the World through the Empowerment of Communities in the Control of Mining Royalty Funds: An Application to the Peruvian Case
 Introducing Internet-Based Funding Mechanisms for World Bank Operations
 Biodiversity Ambassadors: Strategies for Better Governance of Biodiversity through the Participation of Children and Youth with the Expansion of Ecoexperiencias in M