In [1]:
#Dependencies and Packages
import os
import numpy as np
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
from nltk.corpus import words
import pymongo

## Load Data from S3

In [2]:
if not os.path.isdir('iati_text'):
    !aws s3 cp s3://mdang.w210/iati_text.txt.gz --region us-west-2 .
    #!tar -zxf iati_text.tar.gz #not tar file
    

download: s3://mdang.w210/iati_text.txt.gz to ./iati_text.txt.gz


### 1. Load the activities data into 209 separate "documents"

In [15]:
#Exploring number of organizations and activities to work with 
t0 = time()
orgs_ids = set()
with open('iati_text.txt', 'r') as myfile:
    lines = myfile.readlines()
    print len(lines)
    for line in lines:
        line = line.split('\t')
        orgs_ids.add(line[0])
print 'Number of orgs reporting ' + str(len(orgs_ids))
print time() - t0

546742
Number of orgs reporting 316
12.2858359814


In [16]:
#New version to use
t0 = time()
corpus = set(words.words()) #create corpus to use 

cur_line = 0
orgs_with_text = {} #dictionary to hold org ids as keys and a list of all text as a value
with open('iati_text.txt', 'r') as infile: #open infile
    lines = infile.readlines()
    for line in lines: #for each line of text in iati text
        if cur_line % 50000 == 0: #sanity check
            print 'Completed ' + str(cur_line) + ' lines'
        cur_id, data = line.split('\t') #split line
        working = orgs_with_text.get(cur_id) #retrieve value 
        if working is None: #if this doesn't exist then initialize a key and an empty list
            orgs_with_text[cur_id] = []
            working = []
        for word in data.split(): #split into word tokens
            if word.lower() in corpus: #check if word in corpus
                working.append(word.lower()) #add to list
        orgs_with_text[cur_id] = working #update dictionary value
        cur_line += 1

print len(orgs_with_text.keys())
print 'time: ' + str(time() - t0)


Completed 0 lines
Completed 50000 lines
Completed 100000 lines
Completed 150000 lines
Completed 200000 lines
Completed 250000 lines
Completed 300000 lines
Completed 350000 lines
Completed 400000 lines
Completed 450000 lines
Completed 500000 lines
316
time: 106.091364145


### 2. Rerun tfidf vectorizer with the new document space

In [17]:
t0 = time()
raw_text = [] #holder for all text, these are words in the corpus
for doc in orgs_with_text.values():
    new_doc = str(doc) #render as a string
    raw_text.append(new_doc)

vectorizer = TfidfVectorizer(stop_words = 'english') #create vectorizer
tf_matrix = vectorizer.fit_transform(raw_text)

print 'vectorized in ' + str(time() - t0)
print tf_matrix.shape


vectorized in 89.598731041
(316, 22557)


# TODO: NOT SURE WHY THIS IS TAKING FOREVER

In [9]:
#Dump files to pickle

with open('vectorizer.pkl', 'wb') as handle:
    pickle.dump(vectorizer, handle)
    

In [10]:
with open('tfMatrix.pkl', 'wb') as handle:
    pickle.dump(tf_matrix, handle)


In [11]:
with open('orgs_text.pkl', 'wb') as handle:
    pickle.dump(orgs_with_text, handle)

### 3. Compute Neighbors based on cosine similarity of documents

In [12]:
#Load files from pickle
vectorizer = pickle.load(open('vectorizer.pkl', 'rb'))
tf_matrix = pickle.load(open('tfMatrix.pkl', 'rb'))
orgs_with_text = pickle.load(open('orgs_text.pkl', 'rb'))


In [18]:
#Returns the nearest neighbors as sorted by cosine similarity
def getSimilarity(org_id):
    num = None
    for tupl in enumerate(orgs_with_text.keys()):
        if org_id == tupl[1]: #match id
            num = tupl[0] #return number corresponding to row in matrix
            break
    similarity_matrix = cosine_similarity(tf_matrix[num], tf_matrix) #get cosine similarity
    sorted_matrix = np.argsort(similarity_matrix[0]) #sort and get ids
    nums = list(sorted_matrix[-6:])[::-1][1:] #render top number as list and reverse order (most similar is first)
    #in above line reverse and then remove first item which is the same ID
    to_return = [] 
    for num in nums: #take numeric placeholders and translate into ids 
        to_return.append(orgs_with_text.keys()[num])
    return to_return

nums = getSimilarity('NL-KVK-41201644')
print nums

['NI-MIGOB-3602', 'NL-KVK-41149287', 'GB-CHC-1138287', 'GB-CHC-291691', 'GB-CHC-1090745']


In [None]:
['NI-MIGOB-3602', 'NL-KVK-41149287', 'GB-CHC-1138287', 'GB-CHC-291691', 'GB-CHC-1090745']

## Old - Might be useful

In [8]:
#old version
t0 = time()
corpus = set(words.words()) #set of unique words from nltk, rendered to set to make convenient
with open('org_bag.txt', 'w') as outfile: #outfile to write to
    with open('iati_text.txt', 'r') as infile: #reading from infile
        lines = infile.readlines()
        org_counter = 0 #counter for sanity
        for org in orgs_ids: #go through all organizations, grouped by ids
            if org_counter % 50 == 0:
                print 'At organization ' + str(org_counter) #prints out message for sanity
            bag = [] #list to hold found words per organization
            for line in lines[100000:]: #limiting results for now to make faster
                line = line.split('\t') #split on tab
                if line[0] == org: #if the first column matches the current organization ID
                    words = line[1].split() #split into list of words
                    for word in words:
                        if word.lower() in corpus: #for all words check if in our corpus and is a recognized word
                            bag.append(word.lower()) #append if true
            #I am pretty sure this is poorly written and there's an easier way to write this, 
            #I had problems previously
            for word in bag:
                outfile.write(word + ' ')
            outfile.write('\n')
            org_counter += 1

with open('org_bag.txt', 'r') as myfile:
    lines = myfile.readlines()
    print 'have ' + str(len(lines)) + ' lines'
                        
print time() - t0                

At organization 0
At organization 50
At organization 100
At organization 150
At organization 200
At organization 250
At organization 300
have 316 lines
333.677332163


## Natarajan's Version

In [5]:
from collections import defaultdict
import io
from nltk.corpus import words
import re

corpus=set(map(str.lower, words.words()))
token_re=re.compile(r'(?u)\b\w\w+\b')
def tokenize(line):
    for token in token_re.findall(line):
        tokenl=token.lower()
        if tokenl in corpus:
            yield tokenl

vocab2idx={}
idx2vocab=[]
org_ids=defaultdict(set)
with io.open('iati_text.txt', 'rt', encoding='utf-8') as infile: #reading from infile
    for line in infile:
        try:
            id, data = line.split('\t', 1)
            indices=org_ids[id]
            for token in tokenize(data):
                idx=vocab2idx.get(token)
                if not idx:
                    idx=len(vocab2idx)
                    vocab2idx[token]=idx
                    idx2vocab.append(token)
                indices.add(idx)
        except ValueError as e:
            print("Unable to process '" + line.strip() + "': ", e)
def cos_sim(id1, id2):
    s1=org_ids[id1]
    s2=org_ids[id2]
    return len(s1 & s2) ** 2 / (len(s1)*len(s2))

TypeError: descriptor 'lower' requires a 'str' object but received a 'unicode'

In [2]:
# #Examine file structure
# with open('iati_text.txt', 'r') as myfile:
#     lines = myfile.readlines()
#     for i in range(5):
#         print lines[i]
    

In [9]:
print len(data)

587160


In [12]:
#import data and time how long that took
t0 = time()
with open('iati_text.txt', 'r') as myfile:
    data = myfile.readlines()
    
print "imported data in %0.3fs" % (time() - t0)

print data[10000].split()

imported data in 1.490s


In [4]:
t0 = time()
word_count = 0
total_count = 0 
corpus = set(words.words())

for word in data[587].split():
    total_count += 1
    if word.lower() in corpus:
        word_count += 1
print time() - t0
print word_count
print total_count
print float(word_count)/total_count

0.143539905548
42
110
0.381818181818


In [4]:
#TODO check to make sure set is giving the same results
t0 = time()
word_count = 0
total_count = 0
activity_count = 0
corpus = set(words.words())

for activity in data[:100]:
    for word in activity.split():
        total_count += 1
        if word.lower() in corpus:
            word_count += 1
    activity_count += 1

print time() - t0        
print word_count
print total_count
print float(word_count)/total_count

0.159631967545
7306
22421
0.325855225012


_10 cases, words.words()_
114.761632204
455
1298
0.350539291217

_100 cases, words.words()_

2405.8098011
7306
22421
0.325855225012

_100 cases, corpus_
0.159631967545
7306
22421
0.325855225012

To run through the first 100 activities with the words corpus took 2350 seconds

Using a set reduced it to

In [4]:
corpus = set(words.words())
# for item in thelist:
#   thefile.write("%s\n" % item)

with open('englishWords.txt', 'w') as myfile:
    for activity in data:
        english = []
        for word in activity.split():
            if word.lower() in corpus:
                english.append(word.lower())
        for word in english:
            myfile.write(word + ' ')
        myfile.write('\n')
            

# with open('englishWords.txt', 'r') as myfile:
#     lines = myfile.readlines()
#     print len(lines)

In [5]:
#use TFIDF Vectorizer on new data

with open('englishWords.txt', 'r') as myfile:
    inputData = myfile.readlines()
    
t0 = time()
vectorizer = TfidfVectorizer(stop_words = 'english')
tf_matrix = vectorizer.fit_transform(inputData)

print 'vectorized in %0.3fs' % (time() - t0)
print tf_matrix.shape

vectorized in 70.467s
(587160, 23472)


In [6]:
#Save files for later use
with open('vectorizer.pkl', 'wb') as handle:
    pickle.dump(vectorizer, handle)
    
with open('tfMatrix.pkl', 'wb') as handle:
    pickle.dump(tf_matrix, handle)

In [7]:
vectorizer = pickle.load(open('vectorizer.pkl', 'rb'))

In [8]:
test = 'water projects in department'
newMatrix = vectorizer.transform([test])
newMatrix

<1x23472 sparse matrix of type '<type 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

__Saving and Functions__

In [3]:
#Save tfidf matrix for future use 
#store the content
# with open("x_result.pkl", 'wb') as handle:
#                     pickle.dump(tfidf, handle)

with open('tfidfMatrix.pkl', 'wb') as handle:
    pickle.dump(tf_matrix, handle)

In [3]:
tf_matrix = pickle.load(open('tfidfMatrix.pkl', 'rb'))

In [9]:
#compute cosine similarity of an activity and all other activities and return the number specificed (N) 
#most similar activities provided by user

#Function to find the nearest neighbors based on cosine similarity for items in our list
#Doc_id refers to the index of the item in our data(?) and tfidf matrix
def NearestProjectsCosine(doc_id, num_neighbors = 5):
    similarity_matrix = cosine_similarity(tf_matrix[doc_id:(doc_id+1)], tf_matrix) #get a similarity matrix
    sorted_matrix = np.argsort(similarity_matrix[0]) #sort using np.argsort
    nearest = sorted_matrix[-num_neighbors:] #get the indices with the highest cosine similarity
    print nearest[::-1] #print these

#NearestProjectsCosine(0)

In [15]:
#TODO: Test this new function
# t0 = time()
# vectorizer = TfidfVectorizer(max_df = 0.8, stop_words = 'english', decode_error = 'ignore') #needed for function
# vectorizer.fit(data)
# print "Vectorizer is working in %0.3fs" % (time() - t0)


#Used for a new project that is not in our list
def NewProjectSearch(text, num_neighbors = 5):
    newMatrix = vectorizer.transform([text]) #make new tfidf entry
    similarity_matrix = cosine_similarity(newMatrix, tf_matrix)
    sorted_matrix = np.argsort(similarity_matrix[0])
    nearest = sorted_matrix[-num_neighbors:]
    return nearest[::-1]

activity = "Agricultural projects in Libya taking place from 2003 to 2014"
nums = NewProjectSearch(activity)
print nums
for number in nums:
    print 'Activity ' + str(number)
    print data[number]
    print '\n'
    print '=' * 50
    print '\n'

[471396 429651 124434 540054 124447]
Activity 471396
46002-P-CM-FA0-003	23010 Power DAC 2010-10-15 1 2013-02-25 2 2018-06-30 3 2017-03-04 4 CM 1 2016-12-31 2010-01-01 2186.59 2016-12-31 2016-01-01 0.00 kilometers 1 Length of transmission and distribution lines rehabilitated or installed (km) 1 2016-12-31 2010-01-01 260.38 2016-12-31 2016-01-01 0.00 number 1 Distribution substations and transformers constructed or rehabilitated 1 2016-12-31 2010-01-01 173011.06 2016-12-31 2016-01-01 0.00 Number of people 1 People with a new electricity connection 1 2016-12-31 2010-01-01 2544290.38 2016-12-31 2016-01-01 0.00 Number of people 1 People benefiting from new electricity connections African Development Fund 46003 1 40 The Japan International Cooperation Agency/ Accelerated Cofinancing Facility for Africa JICA ACFA 1 Government Of Cameroon 3 10 ELECTRICITY DEVELOPMENT CORPORATION -EDC 4 en Cameroon - Project to Strengthen and Extend the Electricity Transmission and Distribution Networks fr Came


Original Run: [     0 229910  49041    574 229808]
With Ignore Vectorizer: [587159 195716 195722 195721 195720]

In [38]:
stuff = vectorizer.get_feature_names()
print stuff[249873]

1531310


In [11]:
print data[1]
print '=' * 100
print data[0].split()

US-11-998-70-1037407935	2014-02-11 2 5 USA US-USAGOV 1 10 U.S. Department of State US-11 2 10 U.S. Department of State US-11 3 10 Rp Fed Vendor 4 10 Not Applicable en 1 4 Developing countries, unspecified 1 2014-02-11T00:00:00 0 iati/unitedstates-developing_countries_unspecified_us_department_of_state_3.xml 2016-02-16T23:17:00 http://www.foreignassistance.gov/web/IATI/usg-extension 2.01 US-USAGOV 10 USA Award Description - The description for this item is Administrative Expenses US-11-998-70-1037407935 3 2016-02-16T23:17:00 http://www.foreignassistance.gov General Inquiry 2201 C Street Northwest, Washington, DC 20520 U.S. Department of State +1 202-647-8471 1 foreignassistanceweb@state.gov 2016-02-16T23:17:00 110 C01 iati/unitedstates-developing_countries_unspecified_us_department_of_state_3.xml 1 720 Emergency Response 2 7001 Protection, Assistance and Solutions 99 2014-12-31 2 -179.43 2014-12-31 1143 Migration and Refugee Assistance 19 2014 0 720 Emergency Response 2 7001 Protection,