In [1]:
#This is needed to start a Spark session from the notebook
#You may adjust the memory used by the driver program based on your machine's settings
import os 
os.environ['PYSPARK_SUBMIT_ARGS'] ="--conf spark.driver.memory=4g  pyspark-shell"

from pyspark.sql import SparkSession

In [3]:
# -------------------------------
# Start Spark in LOCAL mode
# -------------------------------

#The following lines are just there to allow this cell to be re-executed multiple times:
#if a spark session was already started, we stop it before starting a new one
#(there can be only one spark context per jupyter notebook)
try: 
    spark
    print("Spark application already started. Terminating existing application and starting new one")
    spark.stop()
except: 
    pass

# Create a new spark session (note, the * indicates to use all available CPU cores)
spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("demoRDD") \
    .getOrCreate()
    
#When dealing with RDDs, we work the sparkContext object. See https://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.SparkContext
sc=spark.sparkContext

Spark application already started. Terminating existing application and starting new one


# Import packages, files

In [4]:
import pandas as pd
import json
inputPath = './data/'

In [6]:
NewOrganization = sc.textFile(inputPath + 'NewOrganization').map(json.loads)

# orgDF

In [7]:
orgDF = pd.DataFrame(NewOrganization.collect())

                                                                                

# Prefix

In [7]:
prefix = []
for i in range(len(orgDF)):
    prefix.append(orgDF['id'].iloc[i][0:15])
    
orgDF['prefix'] = prefix

# Remove rows with NaN pid

In [8]:
orgDF.dropna(subset = ['pid'], inplace = True)
orgDF.reset_index(inplace = True)
orgDF.drop(columns = 'index', inplace = True)

# Extract country's name

In [9]:
orgDF['country'] = orgDF['country'].fillna('unknown')


In [10]:
def getCountry(k):
   return [orgDF['country'][k]['label'] for j in range(len(orgDF['country'][k]))]


countryName = [orgDF['country'][k]['label']  if type(orgDF['country'][k]) == dict else orgDF['country'][k] for k in range(len(orgDF))] 



In [11]:
countryName1 = []

for i in range(len(countryName)):
    countryName1.append(countryName[i].lower())

orgDF['country name'] = countryName1


In [12]:

orgDF = orgDF[['id','prefix','legalname','legalshortname','alternativenames', 'websiteurl', 'country name','pid']]

# Alter lower()

In [13]:
altern = [] 

for i in range(len(orgDF)):
    xi = []
    for x in orgDF['alternativenames'].iloc[i]:
        xi.append(x.lower())
    altern.append(xi)
        
orgDF.loc[:,'alternativenames'] = altern

# Label pid y/n

In [14]:
label = [] 
for i in range(len(orgDF)):
    if  orgDF['pid'].iloc[i] == []:
        label.append('N')
    else:
        label.append('Y')
        
orgDF['pid (Y/N)'] = label

# Legalnames

In [15]:
orgDF['legalname'] = orgDF['legalname'].fillna('unknown')

legalnameLow = [s.lower() for s in orgDF['legalname']]

orgDF['legalname'] = legalnameLow

# pidDF: Dataframe with organizations having pid

In [16]:
PID = [i for i in range(len(orgDF)) if orgDF['pid (Y/N)'].iloc[i] == 'Y']

pidDF = orgDF.iloc[PID]
pidDF.reset_index(inplace = True)
pidDF = pidDF.drop(columns = 'index')

# Add legalnames to alternative names if missing

In [17]:
for i in range(len(pidDF)):
    if pidDF['legalname'].iloc[i] not in pidDF['alternativenames'].iloc[i]:
        (pidDF['alternativenames'].iloc[i]).append(pidDF['legalname'].iloc[i])

# openOrgDF: Dataframe with organizations having a '20|openorgs____' prefix

In [30]:
openOrg = [i for i in range(len(pidDF)) if pidDF.prefix.iloc[i] == '20|openorgs____']

openOrgDF = pidDF.iloc[openOrg]
openOrgDF.reset_index(inplace = True)

# Create dictionaries

In [31]:
def createDix(df):
    allnames = [df['alternativenames'][i] for i in range(len(df))]
    id = [df['id'][i] for i in range(len(df))]
    dix = {allnames[i][j]: id[i] for i in range(len(allnames)) for j in range(len(allnames[i]))}
    return dix


In [32]:
dixPidAll = createDix(pidDF)  # all organizations with a pid

In [33]:
dixOpenOrg = createDix(openOrgDF) # organizations with prefix '20|openorgs____'

# Find organizations

In [34]:
def is_contained(s, w):
    words = s.split() 
    for word in words:
        if word not in w:  
            return False  
    return True  

In [35]:
def findName(name, dix):
    lnames = []
    for x in list(dix.keys()):
        if is_contained(name.lower(), x):
            lnames.append(x)
    return lnames

## Example

In [36]:
findName('University of Athens', dixOpenOrg)

['agricultural university of athens',
 'national technical university of athens',
 'athens university of economics and business',
 'national and kapodistrian university of athens',
 'university of athens']

In [37]:
for x in findName('university of athens', dixOpenOrg):
    display(openOrgDF[openOrgDF['legalname'] == x])

    

Unnamed: 0,index,id,prefix,legalname,legalshortname,alternativenames,websiteurl,country name,pid,pid (Y/N)
13531,23382,20|openorgs____::15ee7f73eb676be60ebc7629e42c7bf9,20|openorgs____,agricultural university of athens,AUA,"[agricultural university of athens, université...",http://www.aua.gr/index.php,greece,"[{'scheme': 'ISNI', 'value': '0000 0001 0794 1...",Y


Unnamed: 0,index,id,prefix,legalname,legalshortname,alternativenames,websiteurl,country name,pid,pid (Y/N)
52543,90211,20|openorgs____::c2e60d624c889b6217c6fc18d811fd72,20|openorgs____,national technical university of athens,NTUA,"[athens polytechnic, école polytechnique d'ath...",http://www.ntua.gr/index_en.html,greece,"[{'scheme': 'ISNI', 'value': '0000 0001 2185 9...",Y


Unnamed: 0,index,id,prefix,legalname,legalshortname,alternativenames,websiteurl,country name,pid,pid (Y/N)
91182,157257,20|openorgs____::2e07a0cb439178cc32c7dcb5912554fe,20|openorgs____,athens university of economics and business,OPA,"[athens university of economics and business, ...",http://www.aueb.gr/index_en.php,greece,"[{'scheme': 'ISNI', 'value': '0000 0001 2179 8...",Y


Unnamed: 0,index,id,prefix,legalname,legalshortname,alternativenames,websiteurl,country name,pid,pid (Y/N)
97442,168002,20|openorgs____::c0286313e36479eff8676dba9b724b40,20|openorgs____,national and kapodistrian university of athens,NKUA,[national and kapodistrian university of athen...,http://en.uoa.gr/,greece,"[{'scheme': 'ISNI', 'value': '0000 0001 2155 0...",Y


Unnamed: 0,index,id,prefix,legalname,legalshortname,alternativenames,websiteurl,country name,pid,pid (Y/N)
