# Adapt a code to your purposes
The code we have seen requires some customizations in order to work with our data. 

## Print results instead of writing a new file

 * We create a function called **reconcileDBPedia** that requires two parameters 1) a .csv file and 2) a fieldname where to extract strings to be reconciled against DBPedia.
 * We remove the two lines including the *csv.writer* method, which opens (or creates if does not exist) a .csv file, and the *writerow* method, which writes the header of the file (i.e., fieldnames)
 * we change the name of the .csv file including our data with the variable *csvfile* that we'll pass as an argument to the function. We also specify the encoding standard used in our file
 * we change the fieldname containing names of photographers and we use another variable, *fieldname*, that will be passed to the function
 * we print results instead of writing a new file

In [4]:
import requests , urllib.parse , urllib , csv , re 
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz

def reconcileDBPedia(csvfile, fieldname): # def function
    baseURL = 'http://lookup.dbpedia.org/api/search/KeywordSearch?MaxHits=1&QueryString='
    with open(csvfile, encoding='utf-8') as csvfile: # file as parameter, add encoding
        reader = csv.DictReader(csvfile)
        for row in reader:
            name = str(row[fieldname]) # changed fieldname
            nameDirect = name.strip()[name.find(',')+2:]+' '+name[:name.find(',')]
            nameEdited = urllib.parse.quote(name.encode('utf-8').strip()) # urllib.parse.quote
            url = baseURL+nameEdited.strip()
            response = requests.get(url).content
            record = BeautifulSoup(response, "lxml").find('html').find('body').find('arrayofresult').find('result')
            try:
                label = record.find('label').text.encode('utf-8')
                uri = record.find('uri').text
            except:
                label = ''
                uri = ''
            if name.find(',') != -1:
                ratio = fuzz.ratio(nameDirect, label)
                partialRatio = fuzz.partial_ratio(nameDirect, label)
                tokenSort = fuzz.token_sort_ratio(nameDirect, label)
                tokenSet = fuzz.token_set_ratio(nameDirect, label)
            else:
                ratio = fuzz.ratio(name, label)
                partialRatio = fuzz.partial_ratio(name, label)
                tokenSort = fuzz.token_sort_ratio(name, label)
                tokenSet = fuzz.token_set_ratio(name, label)
                nameDirect = 'N/A'
            avg = (ratio+partialRatio+tokenSort+tokenSet)/4
            # print results
            print([name.strip()]+[label]+[ratio]+[partialRatio]+[tokenSort]+[tokenSet]+[avg]+[uri])

In [5]:
reconcileDBPedia('Authority-Photographers.csv', 'AUFN') # it takes a while

['Fratelli Alinari', b'Fratelli Alinari', 91, 100, 94, 100, 96.25, 'http://dbpedia.org/resource/Fratelli_Alinari']
['Olivo Barbieri', b'Olivo Barbieri', 90, 100, 93, 100, 95.75, 'http://dbpedia.org/resource/Olivo_Barbieri']
['Bauer, Robert A.E.', '', 0, 0, 0, 0, 0.0, '']
['Antonio Beato', b'Antonio Beato', 90, 100, 93, 100, 95.75, 'http://dbpedia.org/resource/Antonio_Beato']
['Felice Beato', b'Felice Beato', 89, 100, 92, 100, 95.25, 'http://dbpedia.org/resource/Felice_Beato']
['Giacomo Brogi', b'Giacomo Brogi', 90, 100, 93, 100, 95.75, 'http://dbpedia.org/resource/Giacomo_Brogi']
['Romano Cagnoni', b'Romano Cagnoni', 90, 100, 93, 100, 95.75, 'http://dbpedia.org/resource/Romano_Cagnoni']
['Caprioli', b'Vittorio Caprioli', 57, 100, 59, 100, 79.0, 'http://dbpedia.org/resource/Vittorio_Caprioli']
['Ilario Carposio', b'Ilario Carposio', 91, 100, 94, 100, 96.25, 'http://dbpedia.org/resource/Ilario_Carposio']
['Elio Ciol', b'Elio Ciol', 86, 100, 90, 100, 94.0, 'http://dbpedia.org/resource/Eli

## Distinguish names that have a match and names that do not have a match in results
To highlight matching entities we use an **if statement** on the value of the variable *uri*.

In [10]:
import requests , urllib.parse , urllib , csv , re 
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz

def reconcileDBPedia(csvfile, fieldname): # def function
    baseURL = 'http://lookup.dbpedia.org/api/search/KeywordSearch?MaxHits=1&QueryString='
    with open(csvfile, encoding='utf-8') as csvfile: # file as parameter, add encoding
        reader = csv.DictReader(csvfile)
        for row in reader:
            name = str(row[fieldname]) # changed fieldname
            nameDirect = name.strip()[name.find(',')+2:]+' '+name[:name.find(',')]
            nameEdited = urllib.parse.quote(name.encode('utf-8').strip()) # urllib.parse.quote
            url = baseURL+nameEdited.strip()
            response = requests.get(url).content
            record = BeautifulSoup(response, "lxml").find('html').find('body').find('arrayofresult').find('result')
            try:
                label = record.find('label').text.encode('utf-8')
                uri = record.find('uri').text
            except:
                label = ''
                uri = ''
            if name.find(',') != -1:
                ratio = fuzz.ratio(nameDirect, label)
                partialRatio = fuzz.partial_ratio(nameDirect, label)
                tokenSort = fuzz.token_sort_ratio(nameDirect, label)
                tokenSet = fuzz.token_set_ratio(nameDirect, label)
            else:
                ratio = fuzz.ratio(name, label)
                partialRatio = fuzz.partial_ratio(name, label)
                tokenSort = fuzz.token_sort_ratio(name, label)
                tokenSet = fuzz.token_set_ratio(name, label)
                nameDirect = 'N/A'
            avg = (ratio+partialRatio+tokenSort+tokenSet)/4
            # distinguish results
            if uri != '':
                print('matching URI:', [name.strip().encode('utf-8')]+[nameDirect.encode('utf-8')]+[label]+[ratio]+[partialRatio]+[tokenSort]+[tokenSet]+[avg]+[uri])
            if uri == '':
                print('not matching URI:', [name.strip().encode('utf-8')])

In [11]:
reconcileDBPedia('Authority-Photographers.csv', 'AUFN')

matching URI: [b'Fratelli Alinari', b'N/A', b'Fratelli Alinari', 91, 100, 94, 100, 96.25, 'http://dbpedia.org/resource/Fratelli_Alinari']
matching URI: [b'Olivo Barbieri', b'N/A', b'Olivo Barbieri', 90, 100, 93, 100, 95.75, 'http://dbpedia.org/resource/Olivo_Barbieri']
not matching URI: [b'Bauer, Robert A.E.']
matching URI: [b'Antonio Beato', b'N/A', b'Antonio Beato', 90, 100, 93, 100, 95.75, 'http://dbpedia.org/resource/Antonio_Beato']
matching URI: [b'Felice Beato', b'N/A', b'Felice Beato', 89, 100, 92, 100, 95.25, 'http://dbpedia.org/resource/Felice_Beato']
matching URI: [b'Giacomo Brogi', b'N/A', b'Giacomo Brogi', 90, 100, 93, 100, 95.75, 'http://dbpedia.org/resource/Giacomo_Brogi']
matching URI: [b'Romano Cagnoni', b'N/A', b'Romano Cagnoni', 90, 100, 93, 100, 95.75, 'http://dbpedia.org/resource/Romano_Cagnoni']
matching URI: [b'Caprioli', b'N/A', b'Vittorio Caprioli', 57, 100, 59, 100, 79.0, 'http://dbpedia.org/resource/Vittorio_Caprioli']
matching URI: [b'Ilario Carposio', b'N/A'

# Update the function in order to calculate both recall and precision

To calculate **precision** we need: 
 * the number of all the correct matches (truePositive)
 * the number of retrieved matches (retrievedURIs)

To calculate **recall** we need: 
 * the number of all the correct matches 
 * the number of all the expected results, i.e. the rows of our csv file (rows)
Looking at our results we notice that few of matches are incorrect, e.g. Fumero is not a Cuban American actor...
To confirm that the person we found is exactly who we want, we look into few elements of the retrieved XML for the keyword 'photo'. If the word is found either in the description or in one of the category labels, we should be sure that we found a photographer.

 * We start a counter (*truePositive*) before iterating over the rows of the csv file that will store the number of correct reuslts. Aside this we also declare the counter *retrievedURIs*, that will store the number of all the retrieved matches, regardless they are correct or not, and *rows* that is the length of our .csvfile
 * We use another **try-exception** request: we look into both the elements \['description','label'\] for a precise pattern, i.e., 'photo', that we compile using the module `re`. 
 * We add another variable in the output, called `evidence`. If the pattern is found, `evidence = 'True'`, otherwise `evidence = 'False'`.
 * We increment the counter *retrievedURIs* whenever a match is found
 * We add another **if statement**: only when a match is found and the value of the variable is 'True' we increment the counter *truePositive*
 * we print *precision* and *recall* value by dividing the aforementioned values

In [14]:
import requests , urllib.parse , urllib , csv , re 
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz

def reconcileDBPedia(csvfile, fieldname): # def function
    baseURL = 'http://lookup.dbpedia.org/api/search/KeywordSearch?MaxHits=1&QueryString='
    with open(csvfile, encoding='utf-8') as csvfile: # file as parameter, add encoding
        reader = csv.DictReader(csvfile)
        retrievedURIs, truePositive , rows = 0 , 0 , 0 # start the counter of all the found matches, all the correct matches, all the expected results
        for row in reader:
            rows += 1 # increment the number of rows
            name = str(row[fieldname]) # changed fieldname
            nameDirect = name.strip()[name.find(',')+2:]+' '+name[:name.find(',')]
            nameEdited = urllib.parse.quote(name.encode('utf-8').strip()) # urllib.parse.quote
            url = baseURL+nameEdited.strip()
            response = requests.get(url).content
            record = BeautifulSoup(response, "lxml").find('html').find('body').find('arrayofresult').find('result')
            try:
                label = record.find('label').text.encode('utf-8')
                uri = record.find('uri').text
            except:
                label = ''
                uri = ''
            if name.find(',') != -1:
                ratio = fuzz.ratio(nameDirect, label)
                partialRatio = fuzz.partial_ratio(nameDirect, label)
                tokenSort = fuzz.token_sort_ratio(nameDirect, label)
                tokenSet = fuzz.token_set_ratio(nameDirect, label)
            else:
                ratio = fuzz.ratio(name, label)
                partialRatio = fuzz.partial_ratio(name, label)
                tokenSort = fuzz.token_sort_ratio(name, label)
                tokenSet = fuzz.token_set_ratio(name, label)
                nameDirect = 'N/A'
            avg = (ratio+partialRatio+tokenSort+tokenSet)/4
            
            try:
                evidence = record.find(['description','label'], text = re.compile('photo')).text.encode('utf-8')
                evidence = 'True'
            except:
                evidence = 'False'
                    
            # distinguish results
            if uri != '':
                # add variable evidence
                print('matching URI:', [name.strip().encode('utf-8')]+[nameDirect.encode('utf-8')]+[label]+[ratio]+[partialRatio]+[tokenSort]+[tokenSet]+[avg]+[evidence]+[uri])
                # calculate all the found results and only the correct results
                retrievedURIs += 1
                if evidence == 'True':
                    truePositive += 1
            if uri == '':
                print('not matching URI:', [name.strip().encode('utf-8')])
    # precision is the number of correct results (truePositive) divided by the number of all returned results (retrievedURIs)
    print('precision:', truePositive/retrievedURIs)
    # recall is the number of correct results (truePositive) divided by the number of results that should have been returned (totalRows)
    print('recall:',truePositive/rows)

In [15]:
reconcileDBPedia('Authority-Photographers.csv', 'AUFN')

matching URI: [b'Fratelli Alinari', b'N/A', b'Fratelli Alinari', 91, 100, 94, 100, 96.25, 'True', 'http://dbpedia.org/resource/Fratelli_Alinari']
matching URI: [b'Olivo Barbieri', b'N/A', b'Olivo Barbieri', 90, 100, 93, 100, 95.75, 'True', 'http://dbpedia.org/resource/Olivo_Barbieri']
not matching URI: [b'Bauer, Robert A.E.']
matching URI: [b'Antonio Beato', b'N/A', b'Antonio Beato', 90, 100, 93, 100, 95.75, 'True', 'http://dbpedia.org/resource/Antonio_Beato']
matching URI: [b'Felice Beato', b'N/A', b'Felice Beato', 89, 100, 92, 100, 95.25, 'True', 'http://dbpedia.org/resource/Felice_Beato']
matching URI: [b'Giacomo Brogi', b'N/A', b'Giacomo Brogi', 90, 100, 93, 100, 95.75, 'True', 'http://dbpedia.org/resource/Giacomo_Brogi']
matching URI: [b'Romano Cagnoni', b'N/A', b'Romano Cagnoni', 90, 100, 93, 100, 95.75, 'True', 'http://dbpedia.org/resource/Romano_Cagnoni']
matching URI: [b'Caprioli', b'N/A', b'Vittorio Caprioli', 57, 100, 59, 100, 79.0, 'False', 'http://dbpedia.org/resource/Vitt

# Add a new field DESCRIPTION in results
This is a classic example of data integration. Once we have found similar but richer data, we may want to include some interesting data in ours. We integrate descriptions of photographers.

In the first **try-except** we add the element *description*, providing the path of the element and a default value if the XML does not contain it. Then we add the variable *\[description\]* in the output string. We clean a bit the output for the sake of readability.

In [41]:
import requests , urllib.parse , urllib , csv , re 
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz

def reconcileDBPedia(csvfile, fieldname): # def function
    baseURL = 'http://lookup.dbpedia.org/api/search/KeywordSearch?MaxHits=1&QueryString='
    with open(csvfile, encoding='utf-8') as csvfile: # file as parameter, add encoding
        reader = csv.DictReader(csvfile)
        retrievedURIs, truePositive , rows = 0 , 0 , 0 # start the counter of all the found matches, all the correct matches, all the expected results
        for row in reader:
            rows += 1 # increment the number of rows
            name = str(row[fieldname]) # changed fieldname
            nameDirect = name.strip()[name.find(',')+2:]+' '+name[:name.find(',')]
            nameEdited = urllib.parse.quote(name.encode('utf-8').strip()) # urllib.parse.quote
            url = baseURL+nameEdited.strip()
            response = requests.get(url).content
            record = BeautifulSoup(response, "lxml").find('html').find('body').find('arrayofresult').find('result')
            try:
                label = record.find('label').text.encode('utf-8')
                uri = record.find('uri').text
                description = record.find('description').text
            except:
                label = ''
                uri = ''
                description = 'no description'
            if name.find(',') != -1:
                ratio = fuzz.ratio(nameDirect, label)
                partialRatio = fuzz.partial_ratio(nameDirect, label)
                tokenSort = fuzz.token_sort_ratio(nameDirect, label)
                tokenSet = fuzz.token_set_ratio(nameDirect, label)
            else:
                ratio = fuzz.ratio(name, label)
                partialRatio = fuzz.partial_ratio(name, label)
                tokenSort = fuzz.token_sort_ratio(name, label)
                tokenSet = fuzz.token_set_ratio(name, label)
                nameDirect = 'N/A'
            avg = (ratio+partialRatio+tokenSort+tokenSet)/4
            
            try:
                evidence = record.find(['description','label'], text = re.compile('photo')).text.encode('utf-8')
                evidence = 'True'
            except:
                evidence = 'False'
                    
            # distinguish results
            if uri != '':
                # add variable evidence
                print('MATCH: ', name.strip() , '\nFUZZY RATIO: ' , [ratio]+[partialRatio]+[tokenSort]+[tokenSet]+[avg]+[evidence], '\nURI: ' , uri, '\nDESCRIPTION:' , description)
                # calculate all the found results and only the correct results
                retrievedURIs += 1
                if evidence == 'True':
                    truePositive += 1
            if uri == '':
                print('NOT MATCH:', name.strip())
    # precision is the number of correct results (truePositive) divided by the number of all returned results (retrievedURIs)
    print('\nprecision:', truePositive/retrievedURIs)
    # recall is the number of correct results (truePositive) divided by the number of results that should have been returned (totalRows)
    print('recall:',truePositive/rows)

In [42]:
reconcileDBPedia('Authority-Photographers.csv', 'AUFN')

MATCH:  Fratelli Alinari 
FUZZY RATIO:  [91, 100, 94, 100, 96.25, 'True'] 
URI:  http://dbpedia.org/resource/Fratelli_Alinari 
DESCRIPTION: 
            Fratelli Alinari (now ALINARI 24 ORE SPA) is the world's oldest photographic firm founded in Florence, Italy in 1852. Its archives contains 5.5 million photographs, ranging from daguerreotypes to modern digital photos from around the world.
        
MATCH:  Olivo Barbieri 
FUZZY RATIO:  [90, 100, 93, 100, 95.75, 'True'] 
URI:  http://dbpedia.org/resource/Olivo_Barbieri 
DESCRIPTION: 
            Olivo Barbieri (born 1954 in Carpi, Emilia-Romagna) is an Italian artist and photographer of urban environments. He is recognized for his innovative technique creating miniature still photography from actual landscapes by simulating shallow depth of field via the use of tilt-shift lens photography.
        
NOT MATCH: Bauer, Robert A.E.
MATCH:  Antonio Beato 
FUZZY RATIO:  [90, 100, 93, 100, 95.75, 'True'] 
URI:  http://dbpedia.org/resource/Ant

MATCH:  Enrico Martino 
FUZZY RATIO:  [90, 100, 93, 100, 95.75, 'True'] 
URI:  http://dbpedia.org/resource/Enrico_Martino 
DESCRIPTION: Enrico Martino is an Italian photojournalist.
MATCH:  Tina Modotti 
FUZZY RATIO:  [89, 100, 92, 100, 95.25, 'True'] 
URI:  http://dbpedia.org/resource/Tina_Modotti 
DESCRIPTION: 
            Tina Modotti (August 16 1896 – January 5, 1942) was an Italian photographer, model, actress, and revolutionary political activist.
        
MATCH:  Ugo Mulas 
FUZZY RATIO:  [86, 100, 90, 100, 94.0, 'True'] 
URI:  http://dbpedia.org/resource/Ugo_Mulas 
DESCRIPTION: 
            Ugo Mulas (August 28, 1928 - March 2, 1973) was an Italian photographer noted for his portraits of artists and his street photography. Ugo Mulas began his studies in law in 1948 in Milan, but left to take art courses at the Brera Fine Arts Academy. In 1954 he was asked to cover the Venice Biennale, his first professional assignment. He went on to photograph every Venice Biennale through 1972 