Functions and code to download files from the FTP, unzip the tarballs, and then load and parse the newly downloaded files. Relies on TED not using a different naming scheme for the contents of the tarballs.

It would be better to keep a log of the files which have already been downloaded, but this is quick and easy for now.

In [9]:
from ftplib import FTP
import datetime
import os
import tarfile
import wget
import json
import os
import collections
import io
import numpy as np
import xmltodict
import pandas as pd
import urllib.request
import json

In [10]:
# Function download_files:
# FTPs to ftp_path, gets list of files in the directory for the current year and month (note that this may cause
# problems on the first day of the month downloading the files from the last day of the previous month); makes a list 
# of files to be download. Then downloads the files with wget (ftp was not downloading the entire file); unzips the 
# tarballs and then deletes the tar.gz file.
# 
# Params:
# - data_path -> path to download data to
# - ftp_path -> URI for FTP
# - username, password -> username and password for FTP login
# - year, month -> year and month to download data for, if None will use current month and year
# - max_files -> max number of files to download, useful for debuggin
# - delete_files -> whether to delete the files that have been successfully extracted
#
# Returns:
# - list of files downloaded and successfully extracted
#
# Note that sometimes using the URL throws an error, in this case use the IP address: 91.250.107.123

def download_files(data_path="data", ftp_path="ted.europa.edu", username="guest", password="guest", year=None, month=None, max_files=None, delete_files=True):
    ## USE FTP TO GET THE LIST OF FILES TO DOWNLOAD
    with FTP(ftp_path, user=username, passwd=password) as ftp:
        # create the directory name for the current month and year
        # we may want to do this for yesterday 
        now = datetime.datetime.now()
        if year is None:
            year = str(now.year)
        if month is None:
            month = datetime.datetime.now().strftime('%m')

        # go to that directory and get the files in it
        ftp.cwd('daily-packages/' + year + "/" + month) 
        dir_list = ftp.nlst() 
        files_to_download = []

        # loop through the files
        for file in dir_list:
            file_name = file.split(".")[0]
            end = file_name.split("_")
            end[1] = end[1][4:]
            dir_name = "_".join(end)

            # if the directory does not exist we should download the file
            if not os.path.exists(os.path.join(data_path, dir_name)):
                # download the file with wget since ftplib seems to only download a small part of the file
                file_path = "ftp://"+username+":"+password+"@" + ftp_path + "/daily-packages/" + year + "/" + month + "/" + file
                files_to_download.append(file_path)

    # download the files with wget so we can download the entire file without errors            
    downloaded_files = []
    for i, file in enumerate(files_to_download):
        try:
            print("Downloading", file)
            d_file = wget.download(file, data_path)
            downloaded_files.append(d_file)
        except:
            print("Error downloading", file)
        
        if max_files is not None and i > max_files:
            break
    
    extracted_files = []
    # extract the tarballs
    for file in downloaded_files:
        print("\nExtracting:", file)
        try:
            if (file.endswith("tar.gz")):
                tar = tarfile.open(file, "r:gz")
                tar.extractall(data_path)
                tar.close()
            elif (file.endswith("tar")):
                tar = tarfile.open(file, "r:")
                tar.extractall()
                tar.close()
            
            extracted_files.append(file)
            if delete_files:
                # if everything was properly extracted we can delete the file
                os.remove(file)
            
        except:
            print("Error extracting", file)
            
    return extracted_files

In [11]:
# convert all currencies to EUR
def convert_currencies(values, currencies):
    url = "https://api.exchangeratesapi.io/latest"
    content = urllib.request.urlopen(url).read()
    exchange_rates = json.loads(content.decode())
    results = []
    
    for value, currency in zip(values, currencies):
        if currency == "EUR":
            results.append(value)
            
        else:
            try:
                exchange_rate = exchange_rates['rates'][currency]
                converted_value = float(value) / exchange_rate
                results.append(converted_value)
            # if we don't have a rate for the currency use NaN
            except:
                results.append(np.nan)
                
    return results

def unwind_descriptions(short_desc):
    # get the text from the OrderedDicts in the short descriptions
    for i, foo in enumerate(short_desc):
        if type(foo) != str:
            if type(foo) == list:
                for j, bar in enumerate(foo):
                    if type(bar) == collections.OrderedDict:
                        bar = bar['#text']
                        short_desc[i][j] = bar
            elif type(foo) == collections.OrderedDict:
                foo = foo['#text']
                short_desc[i] = foo

    # flatten the lists
    for i, foo in enumerate(short_desc):
        if type(foo) == list:
            foo = " ".join(foo)
            short_desc[i] = foo
            
    return short_desc

# function to recursively extract data from XML files
def extract_xml(xml_dict, parent_key="", results_dict={}):
    # make sure the input is a an ordered dictionary
    if isinstance(xml_dict, collections.OrderedDict):
        for key1, value1 in xml_dict.items():
            # remove unneeded characters from the key
            if key1[0] == "@" or key1[0] == "#":
                key1 = key1[1:]
                
            # add the parent key for clarity
            if len(parent_key):
                # if the current key is text we will not append it to the parent
                if key1 != "text":
                    new_key = parent_key + "_" + key1
                else:
                    new_key = parent_key
            else:
                new_key = key1
            
            # if the value is a string directly add it
            if isinstance(value1, str):
                # if the key is "P" the value is a new paragraph and should be appended not overwritten
                if key1 != "P":
                    # if the key does NOT exist add it
                    if new_key not in results_dict:
                        results_dict[new_key] = value1
                    # else instead of overwriting the data let's make a list of the values
                    else:
                        if isinstance(results_dict[new_key], list):
                            results_dict[new_key].append(value1)
                        elif isinstance(results_dict[new_key], str):
                            results_dict[new_key] = [results_dict[new_key]]
                            results_dict[new_key].append(value1)
                else:
                    if parent_key in results_dict:
                        results_dict[parent_key] += " " + value1
                    else:
                        results_dict[parent_key] = value1
            
            # else if it is a list loop through and add the items
            # note that this will overwrite the previous entries
            elif isinstance(value1, list):
                for item in value1:
                    results_dict = extract_xml(item, new_key, results_dict)
                    
            # else if the value is an OrderedDict recurse
            elif isinstance(value1, collections.OrderedDict):
                results_dict = extract_xml(value1, new_key, results_dict)
                
    elif isinstance(xml_dict, str):
        results_dict[parent_key] = xml_dict
    
    elif isinstance(xml_dict, list):
        pass
        
    return results_dict

In [12]:
data_path = "./data"

# list of EU country codes
EU_CODES = ["BE", "BG", "CZ", "DK", "DE", "EE", "IE", "EL", "ES", "FR", "HR", "IT", "CY", "LV", "LT", "LU", "HU", "MT", "NL", "AT", "PL", "PT", "RO", "SI", "SK", "FI", "SE", "UK"]

def load_data(files, language="EN", max_dirs=None):
    parsed_xmls = []
    
    language_tenders = []
    all_tenders = []
    
    # clean the file names
    dir_list = []
    for file in files:
        file_name = file.split(".")[0]
        file_array = file_name.split("/")
        file_name = file_array[1]
        split_file_name = file_name.split("_")
        # remove the year from the second part of the split file name
        split_file_name[1] = split_file_name[1][4:]
        dir_name = "_".join(split_file_name)
        dir_list.append(dir_name)
        
    # loop through the files
    for dir_ in dir_list:
        files = os.listdir(os.path.join(data_path, dir_))
        date = dir_.split("_")[0]
        for file in files:
            # read the contents of the file
            with io.open(os.path.join(data_path, dir_, file), 'r', encoding="utf-8") as f:
                xml = f.read()
                parsed_xml = xmltodict.parse(xml)
                parsed_xmls.append(parsed_xml)

                # get some header info
                forms_section = parsed_xml['TED_EXPORT']['FORM_SECTION']
                notice_data = parsed_xml['TED_EXPORT']['CODED_DATA_SECTION']['NOTICE_DATA']

                header_info = {}
                header_info['DATE'] =  parsed_xml['TED_EXPORT']['CODED_DATA_SECTION']['REF_OJS']['DATE_PUB']
                header_info['FILE'] = file
                # extract the info from the codified data section
                header_info = extract_xml(parsed_xml['TED_EXPORT']['CODED_DATA_SECTION']['CODIF_DATA'], "", header_info)

                # extract the info from the notice_data section, except we don't need the URI_LIST
                notice_data.pop("URI_LIST")
                header_info = extract_xml(notice_data, "", header_info)

                if isinstance(notice_data['ORIGINAL_CPV'], list):
                    header_info['ORIGINAL_CPV_CODE'] = []
                    header_info['ORIGINAL_CPV_TEXT'] = []
                    for cpv_info in notice_data['ORIGINAL_CPV']:
                        header_info['ORIGINAL_CPV_CODE'].append(cpv_info['@CODE'])
                        header_info['ORIGINAL_CPV_TEXT'].append(cpv_info['#text'])
                else:
                    header_info['ORIGINAL_CPV_CODE'] = notice_data['ORIGINAL_CPV']['@CODE']
                    header_info['ORIGINAL_CPV_TEXT'] = notice_data['ORIGINAL_CPV']['#text']

                try:
                    header_info['VALUE'] = notice_data['VALUES']['VALUE']['#text']
                    header_info['VALUE_CURR'] = notice_data['VALUES']['VALUE']['@CURRENCY']
                    header_info['REF_NO'] = notice_data['REF_NOTICE']['NO_DOC_OJS']
                except:
                    header_info['VALUE'] = ""
                    header_info['VALUE_CURR'] = ""
                    header_info['REF_NO'] = ""

                forms = forms_section.keys()

                for form in forms:
                    try:
                        form_contents = forms_section[form]

                        if isinstance(form_contents, list):
                            for i, form_content in enumerate(form_contents):
                                all_tenders.append((header_info, form_content))
                                if language is not None and form_content['@LG'] == language:
                                    language_tenders.append((header_info, form_content))
                        elif isinstance(form_contents, collections.OrderedDict):
                            all_tenders.append((header_info, form_contents))
                            if language is not None and form_contents['@LG'] == language:
                                language_tenders.append((header_info, form_contents))
                    except Exception as e:
                        print("File 1", file, e)

    if language == None:
        language_tenders = all_tenders
    
    parsed_data = []
    
    for (header, tender) in language_tenders:
        flattened = {}
        
        # add some fields
        for key in header.keys():
            flattened[key] = header[key]
        
        flattened = extract_xml(tender, "", flattened)
        parsed_data.append(flattened)

    df = pd.DataFrame(parsed_data)
        
    # convert Currencies to Euros
    df['VALUE_EUR'] = convert_currencies(df['VALUE'].values, df['VALUE_CURR'].values)
    
    return df

In [13]:
def load_new_files(data_path="data", ftp_path="91.250.107.123", username="guest", password="guest", year=None, month=None, max_files=None, delete_files=True):
    new_files = download_files(data_path=data_path, ftp_path=ftp_path, username=username, password=password, year=year, month=month, max_files=max_files, delete_files=delete_files)
    
    df = load_data(new_files)
    
    return df

In [14]:
load_new_files(year="2018", month="01", max_files=1)

Downloading ftp://guest:guest@91.250.107.123/daily-packages/2018/01/20180126_2018018.tar.gz
100% [..........................................................................] 7067424 / 7067424Downloading ftp://guest:guest@91.250.107.123/daily-packages/2018/01/20180111_2018007.tar.gz
100% [..........................................................................] 6620219 / 6620219
Extracting: data/20180126_2018018.tar.gz

Extracting: data/20180111_2018007.tar.gz


Unnamed: 0,AA_AUTHORITY_TYPE,AA_AUTHORITY_TYPE_CODE,AC_AWARD_CRIT,AC_AWARD_CRIT_CODE,AWARD_CONTRACT_AWARDED_CONTRACT_CONTRACTORS_CONTRACTOR_ADDRESS_CONTRACTOR_ADDRESS,AWARD_CONTRACT_AWARDED_CONTRACT_CONTRACTORS_CONTRACTOR_ADDRESS_CONTRACTOR_COUNTRY_VALUE,AWARD_CONTRACT_AWARDED_CONTRACT_CONTRACTORS_CONTRACTOR_ADDRESS_CONTRACTOR_E_MAIL,AWARD_CONTRACT_AWARDED_CONTRACT_CONTRACTORS_CONTRACTOR_ADDRESS_CONTRACTOR_FAX,AWARD_CONTRACT_AWARDED_CONTRACT_CONTRACTORS_CONTRACTOR_ADDRESS_CONTRACTOR_NATIONALID,AWARD_CONTRACT_AWARDED_CONTRACT_CONTRACTORS_CONTRACTOR_ADDRESS_CONTRACTOR_OFFICIALNAME,...,VALUES_VALUE_TYPE,VALUE_CURR,VERSION,n2016:CA_CE_NUTS,n2016:CA_CE_NUTS_CODE,n2016:PERFORMANCE_NUTS,n2016:PERFORMANCE_NUTS_CODE,n2016:TENDERER_NUTS,n2016:TENDERER_NUTS_CODE,VALUE_EUR
0,European Institution/Agency or International O...,5,The most economic tender,2,,,,,,,...,ESTIMATED_TOTAL,,,Budapest,HU110,Budapest,HU110,,,
1,European Institution/Agency or International O...,5,The most economic tender,2,,,,,,,...,PROCUREMENT_TOTAL,,,Arr. de Bruxelles-Capitale / Arr. van Brussel-...,BE100,BELGIQUE-BELGIË,BE,"[Friesland (NL), Flevoland]","[NL12, NL23]",
2,European Institution/Agency or International O...,5,Not specified,Z,,,,,,,...,,,R2.0.8.S03.E01,,,,,,,
3,European Institution/Agency or International O...,5,Not specified,Z,,,,,,,...,,,R2.0.8.S03.E01,,,,,,,
4,European Institution/Agency or International O...,5,Not specified,Z,,,,,,,...,,,R2.0.8.S03.E01,,,,,,,
5,European Institution/Agency or International O...,5,Not specified,Z,,,,,,,...,,,R2.0.8.S03.E01,,,,,,,
6,European Institution/Agency or International O...,5,Not specified,Z,,,,,,,...,,,R2.0.8.S03.E01,,,,,,,
7,European Institution/Agency or International O...,5,Not specified,Z,,,,,,,...,,,R2.0.8.S03.E01,,,,,,,
8,European Institution/Agency or International O...,5,Not specified,Z,,,,,,,...,,,R2.0.8.S03.E01,,,,,,,
9,European Institution/Agency or International O...,5,Not specified,Z,,,,,,,...,,,R2.0.8.S03.E01,,,,,,,
