In [1]:
import datetime
import re
from collections import Counter
import traceback
import os

import requests
from bs4 import BeautifulSoup


In [19]:
# Advanced-search entry point of the PLACE website.
URL_SEARCH = 'https://www.marches-publics.gouv.fr/?page=entreprise.EntrepriseAdvancedSearch&AllCons'


# Captures the value of the hidden PRADO_PAGESTATE form field that is posted back on each step.
PAGE_STATE_REGEX = r'<input type="hidden" name="PRADO_PAGESTATE" id="PRADO_PAGESTATE" value="([a-zA-Z0-9/+=]+)"'
# Captures (refConsultation, orgAcronyme) from a consultation details link.
LINK_REGEX = r'^https://www\.marches-publics\.gouv\.fr/\?page=entreprise\.EntrepriseDetailsConsultation&refConsultation=([\d]+)&orgAcronyme=([\da-z]+)$'
# Captures (reference, orgAcronyme) from a "reglement" download link.
# The dot of "index.php" is escaped so that it only matches a literal dot.
REGLEMENT_REGEX = r'/index\.php\?page=entreprise\.EntrepriseDownloadReglement&reference=([a-zA-Z\d]+)&orgAcronyme=([\da-z]+)$'
# Captures the avis identifier from a BOAMP link (with or without the index.php prefix
# and the optional trailing numeric segment).
BOAMP_REGEX = r'^http://www\.boamp\.fr/(?:index\.php/)?avis/detail/([\d-]+)(?:/[\d]+)?$'

In [5]:
from scraper_place import fetch

In [6]:
# Ask the project's fetch helper for the current annonces, limited to nb_pages=2.
links = fetch.fetch_current_annonces(nb_pages=2)

In [7]:
# Display the fetched links.
links

['https://www.marches-publics.gouv.fr/?page=entreprise.EntrepriseDetailsConsultation&refConsultation=426931&orgAcronyme=x7c',
 'https://www.marches-publics.gouv.fr/?page=entreprise.EntrepriseDetailsConsultation&refConsultation=342139&orgAcronyme=a4n',
 'https://www.marches-publics.gouv.fr/?page=entreprise.EntrepriseDetailsConsultation&refConsultation=436933&orgAcronyme=x7c',
 'https://www.marches-publics.gouv.fr/?page=entreprise.EntrepriseDetailsConsultation&refConsultation=310484&orgAcronyme=t9y',
 'https://www.marches-publics.gouv.fr/?page=entreprise.EntrepriseDetailsConsultation&refConsultation=365443&orgAcronyme=s2d',
 'https://www.marches-publics.gouv.fr/?page=entreprise.EntrepriseDetailsConsultation&refConsultation=337330&orgAcronyme=g7h',
 'https://www.marches-publics.gouv.fr/?page=entreprise.EntrepriseDetailsConsultation&refConsultation=339793&orgAcronyme=g7h',
 'https://www.marches-publics.gouv.fr/?page=entreprise.EntrepriseDetailsConsultation&refConsultation=388518&orgAcronym

In [8]:
# Hard-coded consultation link used as the sample for the cells below
# (same shape as the entries of `links`).
link_annonce = 'https://www.marches-publics.gouv.fr/?page=entreprise.EntrepriseDetailsConsultation&refConsultation=453610&orgAcronyme=s2d'

In [9]:

# Pull the consultation id and organisation acronym out of the sample link,
# then rebuild the index.php form of the details URL from them.
link_match = re.match(LINK_REGEX, link_annonce)
annonce_id, org_acronym = link_match.groups()
details_url_template = 'https://www.marches-publics.gouv.fr/index.php?page=entreprise.EntrepriseDetailsConsultation&refConsultation={}&orgAcronyme={}'
url_annonce = details_url_template.format(annonce_id, org_acronym)

# Download the details page and keep its PRADO page state.
response = requests.get(url_annonce)
assert response.status_code == 200
page_state = re.search(PAGE_STATE_REGEX, response.text).group(1)


In [11]:


# Get text data

# Collapse duplicate BOAMP links; going through set() does not keep their order.
unique_boamp = list(set(fetch.extract_links(response, BOAMP_REGEX)))
links_boamp = unique_boamp

# Parse the details page once; the cells below query this tree.
soup = BeautifulSoup(response.text, 'html.parser')

# Summary blocks of the page, selected by their exact class string.
recap_data = soup.find_all(class_="col-md-10 text-justify")


In [12]:

def _recap_field(position, expected_label):
    """Return the stripped <div> text of recap_data[position] after checking its <label>."""
    recap_block = recap_data[position]
    assert recap_block.find('label').text.strip() == expected_label
    return recap_block.find('div').text.strip()


# The first three summary blocks are expected in this fixed order.
reference = _recap_field(0, "Référence :")
intitule = _recap_field(1, "Intitulé :")
objet = _recap_field(2, "Objet :")



In [13]:

# Get links to files

# The page must contain exactly one element with id "pub"; its anchors are the candidates.
publicite_tabs = soup.find_all(id='pub')
assert len(publicite_tabs) == 1
publicite_tab = publicite_tabs[0]
file_links = publicite_tab.find_all('a')

links_reglements = []
links_dces = []
links_avis = []
links_complements = []

for link in file_links:
    # Anchors with no href attribute, or an empty one, have nothing to download.
    # Checked first so the regex below never receives None.
    link_href = link.attrs.get('href')
    if not link_href:
        continue

    # BOAMP links are ignored in this cell.
    if re.match(BOAMP_REGEX, link_href):
        continue

    if 'id' not in link.attrs:
        # "liens directs" (direct links) carry no id
        continue

    link_id = link.attrs['id']

    if link_id == 'linkDownloadReglement':
        links_reglements.append(link_href)
    elif link_id == 'linkDownloadDce':
        links_dces.append(link_href)
    elif link_id == 'linkDownloadAvis':
        links_avis.append(link_href)
    elif link_id == 'linkDownloadComplement':
        links_complements.append(link_href)
    elif link_id == 'linkDownloadDume':
        pass  # "DUME acheteur" does not contain useful information
    else:
        # Fail loudly on ids not seen before so that new link types get noticed.
        raise Exception('Unknown link type {} : {}'.format(link_id, link_href))

# At most one link of each kind is expected, except for avis.
assert len(links_reglements) <= 1
link_reglement = links_reglements[0] if links_reglements else None
assert len(links_dces) <= 1
link_dce = links_dces[0] if links_dces else None
# Several avis can be listed (corrective notices), so uniqueness is deliberately
# not asserted; only the first one is kept.
link_avis = links_avis[0] if links_avis else None
assert len(links_complements) <= 1
link_complement = links_complements[0] if links_complements else None


def write_response_to_file(annonce_id, org_acronym, filename, file_type, response):
    """Placeholder writer: ignores every argument, writes nothing and returns 1."""
    placeholder_result = 1
    return placeholder_result



In [24]:



# Get Dossier de Consultation aux Entreprises

filename_dce = None
file_size_dce = None
if link_dce:
    url_dce = 'https://www.marches-publics.gouv.fr/index.php?page=entreprise.EntrepriseDemandeTelechargementDce&refConsultation={}&orgAcronyme={}'.format(annonce_id, org_acronym)

    # A Session stores the cookies set by the first response and sends them back on
    # the following requests. This replaces copying the raw Set-Cookie header (which
    # also contains cookie attributes, and is missing when no cookie is set) into a
    # Cookie request header by hand.
    with requests.Session() as session:
        # Step 1: load the download request form and read its page state.
        response_dce = session.get(url_dce)
        assert response_dce.status_code == 200
        page_state = re.search(PAGE_STATE_REGEX, response_dce.text).groups()[0]

        # Step 2: post the form back with the "choixAnonyme" radio option selected.
        data = {
            'PRADO_PAGESTATE': page_state,
            'PRADO_POSTBACK_TARGET': 'ctl0$CONTENU_PAGE$validateButton',
            'ctl0$CONTENU_PAGE$EntrepriseFormulaireDemande$RadioGroup': 'ctl0$CONTENU_PAGE$EntrepriseFormulaireDemande$choixAnonyme',
        }
        response_dce2 = session.post(url_dce, data=data)
        assert response_dce2.status_code == 200
        page_state = re.search(PAGE_STATE_REGEX, response_dce2.text).groups()[0]

        # Step 3: trigger the download itself, using the page state of step 2.
        data = {
            'PRADO_PAGESTATE': page_state,
            'PRADO_POSTBACK_TARGET': 'ctl0$CONTENU_PAGE$EntrepriseDownloadDce$completeDownload',
        }
        # stream=True: the body is read in chunks by write_response_to_file.
        response_dce3 = session.post(url_dce, data=data, stream=True)
        assert response_dce3.status_code == 200

        # The answer must be a zip attachment; its name comes from Content-Disposition.
        content_type = response_dce3.headers['Content-Type']
        assert content_type == 'application/zip', content_type
        regex_attachment = r'^attachment; filename="([^"]+)";$'
        filename_dce = re.match(regex_attachment, response_dce3.headers['Content-Disposition']).groups()[0]

        # Written while the session is still open, because the body is streamed.
        file_size_dce = write_response_to_file(annonce_id, org_acronym, filename_dce, 'dce', response_dce3)


In [26]:
# Summary blocks of the page, selected by their exact class string.
recap_data = soup.find_all(class_="col-md-10 text-justify")

# Each block is checked against its expected label before its value is read.
reference_block = recap_data[0]
assert reference_block.find('label').text.strip() == "Référence :"
reference = reference_block.find('div').text.strip()

intitule_block = recap_data[1]
assert intitule_block.find('label').text.strip() == "Intitulé :"
intitule = intitule_block.find('div').text.strip()

objet_block = recap_data[2]
assert objet_block.find('label').text.strip() == "Objet :"
objet = objet_block.find('div').text.strip()


In [None]:

# Get links to files

# The page must contain exactly one element with id "pub"; its anchors are the candidates.
publicite_tabs = soup.find_all(id='pub')
assert len(publicite_tabs) == 1
publicite_tab = publicite_tabs[0]
file_links = publicite_tab.find_all('a')

links_reglements = []
links_dces = []
links_avis = []
links_complements = []

for link in file_links:
    # Anchors with no href attribute, or an empty one, have nothing to download.
    link_href = link.attrs.get('href')
    if not link_href:
        continue

    # BOAMP links are ignored in this cell.
    if re.match(BOAMP_REGEX, link_href):
        continue

    # Anchors without an id ("liens directs") cannot be classified; skip them
    # instead of failing with a KeyError.
    link_id = link.attrs.get('id')
    if link_id is None:
        continue

    if link_id == 'linkDownloadReglement':
        links_reglements.append(link_href)
    elif link_id == 'linkDownloadDce':
        links_dces.append(link_href)
    elif link_id == 'linkDownloadAvis':
        links_avis.append(link_href)
    elif link_id == 'linkDownloadComplement':
        links_complements.append(link_href)
    elif link_id == 'linkDownloadDume':
        pass  # "DUME acheteur" does not contain useful information
    else:
        # Fail loudly on ids not seen before so that new link types get noticed.
        raise Exception('Unknown link type {} : {}'.format(link_id, link_href))

# At most one link of each kind is expected, except for avis.
assert len(links_reglements) <= 1
link_reglement = links_reglements[0] if links_reglements else None
assert len(links_dces) <= 1
link_dce = links_dces[0] if links_dces else None
# Several avis can be listed (corrective notices), so uniqueness is not asserted;
# only the first one is kept.
link_avis = links_avis[0] if links_avis else None
assert len(links_complements) <= 1
link_complement = links_complements[0] if links_complements else None


def write_response_to_file(annonce_id, org_acronym, filename, file_type, response):
    """Stream the body of `response` to a local file and return its size in bytes.

    Args:
        annonce_id, org_acronym, filename, file_type: forwarded unchanged to
            build_internal_filepath, which decides where the file is written.
        response: object exposing iter_content(chunk_size), e.g. a requests
            response obtained with stream=True.

    Returns:
        The total number of bytes written, so callers can record the file size.
    """
    # NOTE(review): build_internal_filepath is not defined in any visible cell (the
    # notebook output records a NameError for it); it must be defined or imported
    # before this function is called.
    internal_filepath = build_internal_filepath(annonce_id, org_acronym, filename, file_type)
    bytes_written = 0
    with open(internal_filepath, 'wb') as file_object:
        for chunk in response.iter_content(8192):
            # write() on a binary file returns the number of bytes written.
            bytes_written += file_object.write(chunk)
    return bytes_written


In [22]:
# BOAMP avis URL pattern; group 1 captures the avis identifier.
BOAMP_REGEX = r'^http://www\.boamp\.fr/(?:index\.php/)?avis/detail/([\d-]+)(?:/[\d]+)?$'

# Try the pattern on both URL shapes: with a trailing numeric segment, and with
# the index.php prefix. The cell displays the identifier captured from each.
tuple(
    re.match(BOAMP_REGEX, sample_url).groups()[0]
    for sample_url in (
        'http://www.boamp.fr/avis/detail/18-57381/0',
        'http://www.boamp.fr/index.php/avis/detail/18-57381',
    )
)


('18-57381', '18-57381')

In [13]:


# Get avis

if link_avis:
    # link_avis is appended to the site root as-is to build the download URL.
    avis_url = 'https://www.marches-publics.gouv.fr{}'.format(link_avis)
    response_avis = requests.get(avis_url, stream=True)
    assert response_avis.status_code == 200

    # Only PDF answers are accepted.
    content_type = response_avis.headers['Content-Type']
    assert content_type in {'application/pdf'}, content_type

    # The file name is taken from the Content-Disposition header.
    regex_attachment = r'^attachment; filename="([^"]+)";'
    disposition_match = re.match(regex_attachment, response_avis.headers['Content-Disposition'])
    filename_avis = disposition_match.group(1)

    write_response_to_file(annonce_id, org_acronym, filename_avis, 'avis', response_avis)
else:
    # No avis link on the page: nothing to download.
    filename_avis = None



NameError: name 'build_internal_filepath' is not defined

In [14]:
# Display the file name extracted from the avis Content-Disposition header.
filename_avis

'Accus_ de reception - 18-55788.pdf'

In [55]:
# Locate the single element with id "pub" and list its anchors.
publicite_tabs = soup.find_all(id='pub')
assert len(publicite_tabs) == 1
publicite_tab = publicite_tabs[0]
file_links = publicite_tab.find_all('a')

# Accumulators filled by the classification loop in the next cell.
links_reglements = []
links_dces = []

In [56]:

# Link back to the consultation itself; it is not a file and is skipped below.
consultation_url = 'https://www.marches-publics.gouv.fr/app.php/consultation/{}'.format(annonce_id)

for link in file_links:
    link_id = link.attrs['id']
    link_href = link.attrs['href']

    # BOAMP links are ignored in this cell.
    if re.match(BOAMP_REGEX, link_href):
        continue
    if link_href == consultation_url:
        continue

    if link_id == 'linkDownloadReglement':
        links_reglements.append(link_href)
    elif link_id == 'linkDownloadDce':
        links_dces.append(link_href)
    else:
        # The message previously had one placeholder for two arguments, so the
        # href was silently dropped from the error text.
        raise Exception('Unknown link type {} : {}'.format(link_id, link_href))


Exception: Unknown link type linkDownloadComplement : 

In [58]:
# Display the anchors collected from the "pub" element.
file_links

[<a href="https://www.marches-publics.gouv.fr/index.php?page=entreprise.EntrepriseDemandeTelechargementDce&amp;refConsultation=372248&amp;orgAcronyme=a4n" id="linkDownloadDce">
                                         Dossier de consultation - 822,07 Ko
                                     </a>,
 <a href="" id="linkDownloadComplement">
                                     En savoir plus sur la consultation - 0 Ko
                                 </a>]

In [45]:
# Display the second link block.
# NOTE(review): link_blocks is not assigned in any visible cell; this relies on
# kernel state from a cell that is not shown here.
link_blocks[1]

<div class="bloc clearfix">
<div class="bloc-title h3">
<label class="p-l-0" for="mps">
<strong>Candidature via le dispositif MPS (Marché Public Simplifié)</strong>
<img alt="Cette consultation est un Marché Public Simplifié" class="inline-img" src="https://www.marches-publics.gouv.fr/bundles/atexoconsultationdepot/assets/images/logo-mps-small.png?k=5ad7ac848f7ab9.24989714"/>
</label>
</div>
<div>Ce mode de candidature permet de répondre à la consultation en complétant en ligne un formulaire de candidature simplifié pré-rempli, avant de joindre son offre sans nécessairement la signer.</div>
<a class="arrow-link arrow-left" href="http://www.modernisation.gouv.fr/les-services-publics-se-simplifient-et-innovent/par-des-simplifications-pour-les-entreprises/marche-public-simplifie" target="_blank" title="Lien externe">
                            En savoir plus
                        </a>
</div>

In [23]:

# Exactly one link block is expected; its anchors are classified below.
assert len(link_blocks) == 1
link_block = link_blocks[0]
file_links = link_block.find_all('a')

reglements = []
dces = []
complements = []

# Maps each known anchor id to the list that collects its hrefs.
links_by_id = {
    'linkDownloadReglement': reglements,
    'linkDownloadDce': dces,
    'linkDownloadComplement': complements,
}

for link in file_links:
    link_id = link.attrs['id']
    link_href = link.attrs['href']

    # BOAMP links are only printed, not collected.
    if re.match(BOAMP_REGEX, link_href):
        print('boamp:')
        print(link_href)
        continue

    if link_id not in links_by_id:
        raise Exception('Unknown link type {}'.format(link_id))
    links_by_id[link_id].append(link_href)


reglements, dces, complements

boamp:
http://www.boamp.fr/index.php/avis/detail/17-182893


(['https://www.marches-publics.gouv.fr/index.php?page=entreprise.EntrepriseDownloadReglement&reference=MzQyMTM5&orgAcronyme=a4n'],
 ['https://www.marches-publics.gouv.fr/index.php?page=entreprise.EntrepriseDemandeTelechargementDce&refConsultation=342139&orgAcronyme=a4n'],
 [])

In [31]:
# Display the first reglement link, or None when the list is empty.
reglements[0] if reglements else None

'https://www.marches-publics.gouv.fr/index.php?page=entreprise.EntrepriseDownloadReglement&reference=MzQyMTM5&orgAcronyme=a4n'

In [24]:

# Build the DCE download-request URL for the current consultation.
dce_url_template = 'https://www.marches-publics.gouv.fr/index.php?page=entreprise.EntrepriseDemandeTelechargementDce&refConsultation={}&orgAcronyme={}'
url_dce = dce_url_template.format(annonce_id, org_acronym)

# Load the request form and keep its PRADO page state for the postback.
response_dce = requests.get(url_dce)
assert response_dce.status_code == 200
page_state = re.search(PAGE_STATE_REGEX, response_dce.text).group(1)


In [26]:

# First postback: validate the form with the "choixAnonyme" radio option selected.
validate_form = {
    'PRADO_PAGESTATE': page_state,
    'PRADO_POSTBACK_TARGET': 'ctl0$CONTENU_PAGE$validateButton',
    'ctl0$CONTENU_PAGE$EntrepriseFormulaireDemande$RadioGroup': 'ctl0$CONTENU_PAGE$EntrepriseFormulaireDemande$choixAnonyme',
}
response_dce2 = requests.post(url_dce, data=validate_form)
assert response_dce2.status_code == 200
# The next postback must carry the page state of this answer.
page_state = re.search(PAGE_STATE_REGEX, response_dce2.text).group(1)

# Second postback: trigger the download; the body is streamed.
download_form = {
    'PRADO_PAGESTATE': page_state,
    'PRADO_POSTBACK_TARGET': 'ctl0$CONTENU_PAGE$EntrepriseDownloadDce$completeDownload',
}
response_dce3 = requests.post(url_dce, data=download_form, stream=True)
assert response_dce3.status_code == 200

# The answer must be a zip attachment; its name comes from Content-Disposition.
content_type = response_dce3.headers['Content-Type']
assert content_type == 'application/zip', content_type
regex_attachment = r'^attachment; filename="([^"]+)";$'
disposition_header = response_dce3.headers['Content-Disposition']
filename_dce = re.match(regex_attachment, disposition_header).group(1)


In [27]:
# Display the file name extracted from the DCE Content-Disposition header.
filename_dce

'DCE-DAE_2017_SAD_INFRA-16012018.zip'