In [1]:
import re

import requests
from bs4 import BeautifulSoup

In [2]:
# Advanced-search page of marches-publics.gouv.fr. `init` GETs it once and
# every pagination postback (`init`, `next_page`) is POSTed to this same URL.
url = 'https://www.marches-publics.gouv.fr/?page=entreprise.EntrepriseAdvancedSearch&AllCons'

In [120]:
# Captures the value of the hidden PRADO_PAGESTATE <input>; that value has to be
# sent back with every postback.
page_state_regex = '<input type="hidden" name="PRADO_PAGESTATE" id="PRADO_PAGESTATE" value="([a-zA-Z0-9/+=]+)"'
# Absolute URL of a consultation detail page; groups: (refConsultation, orgAcronyme).
link_regex = r'^https://www\.marches-publics\.gouv\.fr/\?page=entreprise\.EntrepriseDetailConsultation&refConsultation=([\d]+)&orgAcronyme=([\da-z]+)$'
# Relative link to the reglement download; groups: (reference, orgAcronyme).
reglement_regex = r'^index\.php\?page=entreprise\.EntrepriseDownloadReglement&reference=([a-zA-Z\d]+)&orgAcronyme=([\da-z]+)$'
# boamp.fr notice URL; the single group is the notice id (digits and hyphens).
boamp_regex = r'^http://www\.boamp\.fr/index\.php/avis/detail/([\d-]+)$'


In [53]:
def extract_links(request_result, regex):
    """Return the href of every <a> tag in a response whose href matches ``regex``.

    Args:
        request_result: object exposing the HTML document as ``.text``
            (the notebook passes ``requests`` responses).
        regex: pattern handed to ``re.match``, so matching is anchored at the
            start of each href.

    Returns:
        list of the matching href strings, in the order ``find_all`` yields
        the tags; duplicates are kept.
    """
    soup = BeautifulSoup(request_result.text, 'html.parser')
    matching_hrefs = []
    for anchor in soup.find_all('a'):
        # Anchors without an href attribute are ignored.
        if 'href' not in anchor.attrs:
            continue
        target = anchor.attrs['href']
        if re.match(regex, target):
            matching_hrefs.append(target)
    return matching_hrefs

In [5]:
def init():
    """Open the search and switch the result list to 20 results per page.

    GETs ``url`` to read the initial PRADO page state, then POSTs that state
    back with the page-size control as postback target.

    Returns:
        (links, page_state): the consultation links found in the POST response
        and the page state read from it, to be handed to ``next_page``.

    Raises:
        AssertionError: if either response status is not 200.
        AttributeError: if a response holds no PRADO_PAGESTATE field
            (``re.search`` then returns None).
    """
    page_size_control = 'ctl0$CONTENU_PAGE$resultSearch$listePageSizeTop'

    # The first request only serves to obtain a page state to post back.
    landing = requests.get(url)
    assert landing.status_code == 200
    initial_state = re.search(page_state_regex, landing.text).groups()[0]

    form = {
        'PRADO_PAGESTATE': initial_state,
        'PRADO_POSTBACK_TARGET': page_size_control,
        page_size_control: 20,
    }
    first_page = requests.post(url, data=form)
    assert first_page.status_code == 200

    found_links = extract_links(first_page, link_regex)
    next_state = re.search(page_state_regex, first_page.text).groups()[0]
    return found_links, next_state

In [10]:
class NoMoreResultsException(Exception):
    """Signals that the result pager has no further page to serve.

    ``next_page`` raises it when the server answers a pagination postback
    with HTTP 500.
    """

def next_page(page_state):
    """Fetch one page of search results.

    Args:
        page_state: PRADO page state returned by the previous call, or a falsy
            value to start the search from scratch through ``init``.

    Returns:
        (links, page_state): the consultation links of the fetched page and
        the page state to pass to the following call.

    Raises:
        NoMoreResultsException: if the server answers with HTTP 500.
        AssertionError: if the status is neither 500 nor 200.
        AttributeError: if the response holds no PRADO_PAGESTATE field.
    """
    if page_state:
        form = {
            'PRADO_PAGESTATE': page_state,
            'PRADO_POSTBACK_TARGET': 'ctl0$CONTENU_PAGE$resultSearch$PagerTop$ctl2',
        }
        response = requests.post(url, data=form)

        # A 500 answer is treated as "paged past the last result".
        if response.status_code == 500:
            raise NoMoreResultsException()
        assert response.status_code == 200

        found_links = extract_links(response, link_regex)
        following_state = re.search(page_state_regex, response.text).groups()[0]
        return found_links, following_state

    # No state yet: this is the very first page.
    return init()

# Fetch annonce ids

In [7]:
# Walk the result pages one after the other, keeping one list of links per
# page, until next_page reports that there is nothing left.
i = 0
links_by_page = []
page_state = None  # falsy, so the first next_page call goes through init()

while True:
    try:
        links, page_state = next_page(page_state)
    except NoMoreResultsException:
        break
    links_by_page.append(links)

    # Progress trace: index of the page just stored.
    print(i)
    i += 1



0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114


AssertionError: 

In [13]:
# Flatten the per-page lists into one list, keeping page order.
all_links = []
for links in links_by_page:
    all_links.extend(links)

In [16]:
# Sanity check: no consultation link was collected more than once.
assert len(all_links) == len(set(all_links))

# Fetch data for one annonce

In [17]:
# Use the first collected link as the sample for the cells below.
link = all_links[0]

In [98]:
# link_regex captures (refConsultation, orgAcronyme), in that order.
annonce_id, org_acronym = re.match(link_regex, link).groups()

In [97]:
# Detail-page URL rebuilt from the two identifiers.
# NOTE(review): the page name here is "EntrepriseDetailsConsultation", whereas
# link_regex matches "EntrepriseDetailConsultation" — confirm the difference is intended.
url_annonce = 'https://www.marches-publics.gouv.fr/index.php?page=entreprise.EntrepriseDetailsConsultation&refConsultation={}&orgAcronyme={}'.format(annonce_id, org_acronym)

In [102]:
# Display the URL that was just built.
url_annonce

'https://www.marches-publics.gouv.fr/index.php?page=entreprise.EntrepriseDetailsConsultation&refConsultation=310484&orgAcronyme=t9y'

In [99]:
# Fetch the consultation detail page; the cell stops on any non-200 answer.
r = requests.get(url_annonce)
assert r.status_code == 200
# Keep the page's PRADO state (AttributeError if the hidden field is absent).
page_state = re.search(page_state_regex, r.text).groups()[0]

## Fetch reglement

In [123]:
# Exactly one reglement download link is expected on the detail page.
links_reglement = extract_links(r, reglement_regex)
assert len(links_reglement) == 1
link_reglement = links_reglement[0]

In [124]:
# reglement_regex only matches relative "index.php?..." links, so prefix the site root.
url_reglement = 'https://www.marches-publics.gouv.fr/' + link_reglement

In [125]:
# Download the reglement file. No status check is made on this response.
r_reglement = requests.get(url_reglement)

In [133]:
# The download is expected to come back as a generic binary payload.
assert r_reglement.headers['Content-Type'] == 'application/octet-stream'
# Capture the file name from a header shaped like: attachment; filename="...";
regex_attachment = r'^attachment; filename="([^"]+)";$'
filename = re.match(regex_attachment, r_reglement.headers['Content-Disposition']).groups()[0]

In [134]:
# Body of the downloaded reglement, as bytes.
# `r_reglement` was requested without stream=True, so requests has already read
# the whole body: `r_reglement.raw` would be an exhausted stream, whereas
# `.content` holds the downloaded data.
file_content = r_reglement.content

## Get textual data from page

In [100]:
# Collect the BOAMP notice links of the detail page. Duplicates are collapsed
# through a set, and exactly one distinct notice URL is expected.
links_boamp = extract_links(r, boamp_regex)
unique_boamp = list(set(links_boamp))
assert len(unique_boamp) == 1
link_boamp = unique_boamp[0]
# Display the retained link.
link_boamp

'http://www.boamp.fr/index.php/avis/detail/16-163755'

In [101]:
# Parse the consultation detail page held in `r`. `soup` was never defined at
# notebook level (it only exists as a local inside extract_links), so this cell
# failed with a NameError on a fresh kernel.
soup = BeautifulSoup(r.text, 'html.parser')

# Text of the three summary fields, looked up by element id.
reference = soup.find(id="ctl0_CONTENU_PAGE_idEntrepriseConsultationSummary_reference").string
intitule = soup.find(id="ctl0_CONTENU_PAGE_idEntrepriseConsultationSummary_intitule").string
objet = soup.find(id="ctl0_CONTENU_PAGE_idEntrepriseConsultationSummary_objet").string

## Get Dossier de Consultation aux Entreprises

In [153]:
# Open the DCE download-request page for this consultation and read its PRADO
# state, which the postbacks below have to send back.
url_dce = 'https://www.marches-publics.gouv.fr/index.php?page=entreprise.EntrepriseDemandeTelechargementDce&refConsultation={}&orgAcronyme={}'.format(annonce_id, org_acronym)
r = requests.get(url_dce)
assert r.status_code == 200
page_state = re.search(page_state_regex, r.text).groups()[0]

In [155]:
# Post the request form back through the validate button with the
# "choixAnonyme" radio option selected (presumably an anonymous download —
# TODO confirm), then read the page state of the answer for the next postback.
data = {
    'PRADO_PAGESTATE': page_state,
    'PRADO_POSTBACK_TARGET': 'ctl0$CONTENU_PAGE$validateButton',
    'ctl0$CONTENU_PAGE$EntrepriseFormulaireDemande$RadioGroup': 'ctl0$CONTENU_PAGE$EntrepriseFormulaireDemande$choixAnonyme',
}
r = requests.post(url_dce, data=data)
assert r.status_code == 200
page_state = re.search(page_state_regex, r.text).groups()[0]



In [157]:
# Trigger the completeDownload postback. The headers recorded further down show
# that the answer is the zip archive itself, so no page state is read from it.
data = {
    'PRADO_PAGESTATE': page_state,
    'PRADO_POSTBACK_TARGET': 'ctl0$CONTENU_PAGE$EntrepriseDownloadDce$completeDownload',
}
r = requests.post(url_dce, data=data)
assert r.status_code == 200

AttributeError: 'NoneType' object has no attribute 'groups'

In [160]:
# Inspect the response headers of the download postback.
r.headers

{'Content-Type': 'application/zip', 'Pragma': 'public', 'Strict-Transport-Security': 'max-age=31536000', 'Keep-Alive': 'timeout=4, max=100', 'Server': 'Apache', 'Set-Cookie': 'PHPSESSID=lhik29t1o29tgmmtutbvj44rqgl0bnms; path=/; secure', 'Expires': '0', 'Connection': 'Keep-Alive', 'Content-Disposition': 'attachment; filename="DCE V4.zip";', 'Date': 'Sun, 22 Oct 2017 15:57:45 GMT', 'X-S-N': 'm-p-w-4', 'Transfer-Encoding': 'chunked', 'Content-Transfer-Encoding': 'binary', 'Cache-Control': 'must-revalidate, post-check=0, pre-check=0, private'}

In [161]:
# The DCE is expected to come back as a zip archive.
assert r.headers['Content-Type'] == 'application/zip'
# Capture the archive name from a header shaped like: attachment; filename="...";
regex_attachment = r'^attachment; filename="([^"]+)";$'
filename = re.match(regex_attachment, r.headers['Content-Disposition']).groups()[0]