Created on Monday 04 January 2021 

**Group 5 - Identification**
**FNCCR scraper**

@authors: Marianne Manson

# FNCCR

URL utilisé : https://www.fnccr.asso.fr/article/big-data-territorial-publication-de-letude-de-la-fnccr/

Liste des articles: https://www.fnccr.asso.fr/article/page/2/ (Première page : https://www.fnccr.asso.fr/article/)

In [83]:
#import de lib
import pandas as pd
import numpy as np
import re
from requests import get
from bs4 import BeautifulSoup
from datetime import datetime

In [84]:
df_scrap = pd.DataFrame(columns=['art_content','art_content_html','art_extract_datetime','art_lang','art_title','art_url','src_name','src_type','src_url','src_img','art_auth','art_tag']  )
df_scrap

Unnamed: 0,art_content,art_content_html,art_extract_datetime,art_lang,art_title,art_url,src_name,src_type,src_url,src_img,art_auth,art_tag


In [85]:
def get_urls_article_fnccr(page_url):
    """Documentation
    
    Parameters:
        page_url: url of the page where articles are listed
    
    Out:
        url_art_list: list with the urls of public articles
    """
    url_art_list = []
    req = get(page_url)
    html_soup = BeautifulSoup(req.text, 'html.parser')
    zone = html_soup.find("div",{'class':"conteneur_a_conteneur_article"})
    art_divs = zone.find_all("div",{'class':"conteneur_article archive_article_type"})
    for art_div in art_divs:
        # Some articles can be accessed only if the user is connected
        link_text = art_div.find("i").text
        is_article_public = (link_text == "lire la suite.") # This works for recent articles
        art_title = art_div.find("h2")
        art_url = art_title.find("a")['href']
        # Another distinction for older articles
        if is_article_public:
            req_art = get(art_url)
            html_soup_art = BeautifulSoup(req_art.text, 'html.parser')
            art_class = html_soup_art.find("article")['class']
            is_article_public = not ("post-password-required" in art_class)
        if is_article_public:
            url_art_list.append(art_url)
    return url_art_list

In [86]:
test_list = get_urls_article_fnccr("https://www.fnccr.asso.fr/article/page/14/")
test_list

['https://www.fnccr.asso.fr/article/loi-anti-gaspillage/',
 'https://www.fnccr.asso.fr/article/biodechets/',
 'https://www.fnccr.asso.fr/article/tarification-incitative/',
 'https://www.fnccr.asso.fr/article/traitement-budgetaire-comptable-des-depenses-liees-a-la-crise-assoupli/']

In [87]:
def get_nb_pages(first_page_url):
    '''Documentation
    
    Parameters:
        first_page_url: url of the first page of the article list
    
    Out:
        nb_pages: number of pages in the article list
    '''
    req = get(first_page_url)
    html_soup = BeautifulSoup(req.text, 'html.parser')
    zone = html_soup.find("main")
    nav_zone = zone.find("div",{'class':"navigation"})
    list_nav = nav_zone.find_all("li")
    last_item = list_nav[-2]
    nb_pages = int(last_item.text)
    return nb_pages

In [88]:
print(get_nb_pages("https://www.fnccr.asso.fr/article/page/1/"))

235


In [89]:
def scraping(url,df):
    '''Documentation
    
    Parameters:
        url: url of the scraped page
        df: dataframe in which data will be put
    
    Out:
        out_df: dataframe with data
    '''
    req = get(url)
    html_soup = BeautifulSoup(req.text, 'html.parser') 
    # content, content_html
    try:
        content = html_soup.find("div",{'class':"contenu_c"})
        content_html = str(content)
        content = content.text
    except:
        content_html = "no_data"
        content = "no_data"
    # date
    date = html_soup.find("time",{'class':"entry-date published"})
    try:
        date = date.text
    except:
        date = str(datetime.today().strftime('%Y-%m-%d')) # if no date is specified, put scraping date
    # tag, title
    presentation = html_soup.find("div",{'class':"prensentation"})
    # tag = presentation.find("p")
    # tag = tag.text
    tag = "no_data" # tags are not always interesting
    title = presentation.find("h1")
    title = title.text
    # Remplissage du dataframe 
    new_row = {'art_content': content ,
               'art_content_html': content_html ,
               'art_extract_datetime': date ,
               'art_lang': 'fr' , 
               'art_title' : title , 
               'art_url' : url ,
               'src_name' :'fnccr'  ,
               'src_type' : 'xpath_source' ,
               'src_url' : 'https://www.fnccr.asso.fr/',
               'src_img' : "no_data" , # No images
               'art_auth': "no_data", # No author specified
               'art_tag': tag}
    out_df = df.append(new_row, ignore_index=True)
    return out_df

In [90]:
df_test = scraping("https://www.fnccr.asso.fr/article/big-data-territorial-publication-de-letude-de-la-fnccr/",df_scrap)
df_test

Unnamed: 0,art_content,art_content_html,art_extract_datetime,art_lang,art_title,art_url,src_name,src_type,src_url,src_img,art_auth,art_tag
0,Si les regards se tournent souvent vers les gr...,"<div class=""contenu_c""><p>Si les regards se to...",17 novembre 2016,fr,"""Big data territorial"" : Publication de l'étud...",https://www.fnccr.asso.fr/article/big-data-ter...,fnccr,xpath_source,https://www.fnccr.asso.fr/,no_data,no_data,no_data


In [82]:
nb_pages = get_nb_pages("https://www.fnccr.asso.fr/article/page/1/")
url_list = []
for num_page in range(1,nb_pages+1):
    #print(num_page)
    url_page = "https://www.fnccr.asso.fr/article/page/"+str(num_page)+"/"
    page_list = get_urls_article_fnccr(url_page)
    url_list.extend(page_list)

df_test = pd.DataFrame(columns=['art_content','art_content_html','art_extract_datetime','art_lang','art_title','art_url','src_name','src_type','src_url','src_img','art_auth','art_tag'])
for url in url_list:
    #print(url)
    df_test = scraping(url,df_test)
df_test

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
https://www.fnccr.asso.fr/article/notre-avenir-energetique-avec-le-gaz-renouvelable/
https://www.fnccr.asso.fr/article/conventions-pilotes-sante-nouvelle-aquitaine/
htt

https://www.fnccr.asso.fr/article/publication-ademe-dechets-chiffres-cles-lessentiel-2019/
https://www.fnccr.asso.fr/article/fonctionnement-des-collectivites-intercommunalites/
https://www.fnccr.asso.fr/article/rcf-tout-savoir-sur-le-bois-energie-en-quelques-questions-reponses/
https://www.fnccr.asso.fr/article/la-fnccr-presente-14-propositions-et-29-travaux-de-proximite/
https://www.fnccr.asso.fr/article/webinaire-collectivite-ou-industrie-meme-combat/
https://www.fnccr.asso.fr/article/info-covid-assainissement-collectif-et-boues/
https://www.fnccr.asso.fr/article/lancement-dactee-2-pour-la-renovation-energetique-des-collectivites/
https://www.fnccr.asso.fr/article/8eme-edition-du-concours-de-laarhse-janvier-2019-juin-2020/
https://www.fnccr.asso.fr/article/info-covid-commande-publique/
https://www.fnccr.asso.fr/article/publication-du-guide-pour-optimiser-les-seuils-de-la-commande-publique/
https://www.fnccr.asso.fr/article/nouvelle-prolongation-de-la-date-limite-de-validilite/
https:

https://www.fnccr.asso.fr/article/la-renovation-energetique-des-collectivites-en-bonne-voie/
https://www.fnccr.asso.fr/article/remise-des-prix-actee/
https://www.fnccr.asso.fr/article/retrouvez-la-fnccr-au-carrefour-de-leau-a-rennes-les-29-et-30-janvier/
https://www.fnccr.asso.fr/article/optimiser-son-reseau-de-chaleur-par-les-outils-numeriques-cest-facile/
https://www.fnccr.asso.fr/article/le-plan-france-tres-haut-debit-est-fortement-menace/
https://www.fnccr.asso.fr/article/economie-circulaire/
https://www.fnccr.asso.fr/article/reglementation-2/
https://www.fnccr.asso.fr/article/tgap/
https://www.fnccr.asso.fr/article/eco-organismes-2/
https://www.fnccr.asso.fr/article/methanisation/
https://www.fnccr.asso.fr/article/commande-publique-4/
https://www.fnccr.asso.fr/article/pjl-engagement-et-proximite-analyse-texte-cmp/
https://www.fnccr.asso.fr/article/teom-2/
https://www.fnccr.asso.fr/article/teom/
https://www.fnccr.asso.fr/article/lutte-contre-le-gaspillage-alimentaire-2/
https://www

https://www.fnccr.asso.fr/article/afnor-dt-dict-enquetes-publiques/
https://www.fnccr.asso.fr/article/principe-de-precaution-et-classification-dun-dechet-dangereux/
https://www.fnccr.asso.fr/article/rappel-des-depenses-prises-en-compte-pour-le-calcul-du-taux-de-la-teom/
https://www.fnccr.asso.fr/article/impossibilite-dinstituer-un-abattement-ou-une-exoneration-de-teom/
https://www.fnccr.asso.fr/article/interdiction-du-transfert-partiel-de-la-competence/
https://www.fnccr.asso.fr/article/marches-publics-les-nouveaux-formulaires-de-la-daj/
https://www.fnccr.asso.fr/article/lettre-juridique-mars-2019/
https://www.fnccr.asso.fr/article/les-chiffres-cles-dechets-pour-lannee-2018/
https://www.fnccr.asso.fr/article/derniers-chiffres-cles-sur-les-dechets/
https://www.fnccr.asso.fr/article/lettre-actualites-avril-2019/
https://www.fnccr.asso.fr/article/contribution-fnccr-territoire-denergie-sur-la-transition-energetique/
https://www.fnccr.asso.fr/article/lettre-dactualite-mars-2019/
https://www

https://www.fnccr.asso.fr/article/rapport-dactivite-de-la-fnccr/
https://www.fnccr.asso.fr/article/3eme-colloque-de-lonpe/
https://www.fnccr.asso.fr/article/modifications-de-la-loi-notre/
https://www.fnccr.asso.fr/article/les-consommateurs-denergie-et-la-transformation-numerique/
https://www.fnccr.asso.fr/article/conclusions-du-gt-solaire-analyse-de-la-fnccr-territoire-denergie/
https://www.fnccr.asso.fr/article/concours-ecoloustics-les-laureats-nationaux-recompenses-a-paris/
https://www.fnccr.asso.fr/article/gt-eclairage-public-05-07-2018/
https://www.fnccr.asso.fr/article/conseil-dorientation-des-eld-20-juin-2018/
https://www.fnccr.asso.fr/article/info-cnil-linky-gazpar-en-date-du-15-juin-2018/
https://www.fnccr.asso.fr/article/aquaplus-2018-cest-parti/
https://www.fnccr.asso.fr/article/la-fnccr-edite-un-guide-operationnel-pour-les-territoires/
https://www.fnccr.asso.fr/article/quatre-ecoles-laureates-du-concours-ecoloustics/
https://www.fnccr.asso.fr/article/gt-solaire-thermique-du-

https://www.fnccr.asso.fr/article/evaluation-des-fournisseurs-gaz-et-elec-par-les-acheteurs/
https://www.fnccr.asso.fr/article/rapport-dactivite-de-la-fnccr-juin-2017/
https://www.fnccr.asso.fr/article/journee-detudes-electricite-et-gaz/
https://www.fnccr.asso.fr/article/synthese-du-debat-autoconsommation-lautre-transition-energetique/
https://www.fnccr.asso.fr/article/quoi-de-neuf-sur-les-directives-europeennes-enr/
https://www.fnccr.asso.fr/article/mounir-mahjoubi-secretaire-detat-charge-du-numerique/
https://www.fnccr.asso.fr/article/comite-technique-des-reseaux-de-chaleur-urbain/
https://www.fnccr.asso.fr/article/les-reseaux-de-chaleur/
https://www.fnccr.asso.fr/article/13359/
https://www.fnccr.asso.fr/article/formations-cycle-eau-2021/
https://www.fnccr.asso.fr/article/mediation-de-leau/
https://www.fnccr.asso.fr/article/petit-dejeuner-debat-autoconsommation/
https://www.fnccr.asso.fr/article/point-sur-les-pcaet/
https://www.fnccr.asso.fr/article/fin-de-la-1ere-annee-dexperimentat

Unnamed: 0,art_content,art_content_html,art_extract_datetime,art_lang,art_title,art_url,src_name,src_type,src_url,src_img,art_auth,art_tag
0,Décarboner les bâtiments avec la RE2020\nLes s...,"<div class=""contenu_c""><p><strong>Décarboner l...",21 décembre 2020,fr,Notre avenir énergétique avec le gaz renouvelable,https://www.fnccr.asso.fr/article/notre-avenir...,fnccr,xpath_source,https://www.fnccr.asso.fr/,no_data,no_data,no_data
1,Le programme ACTEE (Actions des Collectivités ...,"<div class=""contenu_c""><p>Le programme ACTEE (...",2021-01-05,fr,Conventions pilotes Santé - Nouvelle Aquitaine,https://www.fnccr.asso.fr/article/conventions-...,fnccr,xpath_source,https://www.fnccr.asso.fr/,no_data,no_data,no_data
2,Le programme ACTEE accompagne les collectivité...,"<div class=""contenu_c""><p>Le programme ACTEE a...",2021-01-05,fr,Lancement AMI bâtiments publics médico-sociaux...,https://www.fnccr.asso.fr/article/lancement-am...,fnccr,xpath_source,https://www.fnccr.asso.fr/,no_data,no_data,no_data
3,"En collaboration avec l’AdCF, la FNCCR, via le...","<div class=""contenu_c""><p>En collaboration ave...",2021-01-05,fr,Webinaire présentation ACTEE - 11 décembre 2020,https://www.fnccr.asso.fr/article/webinaire-pr...,fnccr,xpath_source,https://www.fnccr.asso.fr/,no_data,no_data,no_data
4,Le rapport annuel de concession constitue le s...,"<div class=""contenu_c""><p>Le rapport annuel de...",2021-01-05,fr,Plan-guide rapport annuel concession chaleur,https://www.fnccr.asso.fr/article/plan-guide-r...,fnccr,xpath_source,https://www.fnccr.asso.fr/,no_data,no_data,no_data
...,...,...,...,...,...,...,...,...,...,...,...,...
563,"30 services participants, représentant environ...","<div class=""contenu_c""><p>30 services particip...",2021-01-05,fr,Rapports publics de l’analyse comparative des ...,https://www.fnccr.asso.fr/article/rapports-pub...,fnccr,xpath_source,https://www.fnccr.asso.fr/,no_data,no_data,no_data
564,"46 services participants, représentant environ...","<div class=""contenu_c""><p>46 services particip...",2021-01-05,fr,Rapports publics de l’analyse comparative des ...,https://www.fnccr.asso.fr/article/rapports-pub...,fnccr,xpath_source,https://www.fnccr.asso.fr/,no_data,no_data,no_data
565,La Lettre des CCSPL n°12 novembre-décembre 201...,"<div class=""contenu_c""><p><a href=""https://www...",2021-01-05,fr,La Lettre des CCSPL – Année 2010,https://www.fnccr.asso.fr/article/la-lettre-de...,fnccr,xpath_source,https://www.fnccr.asso.fr/,no_data,no_data,no_data
566,La Lettre des CCSPL n°6 novembre-décembre 2009...,"<div class=""contenu_c""><p><a href=""https://www...",2021-01-05,fr,La Lettre des CCSPL – Année 2009,https://www.fnccr.asso.fr/article/la-lettre-de...,fnccr,xpath_source,https://www.fnccr.asso.fr/,no_data,no_data,no_data
