In [1]:
import pandas as pd

from IPython.core.display import display, HTML 


pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

display(HTML("<style>.container { width:90% !important; }</style>")) #permet d'agrandir les cellules


#### Download the SIRENE database from data.gouv

In [2]:
import requests
import os
import zipfile

# Url of the sirene database on data.gouv
URL = "https://www.data.gouv.fr/fr/datasets/r/4a8befd7-1d05-41cd-9e1e-b0f19c73d73b" 

# Accessing the URL
r = requests.get(URL, allow_redirects=True)

# Specify the path where to store data
PATH = "C:/Users/defra/Desktop/git/DATA/"
# PATH = ""

# Storing data
with open(PATH+'sirene.zip', 'wb') as f:
    f.write(r.content)

# Unzipping data
with zipfile.ZipFile(PATH+'sirene.zip', 'r') as zip_ref:
    zip_ref.extractall(PATH)

#### Data sampling

In [None]:
# Reading extracted CSV (Warning: up to 7Go of RAM memory might be needed)
sirene = pd.read_csv(PATH+'StockEtablissement_utf8.csv', low_memory=False)


In [59]:
# Filtering
# active businesses
sirene = sirene.query("etatAdministratifEtablissement=='A'") 
# named businesses
sirene = sirene[~sirene.denominationUsuelleEtablissement.isna()] 
# retail businesses
sirene = sirene[sirene.activitePrincipaleEtablissement.apply(lambda u:u.startswith('47'))]
# checking for an adress
sirene = sirene[~sirene.libelleCommuneEtablissement.isna()]

In [60]:
# Businesses' adress
sirene["adress"] = (
    sirene.numeroVoieEtablissement.fillna('') + ' ' +
    sirene.typeVoieEtablissement.fillna('') + ' ' +
    sirene.libelleVoieEtablissement.fillna('') + ' ' +
    sirene.codePostalEtablissement.fillna(0).astype(int).astype(str).replace('0','') + ' ' + 
    sirene.libelleCommuneEtablissement.fillna('')
).apply(lambda u: ' '.join(u.split()))


# Building search requests
sirene['request'] = (
    # name of the business
    sirene.denominationUsuelleEtablissement.apply(lambda u:'"'+u+'"') + '+' +
    # city of the business
    sirene.libelleCommuneEtablissement.apply(lambda u: '+'.join(u.split()))
)

In [61]:
# Drawing a sample 
test = sirene.sample(n=5, random_state=0)
test

Unnamed: 0,siren,nic,siret,statutDiffusionEtablissement,dateCreationEtablissement,trancheEffectifsEtablissement,anneeEffectifsEtablissement,activitePrincipaleRegistreMetiersEtablissement,dateDernierTraitementEtablissement,etablissementSiege,nombrePeriodesEtablissement,complementAdresseEtablissement,numeroVoieEtablissement,indiceRepetitionEtablissement,typeVoieEtablissement,libelleVoieEtablissement,codePostalEtablissement,libelleCommuneEtablissement,libelleCommuneEtrangerEtablissement,distributionSpecialeEtablissement,codeCommuneEtablissement,codeCedexEtablissement,libelleCedexEtablissement,codePaysEtrangerEtablissement,libellePaysEtrangerEtablissement,complementAdresse2Etablissement,numeroVoie2Etablissement,indiceRepetition2Etablissement,typeVoie2Etablissement,libelleVoie2Etablissement,codePostal2Etablissement,libelleCommune2Etablissement,libelleCommuneEtranger2Etablissement,distributionSpeciale2Etablissement,codeCommune2Etablissement,codeCedex2Etablissement,libelleCedex2Etablissement,codePaysEtranger2Etablissement,libellePaysEtranger2Etablissement,dateDebut,etatAdministratifEtablissement,enseigne1Etablissement,enseigne2Etablissement,enseigne3Etablissement,denominationUsuelleEtablissement,activitePrincipaleEtablissement,nomenclatureActivitePrincipaleEtablissement,caractereEmployeurEtablissement,adress,name,query,request
23728492,791068182,17,79106818200017,O,2013-02-10,1.0,2018.0,,2020-08-25T11:01:29,True,1,,7,,RUE,DU QUATRE SEPTEMBRE,34500.0,BEZIERS,,,34032,,,,,,,,,,,,,,,,,,,2013-02-10,A,MADAME M,,,MADAME M,47.72A,NAFRev2,O,7 RUE DU QUATRE SEPTEMBRE 34500 BEZIERS,"""MADAME M""","""MADAME M""+BEZIERS","""MADAME M""+BEZIERS"
28544991,844021519,13,84402151900013,O,2018-12-01,,,,2019-11-14T14:01:05,True,1,,6,,RUE,ROGER VAILLANT,91700.0,SAINTE-GENEVIEVE-DES-BOIS,,,91549,,,,,,,,,,,,,,,,,,,2018-12-01,A,,,,BOOTIK ESSONNE.FR,47.82Z,NAFRev2,N,6 RUE ROGER VAILLANT 91700 SAINTE-GENEVIEVE-DE...,"""BOOTIK ESSONNE.FR""","""BOOTIK ESSONNE.FR""+SAINTE-GENEVIEVE-DES-BOIS","""BOOTIK ESSONNE.FR""+SAINTE-GENEVIEVE-DES-BOIS"
2598060,323473884,43,32347388400043,O,2013-06-20,,,,2019-11-14T14:00:27,True,5,LA GEBELINIERE,135,,CHE,DU COTEAU,26750.0,SAINT-PAUL-LES-ROMANS,,,26323,,,,,,,,,,,,,,,,,,,2019-04-20,A,,,,THIERY OUTILLAGE,47.89Z,NAFRev2,N,135 CHE DU COTEAU 26750 SAINT-PAUL-LES-ROMANS,"""THIERY OUTILLAGE""","""THIERY OUTILLAGE""+SAINT-PAUL-LES-ROMANS","""THIERY OUTILLAGE""+SAINT-PAUL-LES-ROMANS"
28882261,849710090,10,84971009000010,O,2019-04-13,,,,2019-10-17T03:55:24,True,1,,125,,RUE,DE MEAUX,75019.0,PARIS 19,,,75119,,,,,,,,,,,,,,,,,,,2019-04-13,A,,,,SUBSTANCES,47.25Z,NAFRev2,N,125 RUE DE MEAUX 75019 PARIS 19,"""SUBSTANCES""","""SUBSTANCES""+PARIS+19","""SUBSTANCES""+PARIS+19"
16905360,493886832,69,49388683200069,O,2016-03-01,11.0,2018.0,,2020-08-25T10:39:53,True,1,,67,,RUE,CHAMP DE L ORME,69100.0,VILLEURBANNE,,,69266,,,,,,,,,,,,,,,,,,,2016-03-01,A,,,,FEN PRO - HM RENOV 26,47.52B,NAFRev2,O,67 RUE CHAMP DE L ORME 69100 VILLEURBANNE,"""FEN PRO - HM RENOV 26""","""FEN PRO - HM RENOV 26""+VILLEURBANNE","""FEN PRO - HM RENOV 26""+VILLEURBANNE"


#### Google Search Results

In [67]:
import selenium.webdriver as webdriver

In [65]:
req = URL+

In [74]:
URL = "https://google.com/search?q="
browser = webdriver.Firefox()
# browser.get(URL+request)

WebDriverException: Message: 'geckodriver' executable needs to be in PATH. 


In [68]:
def get_result(request):
    """ Search result of the request."""
    
    URL = "https://google.com/search?q="
    browser = webdriver.Firefox()
    browser.get(URL+request)
    

In [70]:
get_result('"FEN PRO - HM RENOV 26"+VILLEURBANNE')

WebDriverException: Message: 'geckodriver' executable needs to be in PATH. 


In [66]:
print(req)

https://google.com/search?q="FEN PRO - HM RENOV 26"+VILLEURBANNE


In [55]:
sirene.request

119             "PASSIONNEMENT GLAMOUR"+GREOUX-LES-BAINS
120             "PASSIONNEMENT GLAMOUR"+GREOUX-LES-BAINS
5395                         "LA CITADELLE"+LE+POULIGUEN
6006                  "BOUCHERIE CHANZY"+DIGNE-LES-BAINS
7709                    "CHERINGTON MANAGEMENT"+GUERANDE
                                ...                     
30342363                   "JARDINERIE GRASSOT"+BRIGNAIS
30342451    "CLOSTAN MA FERME VIE MEL CYBELLE"+LYON+7EME
30353379                             "MINELLI"+LYON+8EME
30362112                                 "LE FRIAND"+DAX
30364458                      "CENTRE LECLERC"+CAPBRETON
Name: request, Length: 151298, dtype: object

In [44]:
sirene.head()

Unnamed: 0,siren,nic,siret,statutDiffusionEtablissement,dateCreationEtablissement,trancheEffectifsEtablissement,anneeEffectifsEtablissement,activitePrincipaleRegistreMetiersEtablissement,dateDernierTraitementEtablissement,etablissementSiege,nombrePeriodesEtablissement,complementAdresseEtablissement,numeroVoieEtablissement,indiceRepetitionEtablissement,typeVoieEtablissement,libelleVoieEtablissement,codePostalEtablissement,libelleCommuneEtablissement,libelleCommuneEtrangerEtablissement,distributionSpecialeEtablissement,codeCommuneEtablissement,codeCedexEtablissement,libelleCedexEtablissement,codePaysEtrangerEtablissement,libellePaysEtrangerEtablissement,complementAdresse2Etablissement,numeroVoie2Etablissement,indiceRepetition2Etablissement,typeVoie2Etablissement,libelleVoie2Etablissement,codePostal2Etablissement,libelleCommune2Etablissement,libelleCommuneEtranger2Etablissement,distributionSpeciale2Etablissement,codeCommune2Etablissement,codeCedex2Etablissement,libelleCedex2Etablissement,codePaysEtranger2Etablissement,libellePaysEtranger2Etablissement,dateDebut,etatAdministratifEtablissement,enseigne1Etablissement,enseigne2Etablissement,enseigne3Etablissement,denominationUsuelleEtablissement,activitePrincipaleEtablissement,nomenclatureActivitePrincipaleEtablissement,caractereEmployeurEtablissement,adress,name
119,5450093,33,545009300033,O,2001-02-20,NN,,,2019-11-14T14:00:14,True,7,LE PASSY,,,AV,DES THERMES,4800.0,GREOUX-LES-BAINS,,,4094,,,,,,,,,,,,,,,,,,,2010-12-24,A,PASSIONNEMENT GLAMOUR,,,PASSIONNEMENT GLAMOUR,47.78C,NAFRev2,O,AV DES THERMES 4800 GREOUX-LES-BAINS,"""PASSIONNEMENT GLAMOUR"""
120,5450093,41,545009300041,O,2011-04-12,00,2017.0,,2019-11-14T14:00:14,False,1,,36.0,,RUE,GRANDE,4800.0,GREOUX-LES-BAINS,,,4094,,,,,,,,,,,,,,,,,,,2011-04-12,A,PASSIONNEMENT GLAMOUR,,,PASSIONNEMENT GLAMOUR,47.71Z,NAFRev2,O,36 RUE GRANDE 4800 GREOUX-LES-BAINS,"""PASSIONNEMENT GLAMOUR"""
5395,6773386,54,677338600054,O,1993-03-23,NN,,,2016-09-11T03:30:26,True,15,,17.0,,RUE,GREBES,44510.0,LE POULIGUEN,,,44135,,,,,,,,,,,,,,,,,,,2016-09-01,A,LA CITADELLE,,,LA CITADELLE,47.89Z,NAFRev2,N,17 RUE GREBES 44510 LE POULIGUEN,"""LA CITADELLE"""
6006,6950034,22,695003400022,O,1983-12-31,02,2018.0,4722ZB,2020-08-25T10:10:14,True,5,,18.0,,RUE,DU COLONEL PAYAN,4000.0,DIGNE-LES-BAINS,,,4070,,,,,,,,,,,,,,,,,,,2014-03-11,A,BOUCHERIE CHANZY,,,BOUCHERIE CHANZY,47.22Z,NAFRev2,O,18 RUE DU COLONEL PAYAN 4000 DIGNE-LES-BAINS,"""BOUCHERIE CHANZY"""
7709,7272941,27,727294100027,O,2016-08-01,,,3312ZB,2016-09-06T04:47:05,True,2,,7.0,,RUE,DE BIZIENNE,44350.0,GUERANDE,,,44069,,,,,,,,,,,,,,,,,,,2016-09-05,A,,,,CHERINGTON MANAGEMENT,47.99A,NAFRev2,N,7 RUE DE BIZIENNE 44350 GUERANDE,"""CHERINGTON MANAGEMENT"""


In [51]:
(URL+sirene.name+'+'+).values

array(['https://google.com/search?q="PASSIONNEMENT GLAMOUR"+GREOUX-LES-BAINS',
       'https://google.com/search?q="PASSIONNEMENT GLAMOUR"+GREOUX-LES-BAINS',
       'https://google.com/search?q="LA CITADELLE"+LE+POULIGUEN', ...,
       'https://google.com/search?q="MINELLI"+LYON+8EME',
       'https://google.com/search?q="LE FRIAND"+DAX',
       'https://google.com/search?q="CENTRE LECLERC"+CAPBRETON'],
      dtype=object)

In [42]:
sirene.apply(lambda u:URL+u)

119         https://google.com/search?q="PASSIONNEMENT GLA...
120         https://google.com/search?q="PASSIONNEMENT GLA...
5395               https://google.com/search?q="LA CITADELLE"
6006           https://google.com/search?q="BOUCHERIE CHANZY"
7709        https://google.com/search?q="CHERINGTON MANAGE...
                                  ...                        
30342363     https://google.com/search?q="JARDINERIE GRASSOT"
30342451    https://google.com/search?q="CLOSTAN MA FERME ...
30353379                https://google.com/search?q="MINELLI"
30362112              https://google.com/search?q="LE FRIAND"
30364458         https://google.com/search?q="CENTRE LECLERC"
Name: name, Length: 151433, dtype: object