In [1]:
import json
import pandas as pd
import numpy as np
import re
import mysql.connector 
import ast

# Subset valid SIRENs

SIRENs were filtered according to the following criteria: 
* Date of first registration: either empty or before 1 December 2017
* Date of deregistration either empty or after 1 January 2016
* Company size is neither intermediary nor big (i.e. either small-medium or empty)

In [2]:
def fetch_all(query):
    connection = mysql.connector.connect(host='localhost', database='rncs', user='admin', password='Pg49mkMfF4SuLLbA')
    cursor = connection.cursor(buffered=True)    
    cursor.execute(query)
    full = cursor.fetchall()
    cursor.close()
    connection.close()
    return full

In [3]:
query = '''
    SELECT 
        imr_pm.siren,
        'imr-pm' as origin,
        imr_pm.denomination,
        imr_pm.date_1re_immatriculation,
        imr_pm.date_radiation,
        insee.categorie_entreprise
    FROM
        imr_pm
    LEFT JOIN insee_unite_legale insee ON insee.siren = imr_pm.siren
    WHERE 
        (imr_pm.date_1re_immatriculation < '2017-12-01' OR imr_pm.date_1re_immatriculation IS NULL) AND
        (imr_pm.date_radiation > '2016-01-01' OR imr_pm.date_radiation IS NULL) AND
        insee.categorie_entreprise <> 'ETI' AND
        insee.categorie_entreprise <> 'GE'
    
    UNION

    SELECT 
        imr_pp.siren,
        'imr-pp' as origin,
        imr_pp.nom_patronymique,
        imr_pp.date_1re_immatriculation,
        imr_pp.date_radiation,
        insee.categorie_entreprise
    FROM
        imr_pp
    LEFT JOIN insee_unite_legale insee ON insee.siren = imr_pp.siren
    WHERE 
        (imr_pp.date_1re_immatriculation < '2017-12-01' OR imr_pp.date_1re_immatriculation IS NULL) AND
        (imr_pp.date_radiation > '2016-01-01' OR imr_pp.date_radiation IS NULL) AND
        insee.categorie_entreprise <> 'ETI' AND
        insee.categorie_entreprise <> 'GE'  
'''

result = fetch_all(query)
result_df = pd.DataFrame(result, 
                         columns=['siren', 'origin', 'name', 'date_first_entry', 'date_deregistration', 'cat'])
sirens_for_filtering = np.array(result_df['siren'].unique())
np.save('sirens_for_filtering.npy', sirens_for_filtering)

# Load dictionaries

In [3]:
def load_json(path):
    f = open(path, "r")
    json_loaded = json.load(f)
    f.close()
    return json_loaded

In [5]:
sirens_warcs = load_json("/project/0_cleaning/output_cleaning/sirens_in_warcs.json")
sirets_warcs = load_json("/project/0_cleaning/output_cleaning/sirets_in_warcs.json")
sirens_web = load_json("/project/0_cleaning/output_cleaning/sirens_in_web.json")
sirets_web = load_json("/project/0_cleaning/output_cleaning/sirets_in_web.json")
dics = [
    [sirens_warcs, 0, 'sirens_warcs_filtered'],
    [sirets_warcs, 1, 'sirets_warcs_filtered'],
    [sirens_web, 0, 'sirens_web_filtered'],
    [sirets_web, 1, 'sirets_web_filtered']
]

# Filter by existing SIRENs

In [6]:
def filter_sirs(dic, sirs_off, siret_true): # 0 for siren, 1 for siret
    
    filtered = {}
    
    for i, (domain, sirs) in enumerate(dic.items()):
            
        if siret_true:
            sirs = [sir[:9] for sir in sirs]
        filtered_sirs = [sir for sir in sirs if sir in sirs_off]
        filtered[domain] = filtered_sirs
        
    return filtered

In [7]:
def json_save(obj, name):
    json_content = json.dumps(obj)
    f = open(f"{name}.json","w")
    f.write(json_content)
    f.close()

In [8]:
sirens_off = np.load('sirens_for_filtering.npy')

for i, dic in enumerate(dics):
    dic_filtered = filter_sirs(dic[0], sirens_off, dic[1])
    json_save(dic_filtered, dic[2])
    print(f'{i+1} dictionaries processed.')

1 dictionaries processed.
2 dictionaries processed.
3 dictionaries processed.
4 dictionaries processed.


# Merge dictionaries and assign confidence level for SIRENs

In [4]:
sirens_warcs_filtered = load_json("sirens_warcs_filtered.json")
sirets_warcs_filtered = load_json("sirets_warcs_filtered.json")
sirens_web_filtered = load_json("sirens_web_filtered.json")
sirets_web_filtered = load_json("sirets_web_filtered.json")

sirens_exclude = load_json("sirens-to-exclude.json")
domains_exclude = load_json("domains-to-exclude.json")

domains = list(sirens_warcs_filtered.keys())

In [5]:
def check_domain(domain):
    for exclude in domains_exclude:
        if re.search(exclude, domain):
            return False
    return True

In [8]:
sirens = []

for domain in domains:
    if check_domain(domain):
        domain_sirens_from_sirets_web = list(dict.fromkeys([sir[:9] for sir in sirets_web_filtered[domain] if sir not in sirens_exclude]))
        domain_sirens_web = [sir for sir in sirens_web_filtered[domain] if sir not in sirens_exclude]
        domain_sirens_from_sirets_warcs = list(dict.fromkeys([sir[:9] for sir in sirets_warcs_filtered[domain] if sir not in sirens_exclude]))
        domain_sirens_warcs = [sir for sir in sirens_warcs_filtered[domain] if sir not in sirens_exclude]
        final_sirens = []
        confidence = ''

        sirs_web = list(dict.fromkeys(domain_sirens_from_sirets_web + domain_sirens_web))
        sirs_warcs = list(dict.fromkeys(domain_sirens_from_sirets_warcs + domain_sirens_warcs))

        if len(sirs_web)>0:
            if len(sirs_web)==1:
                final_sirens = sirs_web
                confidence = '***'
                origin = 'web'
            else:
                final_sirens = sirs_web
                confidence = '**'
                origin = 'web'
        elif len(sirs_warcs)>0:
            if len(sirs_warcs)==1:
                final_sirens = sirs_warcs
                confidence = '***'
                origin = 'warc'
            else:
                final_sirens = sirs_warcs
                confidence = '*'
                origin = 'warc'
        else:
            final_sirens = []
            confidence = ''
            origin = ''

        sirens.append({
            'domain': domain,
            'siren': final_sirens,
            'confidence': confidence,
            'origin': origin
        })

# Raw ooutput
sirens_df = pd.DataFrame(sirens)
sirens_df = sirens_df[['confidence', 'domain', 'siren', 'origin']]
sirens_df.to_csv('/project/0_cleaning/output_cleaning/sirens_all.csv', index=False)
sirens_df.head()

Unnamed: 0,confidence,domain,siren,origin
0,***,www.allofamille.fr,[512719188],warc
1,*,www.europages.fr,"[542066717, 520840695, 505015644, 524862661, 5...",warc
2,,www.bibamagazine.fr,[],
3,***,www.easypets.fr,[530772565],warc
4,,www.univeda.fr,[],


# Filter by relevant ecommerce app

In [9]:
domain_wapps = load_json("/project/0_cleaning/output_cleaning/ecom_wapps_per_domain.json")

domains = []
wapps = []
for key, value in domain_wapps.items():
    domains.append(key)
    wapps.append(value)
    
domain_wapps_df = pd.DataFrame()
domain_wapps_df['domain'] = domains
domain_wapps_df['wapps'] = wapps
domain_wapps_df.head()

Unnamed: 0,domain,wapps
0,www.boutique-waterair.fr,[PrestaShop]
1,www.manga-vf.fr,[OpenCart]
2,www.machemer-online.fr,[Magento]
3,www.bougezpourlivg.fr,[OpenCart]
4,www.kalifrais.fr,[Magento]


In [29]:
sirens_incl_wapps = sirens_df.merge(domain_wapps_df)
sirens_incl_wapps.to_excel('sirens_incl_wapps.xlsx')
sirens_incl_wapps.head()

Unnamed: 0,confidence,domain,siren,origin,wapps
0,***,www.allofamille.fr,[512719188],warc,[IBMWebSphereCommerce]
1,*,www.europages.fr,"[542066717, 520840695, 505015644, 524862661, 5...",warc,[OpenCart]
2,,www.bibamagazine.fr,[],,[IBMWebSphereCommerce]
3,***,www.easypets.fr,[530772565],warc,[PrestaShop]
4,,www.univeda.fr,[],,[PrestaShop]


In [14]:
relevant_ecom_wapps = ['PrestaShop','WooCommerce','Magento','OpenCart','Shopify','EPages','DrupalCommerce','Thelia']
relevant_ecom_app = []
for i in range(sirens_incl_wapps.shape[0]):
    wapps = sirens_incl_wapps.iloc[i,4]
    keep = 0
    for wapp in wapps:
        if wapp in relevant_ecom_wapps:
            keep = 1
    relevant_ecom_app.append(keep)

sirens_incl_wapps['keep'] = relevant_ecom_app

In [15]:
sirens_incl_wapps.head()

Unnamed: 0,confidence,domain,siren,origin,wapps,keep
0,***,www.allofamille.fr,[512719188],warc,[IBMWebSphereCommerce],0
1,*,www.europages.fr,"[542066717, 520840695, 505015644, 524862661, 5...",warc,[OpenCart],1
2,,www.bibamagazine.fr,[],,[IBMWebSphereCommerce],0
3,***,www.easypets.fr,[530772565],warc,[PrestaShop],1
4,,www.univeda.fr,[],,[PrestaShop],1


# Save final list of SIREN

In [23]:
# Final list of sirens incl. domain
sirens_final = sirens_incl_wapps.loc[(sirens_incl_wapps['confidence'] == '***') & (sirens_incl_wapps['keep'] == 1),['domain','siren','origin','wapps']]
sirens_final['siren'] = sirens_final['siren'].apply(lambda x: x[0])
sirens_final.to_csv('/project/0_cleaning/output_cleaning/sirens_final.csv', index=False)
sirens_final.to_excel('sirens_final.xlsx', index=False)
# Final list of sirens
sirens = sirens_final['siren'].tolist()
sirens = list(dict.fromkeys(sirens))
np.save('/project/0_cleaning/output_cleaning/sirens.npy', sirens)
print(f'{len(sirens)} unique sirens extracted.')

3573 unique sirens extracted.
