In [None]:
import pandas as pd
from pandarallel import pandarallel
from abbreviations import schwartz_hearst
import time
import ftfy
import requests
import requests_cache
from rich.traceback import install
from tqdm import tqdm
from dotenv import dotenv_values
# Read settings (including the Bing API key) from the local ".env" file so that
# the credential is not hard-coded in the notebook.
config = dotenv_values(".env")

# Register tqdm's progress-bar helpers on pandas objects.
tqdm.pandas()
# Initialize pandarallel (with progress bars); the later `df.parallel_apply`
# calls rely on it.
pandarallel.initialize(progress_bar=True)
# Install a requests cache named 'enrich_cache' on the sqlite backend.
# NOTE(review): presumably this lets re-runs reuse earlier HTTP responses instead
# of calling the API again — confirm against the requests_cache documentation.
requests_cache.install_cache('enrich_cache', backend='sqlite')
# Install rich's traceback handler, with local variables shown in tracebacks.
install(show_locals=True)


# Bing Web Search v7 key and endpoint, read as globals by `enrich`.
subscription_key = config["BING_SUBSCRIPTION_KEY"]  # KeyError if the entry is missing from .env
search_url = "https://api.bing.microsoft.com/v7.0/search"

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# Load the source CSV that the rest of the notebook enriches.
# NOTE(review): the frame displayed further down already carries "Unnamed: 0.1" and
# "Unnamed: 0" columns, presumably row indexes written by earlier to_csv calls — confirm.
df = pd.read_csv('ref-rna-real-mars-2022.csv')

In [3]:
# Many titles contain both the association's name and its abbreviation, the
# latter sometimes in parentheses and sometimes not; preview two titles that
# contain an opening parenthesis.
# The "(" is matched literally (regex=False), so no regex escaping is needed, and
# na=False treats a missing title as "no match" instead of putting NaN in the mask.
df[df.titre.str.contains("(", regex=False, na=False)].head(2)

Unnamed: 0.1,Unnamed: 0,id,titre,objet,adrs_numvoie,adrs_typevoie,adrs_libvoie,adrs_codepostal,adrs_libcommune,siteweb,adrs,dept,region,social_object1_libelle,social_object2_libelle,longitude,latitude
13,13,W751172035,CONSEIL DES CAMEROUNAIS DE LA DIASPORA (CCD),oeuvrer pour l'instauration d'une société démo...,77 bis,RUE,Marcadet,75018.0,Paris,,77 bis RUE Marcadet 75018 Paris,Paris,Île-de-France,ACTIVITÉS POLITIQUES,AUTRES,2.346887,48.890284
14,14,W751173780,COMMUNAUTE DES ENFANTS DE LOUM EN FRANCE (CELFRA),"promouvoir la solidarité, l'entente et le part...",44,RUE,Piat,75020.0,Paris,,44 RUE Piat 75020 Paris,Paris,Île-de-France,INTERVENTIONS SOCIALES,AUTRES,2.384507,48.872674


In [7]:
def enrich(site, name, timeout=10):
    """Return the URL of the top Bing web result for an association on one site.

    The title is cleaned with ftfy and, when it contains a parenthesised
    abbreviation that Schwartz-Hearst can resolve, reduced to its long form.
    The query sent to the Bing Web Search API is ``"<name> site:<site>"`` with
    the ``fr-FR`` market.

    Parameters
    ----------
    site : str
        Domain the search is restricted to, e.g. ``"facebook.com"``.
    name : str
        Association title. Missing (NaN / None), non-string or blank values
        are not searched.
    timeout : float, optional
        Seconds to wait for the API before giving up (default 10).

    Returns
    -------
    str
        URL of the first web result, or ``"not found"`` when there is nothing
        to search for or the response contains no web page.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    requests.Timeout
        If the API does not answer within ``timeout`` seconds.
    """
    # A missing title is not a string, and a blank one would degrade the query
    # to a bare "site:" search whose top hit is unrelated to any association.
    if not isinstance(name, str) or not name.strip():
        return "not found"

    name = ftfy.fix_text(name)  # remove the "\" characters
    if "(" in name:
        # The Schwartz-Hearst algorithm splits the text into {abbreviation: text without abbreviation}.
        # It only works when the abbreviation is in parentheses and placed after the
        # unabbreviated name; it does not handle an abbreviation placed before the
        # name, nor one written without parentheses.
        pairs = schwartz_hearst.extract_abbreviation_definition_pairs(doc_text=name)
        if len(pairs) == 1:
            # Exactly one pair: search on the long form only.
            name = next(iter(pairs.values()))

    # inspired from https://github.com/Azure-Samples/cognitive-services-REST-api-samples/blob/master/python/Search/BingWebSearchv7.py
    search_term = f'{name} site:{site}'
    headers = {"Ocp-Apim-Subscription-Key": subscription_key}
    params = {"q": search_term, "textDecorations": True, "textFormat": "HTML", "mkt": "fr-FR"}
    # Without a timeout a single stalled connection would block its worker forever.
    response = requests.get(search_url, headers=headers, params=params, timeout=timeout)
    response.raise_for_status()
    search_results = response.json()

    # "webPages" may be absent, and its "value" list may be missing or empty.
    pages = search_results.get("webPages", {}).get("value") or []
    return pages[0]["url"] if pages else "not found"

In [5]:
# Look up every association title on facebook.com, one row at a time across the
# pandarallel workers, and keep what enrich returns in a new column.
df['facebook_url'] = df.parallel_apply(
    lambda record: enrich("facebook.com", record["titre"]),
    axis=1,
)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=357), Label(value='0 / 357'))), HB…

In [8]:
# Summarize the Facebook lookups; the "not found" string returned by enrich is
# counted like any other value.
df["facebook_url"].describe()

count          1071
unique          945
top       not found
freq             14
Name: facebook_url, dtype: object

In [9]:
# Show the first 100 Facebook URLs for a manual look at the results.
df["facebook_url"].head(100)

0     https://fr-fr.facebook.com/Association-Dor%C3%...
1     https://www.facebook.com/CCMagdeburg/videos/tr...
2     https://fr-fr.facebook.com/Institut-Universita...
3     https://www.facebook.com/AJCD-Association-des-...
4     https://fr-fr.facebook.com/Association-des-res...
                            ...                        
95           https://www.facebook.com/public/Adna-Bassa
96      https://www.facebook.com/atelierprintemps/posts
97    https://www.facebook.com/Solidarit%C3%A9-Pour-...
98             https://www.facebook.com/LeMontCameroun/
99    https://www.facebook.com/Amazones-Renaissance-...
Name: facebook_url, Length: 100, dtype: object

In [12]:
# Same lookup as for Facebook, this time restricted to helloasso.com; the result
# of enrich for each title is stored in its own column.
df['helloasso_url'] = df.parallel_apply(
    lambda record: enrich("helloasso.com", record["titre"]),
    axis=1,
)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=179), Label(value='0 / 179'))), HB…

In [14]:
# Summarize the HelloAsso lookups; the "not found" string returned by enrich is
# counted like any other value.
df["helloasso_url"].describe()

count          1071
unique          658
top       not found
freq             69
Name: helloasso_url, dtype: object

In [15]:
# Show the first 10 HelloAsso URLs for a manual look at the results.
df["helloasso_url"].head(10)

0    https://www.helloasso.com/associations/associa...
1    https://www.helloasso.com/associations/cercle-...
2    https://www.helloasso.com/associations/ladcc/c...
3    https://www.helloasso.com/associations/feiac-f...
4    https://www.helloasso.com/associations/union-d...
5    https://www.helloasso.com/associations/solidar...
6    https://www.helloasso.com/associations/entraid...
7    https://www.helloasso.com/associations/amitie-...
8    https://www.helloasso.com/associations/asso-me...
9    https://www.helloasso.com/associations/union-d...
Name: helloasso_url, dtype: object

In [16]:
# Check that the frame now carries the two new columns, facebook_url and helloasso_url.
df.head(2)

Unnamed: 0.1,Unnamed: 0,id,titre,objet,adrs_numvoie,adrs_typevoie,adrs_libvoie,adrs_codepostal,adrs_libcommune,siteweb,adrs,dept,region,social_object1_libelle,social_object2_libelle,longitude,latitude,facebook_url,helloasso_url
0,0,W751075369,ASSOCIATION NDOG-NÉM FRANCE ANNF,créer et favoriser les contacts le dialogue dv...,94,RUE,des Chataigniers,95100.0,Argenteuil,,94 RUE des Chataigniers 95100 Argenteuil,Val-d'Oise,Île-de-France,AMICALES/ GROUPEMENTS AFFINITAIRES/ GROUPEMENT...,AUTRES,2.252414,48.957136,https://fr-fr.facebook.com/Association-Dor%C3%...,https://www.helloasso.com/associations/associa...
1,1,W751101199,CERCLE D'ETUDES DES TRADITIONS ET DE LA CULTUR...,étudier les traditions et la culture camerouna...,148,AV,d'Italie,75013.0,Paris,,148 AV d'Italie 75013 Paris,Paris,Île-de-France,ASSOCIATIONS CARITATIVES/ HUMANITAIRES/ AIDE A...,AUTRES,2.358093,48.822629,https://www.facebook.com/CCMagdeburg/videos/tr...,https://www.helloasso.com/associations/cercle-...


In [17]:
# Save the enriched table. The URLs are raw top search hits, presumably still to be
# reviewed (the file name says "not-qualified").
# index=False keeps the row index out of the file, so reloading it does not add
# yet another "Unnamed: 0" column.
df.to_csv("ref-rna-real-mars-2022-enriched-not-qualified.csv", index=False)