In [123]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

from unidecode import unidecode

import requests
import json

import datetime

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from kiblib.utils.db import DbConn
from kiblib.utils.code2libelle import Code2Libelle
from kiblib.adherent import Adherent

In [124]:
def texte_comp(input, referentiel):
    i = len(input)
    chaines = referentiel + input

    vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='char_wb', ngram_range=(3,3))
    X = vectorizer.fit_transform(chaines)

    s = cosine_similarity(X[-i:], X[:-i])
    res = [referentiel[idx] for idx in np.argmax(s, axis = 1)]

    return res

In [125]:
def mod_borrower(userid, data):
    api_url = "http://cataloguekoha.ntrbx.local/cgi-bin/koha/rest.pl/user"
    url = f"{api_url}/{userid}"
    data = json.dumps(data)
    data2mod = f"dhttp://localhost:8889/notebooks/kibini2/kibini/adm_analyse_borrowers.ipynb#ata={data}"
    print(data2mod)
    response = requests.put(url, data=data2mod)
    print(f"{userid} : {response.content}")

In [126]:
db_conn = DbConn().create_engine()

c2l = Code2Libelle(db_conn)
c2l.get_val()
c2l = c2l.dict_codes_lib

In [127]:
query = "SELECT * from statdb.iris"
ref_adresses = pd.read_sql(query, con=db_conn)

In [128]:
villes_fr = pd.read_excel("../data/villes_france_wikidata.xlsx")
villes_be = pd.read_excel("../data/villes_belgique_wikidata.xlsx")

In [129]:
adh = pd.read_json("http://cataloguekoha.ntrbx.local/cgi-bin/koha/rest.pl/user")

In [130]:
adh.columns

Index(['date_renewed', 'B_address', 'dateofbirth', 'B_zipcode', 'opacnote',
       'altcontactaddress3', 'password', 'sort2', 'fax', 'B_address2',
       'altcontactsurname', 'debarred', 'emailpro', 'branchcode', 'address',
       'login_attempts', 'autorenew_checkouts', 'dateenrolled', 'phone',
       'altcontactfirstname', 'borrowernotes', 'B_email', 'country',
       'B_streetnumber', 'address2', 'altcontactaddress2', 'sms_provider_id',
       'sex', 'lang', 'relationship', 'altcontactzipcode', 'othernames',
       'lastseen', 'state', 'privacy', 'altcontactstate', 'streettype',
       'categorycode', 'email', 'contactfirstname', 'dateexpiry',
       'streetnumber', 'privacy_guarantor_fines', 'cardnumber',
       'checkprevcheckout', 'updated_on', 'primary_contact_method', 'userid',
       'flags', 'overdrive_auth_token', 'lost', 'altcontactcountry',
       'contactnote', 'B_phone', 'B_country', 'altcontactaddress1',
       'debarredcomment', 'contactname', 'borrowernumber',
       

In [131]:
Adh = Adherent(df=adh, db_conn=db_conn, c2l=c2l)

In [132]:
Adh.get_adherent_statdb_data()
Adh.get_adherent_es_data()

In [133]:
adherents = Adh.df

## Vérification des adresses : alignement avec le référentiel Roubaix

In [134]:
rbx = adherents[adherents['adh_geo_gentilite'] == 'Roubaisien']

In [139]:
rbx_adr = rbx[['userid', 'borrowernumber', 'cardnumber', 'categorycode', 'address',
       'address2', 'city', 'state', 'zipcode', 'country', 'altcontactcountry', 'dateexpiry']][rbx['categorycode'].isin(['CSVT', 'BIBL', 'MEDA', 'MEDB', 'MEDC'])]

In [140]:
rbx_adr = rbx_adr.merge(ref_adresses, how='left', left_on='address', right_on='adresse')

In [141]:
rbx_adr['statut'] = False
rbx_adr.loc[rbx_adr['id_cicn2'].notna(), 'statut'] = True

In [142]:
rbx_adr['statut'].value_counts(normalize=True)

True     0.966964
False    0.033036
Name: statut, dtype: float64

In [143]:
rbx_adr_ko = rbx_adr[rbx_adr['statut'] == False]
rbx_adr_ko = rbx_adr_ko[['userid', 'borrowernumber', 'cardnumber', 'categorycode', 'address',
       'address2', 'city', 'state', 'zipcode', 'country', 'altcontactcountry', 'dateexpiry']]

### Premier passage : on nettoie l'adresse pour essayer de réaligner

In [144]:
rbx_adr_ko['clean_address'] = rbx_adr_ko['address']
rbx_adr_ko['clean_address'] = rbx_adr_ko['clean_address'].str.upper()
rbx_adr_ko['clean_address'] = rbx_adr_ko['clean_address'].apply(lambda x: unidecode(x))
rbx_adr_ko['clean_address'] = rbx_adr_ko['clean_address'].str.replace("'|\/|\s+", " ", regex=True)

In [145]:
rbx_adr_ko.columns

Index(['userid', 'borrowernumber', 'cardnumber', 'categorycode', 'address',
       'address2', 'city', 'state', 'zipcode', 'country', 'altcontactcountry',
       'dateexpiry', 'clean_address'],
      dtype='object')

In [146]:
rbx_adr_ko_cleaned = rbx_adr_ko.merge(ref_adresses, how='left', left_on='clean_address', right_on='adresse')
rbx_adr_ko_cleaned['statut'] = False
rbx_adr_ko_cleaned.loc[rbx_adr_ko_cleaned['id_cicn2'].notna(), 'statut'] = True
rbx_adr_ko_cleaned['statut'].value_counts(normalize=True)

False    0.995
True     0.005
Name: statut, dtype: float64

In [147]:
rbx_adr_ko_cleaned.columns

Index(['userid', 'borrowernumber', 'cardnumber', 'categorycode', 'address',
       'address2', 'city', 'state', 'zipcode', 'country', 'altcontactcountry',
       'dateexpiry', 'clean_address', 'adresse', 'id_cicn2', 'irisInsee',
       'streetid', 'streetlabel', 'streetnumber', 'streetsuffix', 'origine',
       'statut'],
      dtype='object')

### On injecte les adresses corrigées

In [149]:
for b in rbx_adr_ko_cleaned[rbx_adr_ko_cleaned['statut'] == True].to_dict(orient='records'):
    data = {"address": b["clean_address"], "state": b["id_cicn2"], "altcontactcountry": b["irisInsee"]}
    mod_borrower(b["userid"], data)

data={"address": "24 RUE PAYEN", "state": "2664", "altcontactcountry": "595120803"}
X0002262697 : b'{\n   "modified_fields" : {},\n   "success" : true\n}\n'
data={"address": "24 RUE PAYEN", "state": "2664", "altcontactcountry": "595120803"}
X0002508825 : b'{\n   "success" : true,\n   "modified_fields" : {}\n}\n'
data={"address": "24 RUE PAYEN", "state": "2664", "altcontactcountry": "595120803"}
X0002508832 : b'{\n   "modified_fields" : {},\n   "success" : true\n}\n'


### 2e passage : similarité cosine

In [150]:
rbx_adr_ko2 = rbx_adr_ko_cleaned[rbx_adr_ko_cleaned['statut'] == False]

In [151]:
address_prop = texte_comp(rbx_adr_ko2['clean_address'].tolist(), ref_adresses['adresse'].tolist())

In [152]:
rbx_adr_ko2['address_prop'] = address_prop

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rbx_adr_ko2['address_prop'] = address_prop


In [154]:
rbx_adr_ko2[['address', 'address_prop']]

Unnamed: 0,address,address_prop
0,4 A AVENUE DES COTTAGES,4 AVENUE DES COTTAGES
1,A 5 Allée Maurice Maertens,0 A ALLEE MAURICE MAERTENS
2,42 RUE HOCHE,2 RUE HOCHE
3,RUE D YPRES,2 RUE D YPRES
4,29 BIS RUE DE SEBASTOPOL,29 RUE DE SEBASTOPOL
...,...,...
595,63 D Allée des Saules,74 D ALLEE DES SAULES
596,10 C 39 Mail Notre Dame,10 MAIL NOTRE DAME
597,10 C 39 Mail Notre Dame,10 MAIL NOTRE DAME
598,86 RUE HENRI CARRETTE,86 RUE HENRI CARETTE


In [155]:
rbx_adr_ko2.to_excel("test.xlsx", index=False)

In [None]:
data={'address': '2 rue Pierre Motte', 'city':'Roubaix'}