In [None]:
def trouver_difference(str1, str2):
    """Return the words that appear in exactly one of the two strings.

    Both strings are split on whitespace and compared word by word (not
    character by character).  The result lists first the words found only in
    ``str1``, then the words found only in ``str2``, each group in order of
    first appearance and without repeats, joined by single spaces.

    Joining a ``set`` directly would make the word order depend on string
    hashing, which changes from one interpreter run to the next; building the
    result from the original word order keeps it deterministic.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A space-separated string of the differing words, or ``""`` when both
        strings contain the same set of words.
    """
    mots_str1 = str1.split()
    mots_str2 = str2.split()
    vus_str1 = set(mots_str1)
    vus_str2 = set(mots_str2)

    # dict.fromkeys drops repeated words while keeping first-seen order.
    seulement_str1 = dict.fromkeys(m for m in mots_str1 if m not in vus_str2)
    seulement_str2 = dict.fromkeys(m for m in mots_str2 if m not in vus_str1)

    return ' '.join([*seulement_str1, *seulement_str2])

# Usage examples.  The comparison is word-based, so a misspelt word shows up
# as two whole words (both spellings), not as the missing letters.
diff1 = trouver_difference("RTE DE BAPAUME", "ROUTE DE BAPAUME")
diff2 = trouver_difference("RUE DS TRIBUNAUX", "RUE DES TRIBUNAUX")
diff3 = trouver_difference("RUE FERDINAND DECARPENTRY", "RUE DECARPENTRY")

print(diff1)  # prints the two words RTE and ROUTE
print(diff2)  # prints the two words DS and DES
print(diff3)  # prints FERDINAND


In [None]:
# IMPORT
import pandas as pd
import random
from random import randint
from difflib import SequenceMatcher
from math import floor

In [None]:
# Load the input table whose street names are to be matched.
# NOTE(review): the file name has no extension; it is read as CSV — confirm
# the path is correct.
PATH = "df_avec_dist_lev_prop"
df = pd.read_csv(PATH,encoding="UTF8")
# Preview the first three rows.
df.head(3)

In [None]:
# Load the reference table (the file name suggests BAN geocoding data —
# confirm).  PATH is rebound here and no longer points at the first file.
PATH = "CEHDF_BAN_GEOCODAGE.csv"
df_b = pd.read_csv(PATH,encoding="UTF8")
# Preview the first three rows.
df_b.head(3)

In [None]:
# Helper functions: string-similarity measures and row-by-row matching
def lev(s, t):
    """Return the Levenshtein edit distance between ``s`` and ``t``.

    The distance is the minimum number of single-element deletions,
    insertions and substitutions needed to turn ``s`` into ``t``.
    """
    # previous[j] holds the distance between the items of s consumed so far
    # and the first j items of t; only two rows of the table are kept.
    previous = list(range(len(t) + 1))

    for i, item_s in enumerate(s, start=1):
        current = [i]
        for j, item_t in enumerate(t, start=1):
            deletion = previous[j] + 1
            insertion = current[j - 1] + 1
            substitution = previous[j - 1] + (0 if item_s == item_t else 1)
            current.append(min(deletion, insertion, substitution))
        previous = current

    return previous[len(t)]




# Jaro similarity of two strings.
def jaro_distance(s1, s2):
    """Return the Jaro similarity of ``s1`` and ``s2``.

    Despite the name, the value is a similarity: 1.0 for identical strings,
    0.0 when either string is empty or no characters match.

    Two equal characters count as a match only when their positions differ by
    at most ``max(len(s1), len(s2)) // 2 - 1``.  With ``m`` matches and ``t``
    transpositions the result is
    ``(m / len(s1) + m / len(s2) + (m - t) / m) / 3``.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        The similarity as a float.
    """
    if s1 == s2:
        return 1.0

    len1 = len(s1)
    len2 = len(s2)

    if len1 == 0 or len2 == 0:
        return 0.0

    # Maximum index gap allowed between two matching characters.
    max_dist = (max(len1, len2) // 2) - 1

    match = 0

    # Flags marking which positions of each string have been matched.
    hash_s1 = [0] * len1
    hash_s2 = [0] * len2

    for i in range(len1):
        # Take the first still-unmatched equal character inside the window.
        for j in range(max(0, i - max_dist), min(len2, i + max_dist + 1)):
            if s1[i] == s2[j] and hash_s2[j] == 0:
                hash_s1[i] = 1
                hash_s2[j] = 1
                match += 1
                break

    if match == 0:
        return 0.0

    # Walk both strings' matched characters in order and count the positions
    # where they disagree.
    t = 0
    point = 0
    for i in range(len1):
        if hash_s1[i]:
            # Advance to the next matched character of s2.
            while hash_s2[point] == 0:
                point += 1

            if s1[i] != s2[point]:
                t += 1
            point += 1

    # A transposition involves two out-of-order characters, so the count is
    # halved exactly once, after the scan.  Halving inside the loop would
    # shrink the running total on every iteration and inflate the score.
    t /= 2

    return (match / len1 + match / len2 + (match - t) / match) / 3.0

# Jaro-Winkler similarity of two strings.
def jaro_Winkler(s1, s2):
    """Return the Jaro-Winkler similarity of ``s1`` and ``s2``.

    Starts from ``jaro_distance(s1, s2)``.  When that value is above 0.7 it
    is raised by ``0.1 * p * (1 - value)``, where ``p`` is the length of the
    common prefix of the two strings, capped at 4.
    """
    similarity = jaro_distance(s1, s2)

    # No prefix bonus for pairs at or below the threshold.
    if similarity <= 0.7:
        return similarity

    # Length of the shared prefix.
    common_prefix = 0
    for char1, char2 in zip(s1, s2):
        if char1 != char2:
            break
        common_prefix += 1

    # At most four prefix characters contribute to the bonus.
    common_prefix = min(4, common_prefix)

    return similarity + 0.1 * common_prefix * (1 - similarity)

# Demo: runs when this code is executed as the main module.
if __name__ == "__main__":
    s1 = "TRATE"
    s2 = "TRACE"

    # Show the Jaro-Winkler similarity of the two sample words.
    print("Jaro-Winkler Similarity =", jaro_Winkler(s1, s2))

def lcs_dist(a,b):
    """Return the length of the longest contiguous block common to ``a`` and ``b``.

    Despite the name, this is a similarity (a larger value means the inputs
    share a longer run), not a distance.
    """
    matcher = SequenceMatcher(a=a, b=b)
    longest = matcher.find_longest_match(0, len(a), 0, len(b))
    return longest.size

def process_df(df, df_b, filter_conditions, score_func, proposition_col, minimize=None):
    """For every row of ``df``, store the best-scoring ``df_b`` street name.

    For each row, ``df_b`` is restricted with ``filter_conditions(df_b, row)``
    and every remaining ``'rue'`` value is scored against the row's ``'rue'``
    with ``score_func(candidate, row_rue)`` (both converted to ``str``).  The
    ``'rue'`` of the best candidate is written to ``df[proposition_col]``;
    when no candidate remains, the string ``"rien"`` is written instead.

    ``df`` is modified in place and also returned.  A progress line is
    printed every 10 rows.

    Args:
        df: Frame to annotate; must have a ``'rue'`` column.
        df_b: Reference frame; must have a ``'rue'`` column.
        filter_conditions: Callable ``(df_b, row) -> boolean mask`` over ``df_b``.
        score_func: Callable ``(str, str) -> number``.
        proposition_col: Name of the column receiving the proposal.
        minimize: ``True`` if the lowest score is best (a distance), ``False``
            if the highest is best (a similarity).  ``None`` (default) keeps
            the historical rule: minimise only when ``score_func`` is ``lev``.

    Returns:
        The same ``df`` object.
    """
    if minimize is None:
        # Historical behaviour: `lev` is the only function treated as a
        # distance.  Pass `minimize` explicitly for any other distance.
        minimize = score_func == lev

    for processed, (idx, row) in enumerate(df.iterrows(), start=1):
        candidates = df_b[filter_conditions(df_b, row)]
        if candidates.empty:
            df.at[idx, proposition_col] = "rien"
        else:
            target = str(row['rue'])
            scores = candidates['rue'].apply(lambda x: score_func(str(x), target))
            best_index = scores.idxmin() if minimize else scores.idxmax()
            df.at[idx, proposition_col] = candidates.loc[best_index, 'rue']

        if processed % 10 == 0:
            print(f"Processed {processed} rows")
    return df

def filter_conditions(df_b, row):
    """Return a boolean mask of the ``df_b`` rows whose 'numero', 'rep' and 'commune' all equal those of ``row``."""
    same_numero = df_b['numero'] == row['numero']
    same_rep = df_b['rep'] == row['rep']
    same_commune = df_b['commune'] == row['commune']
    return same_numero & same_rep & same_commune

def filter_conditions_com(df_b, row):
    """Return a boolean mask of the ``df_b`` rows whose 'commune' equals that of ``row``."""
    return df_b['commune'] == row['commune']

In [None]:
# Replace every missing value in df_b (in all columns) with the string '1',
# modifying the frame in place.
# NOTE(review): presumably aimed at one specific column; confirm that '1' is
# a sensible placeholder for every column that can contain NaN.
df_b.fillna('1', inplace=True)

In [None]:
# One row per distinct (commune, rue, libelle_commune) combination of df_b:
# grouping on all three selected columns collapses repeated combinations.
dataframe_BAN = df_b[['commune','rue','libelle_commune']].groupby(['commune','rue','libelle_commune']).first().reset_index()

In [None]:
# One row per distinct (commune, rue_x, libelle_commune) combination of df,
# so each input street is scored only once per commune.
df_to_compute = df[['commune','rue_x','libelle_commune']].groupby(['commune','rue_x','libelle_commune']).first().reset_index()

In [None]:
# Spot check: reference streets whose 'commune' value is 80650
# (presumably a commune code chosen as an example — confirm).
dataframe_BAN.loc[dataframe_BAN['commune'] == 80650]

In [None]:
# Candidate pairs: each distinct input street (rue_x) is paired with every
# reference street (rue) that has the same commune and libelle_commune.
# With how='left', input streets without any reference street are kept,
# with a missing 'rue'.
test = df_to_compute.merge(dataframe_BAN, on=['commune','libelle_commune'] , how='left')

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# rapidfuzz is included because this notebook imports it for the Levenshtein
# distance; textdistance is only referenced by disabled code.
%pip install textdistance rapidfuzz

In [None]:
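# Disabled alternative: Levenshtein distance computed with textdistance.
# NOTE(review): the merged frame `test` has no 'rue_y' column (the reference
# street column is 'rue'), so this would need updating before being re-enabled.
#from textdistance import levenshtein
#test['levenshtein_communes'] = test.apply(lambda x: levenshtein.distance(x['rue_x'], x['rue_y']), axis=1)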

In [None]:
# Replace every missing value in `test` (in all columns) with the string
# 'rien', in place, so the string-distance functions always receive a string.
test.fillna('rien', inplace=True)

In [None]:
# rapidfuzz provides the Levenshtein implementation applied to the candidate pairs.
from rapidfuzz.distance import Levenshtein
# Quick check of the API on a sample pair of words.
Levenshtein.distance("lewenstein", "levenshtein")

In [None]:
# Random subset of the candidate pairs, for quick experiments.
# DataFrame.sample raises ValueError when n exceeds the number of rows, so
# the size is capped; random_state makes the subset reproducible.
teste = test.sample(n=min(100000, len(test)), random_state=0)

In [None]:
# Levenshtein distance between each input street (rue_x) and its candidate
# reference street (rue).  Iterating over the two columns directly avoids
# building a Series for every row, which DataFrame.apply(axis=1) does.
test['lev_distance_communes'] = [
    Levenshtein.distance(rue_x, rue)
    for rue_x, rue in zip(test['rue_x'], test['rue'])
]

In [None]:
# IMPORT
import pandas as pd
import random
from random import randint
from difflib import SequenceMatcher
from math import floor

In [None]:
def lcs_dist(a,b):
    """Return the length of the longest contiguous block common to ``a`` and ``b``.

    Despite the name, this is a similarity (a larger value means the inputs
    share a longer run), not a distance.
    """
    matcher = SequenceMatcher(a=a, b=b)
    longest = matcher.find_longest_match(0, len(a), 0, len(b))
    return longest.size

In [None]:
# Longest-common-substring length between each input street (rue_x) and its
# candidate reference street (rue).  Iterating over the two columns directly
# avoids building a Series for every row, which DataFrame.apply(axis=1) does.
test['lcs_commune'] = [
    lcs_dist(rue_x, rue)
    for rue_x, rue in zip(test['rue_x'], test['rue'])
]

In [None]:
# For each (rue_x, commune, libelle_commune) group, the index label of the
# candidate row with the largest 'lcs_commune' value.
idx2 = test.groupby(['rue_x', 'commune','libelle_commune'])['lcs_commune'].idxmax()

In [None]:
# Rows of `test` selected by idx2: the best candidate per input street
# according to the longest-common-substring score.
df_resultat_lcs = test.loc[idx2]

In [None]:
# Display the LCS-based selection.
df_resultat_lcs

In [None]:


# Levenshtein-based selection, working on the candidate-pair frame `test`
# (columns used: 'rue_x', 'commune', 'libelle_commune', 'lev_distance_communes').

# Group by input street and commune, then take the index label of the row with the smallest 'lev_distance_communes' in each group.
idx = test.groupby(['rue_x', 'commune','libelle_commune'])['lev_distance_communes'].idxmin()

# Use those index labels to pull the corresponding rows out of `test`.
df_resultat = test.loc[idx]




In [None]:
# Copy the input street into 'rue_init', the name used as a merge key further down.
df_resultat_lcs['rue_init'] = df_resultat_lcs['rue_x']

In [None]:
# Copy the LCS-selected reference street into 'rue_lcs_com'.
df_resultat_lcs['rue_lcs_com'] = df_resultat_lcs['rue']

In [None]:
# List the available columns before selecting a subset.
df_resultat_lcs.columns

In [None]:
# Keep only the commune keys, the LCS score and the two street-name columns;
# 'rue_x' and 'rue' are dropped in favour of their renamed copies.
df_resultat_lcs = df_resultat_lcs[['commune',  'libelle_commune','lcs_commune', 'rue_init', 'rue_lcs_com']]

In [None]:
# Copy the input street into 'rue_init' so both selections share the same merge key.
df_resultat['rue_init'] = df_resultat['rue_x']

In [None]:
# Join the Levenshtein-based selection (left) with the LCS-based selection
# (right) on commune, libelle_commune and input street.
# `test` is rebound here: the candidate-pair frame is no longer available
# under this name, so the cells above cannot be re-run after this one.
test = df_resultat.merge(df_resultat_lcs, on= ['commune','libelle_commune','rue_init'] , how='left')

In [None]:
# 'rue' in the merged frame comes from the Levenshtein-based selection
# (the LCS frame no longer has a 'rue' column); copy it into 'rue_lev_com'.
test['rue_lev_com'] = test['rue']

In [None]:
# List the merged columns before selecting a subset.
test.columns

In [None]:
# Keep the keys, both scores and both proposed streets.
# 'lcs_commune_x' is the LCS score from the left (Levenshtein-selected) frame,
# 'lcs_commune_y' the one from the right (LCS-selected) frame.
test = test[['commune', 'libelle_commune', 'lev_distance_communes',
       'lcs_commune_x', 'rue_init', 'lcs_commune_y', 'rue_lcs_com',
       'rue_lev_com']]

In [None]:
# Give the original frame the same 'rue_init' key as the proposals.
df['rue_init'] = df['rue_x']

In [None]:
# List the columns of the original frame before selecting a subset.
df.columns

In [None]:
# Keep only the columns needed for the final export.
# 'lev_distance' is selected once: listing it twice yields two identically
# named columns holding the same data, which makes df['lev_distance'] return
# a DataFrame and gives ambiguous headers in the exported CSV.
# NOTE(review): if a second, different column was intended at that position,
# add it back under its real name.
df = df[['commune', 'libelle_commune', 'numero', 'rep',
         'rue_y', 'lev_distance', 'lcs', 'lcs_ratio1',
         'lcs_ratio2', 'jaro-winkler', 'score_min', 'rue_init']]

In [None]:
# Attach the commune-level proposals to every original row; rows without a
# matching proposal are kept (how='left') with missing proposal columns.
union = df.merge(test, on= ['commune','libelle_commune','rue_init'] , how='left')

In [None]:
# Display the final table.
union

In [None]:
# Write the final table to disk.
# NOTE(review): the row index is written as an extra first column because
# index=False is not passed — confirm that this is wanted.
union.to_csv("DF_FINAL.csv")