In [10]:
import pandas as pd
import os
import glob
from difflib import SequenceMatcher

In [27]:
keep_col = ['Titre', 'URL']

In [28]:
folder_path = '/Users/carboni/Documents/UNIGE/pynotebook/sourcesVC/Original/VC_original'

In [29]:
all_files = glob.glob(folder_path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0, delimiter=',', low_memory=False)
    df = df[keep_col]
    df['source'] = filename.split("/")[-1]
    li.append(df)

df_merged = pd.concat(li, axis=0, ignore_index=True)

In [30]:
df_merged.drop_duplicates(subset="Titre", keep=False, inplace=True)

In [32]:
df_merged.reset_index()

Unnamed: 0,index,Titre,URL,source
0,85,Elle Finland,https://coversarchive.wixsite.com/gallery/elle...,celine.csv
1,144,Elle Hungary,https://coversarchive.wixsite.com/gallery/elle...,celine.csv
2,159,Elle Indonesia,https://coversarchive.wixsite.com/gallery/elle...,celine.csv
3,415,Elle Vietnam,https://coversarchive.wixsite.com/gallery/elle...,celine.csv
4,3139,Antártica: preocupación de 5 continentes,http://www.memoriachilena.gob.cl/602/w3-proper...,celine.csv
...,...,...,...,...
548,41191,Almanaque etimologico y poetico,http://bdh.bne.es/bnesearch/CompleteSearch.do?...,celine_1.csv
549,41192,"Almanaque festivo (Madrid, 1876)",http://bdh.bne.es/bnesearch/CompleteSearch.do?...,celine_1.csv
550,41193,Almanaque franciscano,http://bdh.bne.es/bnesearch/CompleteSearch.do?...,celine_1.csv
551,41194,Almanaque literario y ilustrado,http://bdh.bne.es/bnesearch/CompleteSearch.do?...,celine_1.csv


In [33]:
current = pd.read_csv('/Users/carboni/Documents/UNIGE/pynotebook/sourcesVC/Original/Corpuscombined light Oct23.csv')

In [34]:
current.head()

Unnamed: 0,Media URL,City,Country,wkt,normalized_date,Title,Journal Type,source
0,https://iiif.unige.ch/dhportal/ug27808014/mani...,Amsterdam,Netherlands,POINT(4.9 52.383333333333),1973-01-01,Kontekst,Illustrated Magazine,Adrien iaddb data iiif(UNIGE).csv
1,https://iiif.unige.ch/dhportal/ug27803682/mani...,Paris,France,POINT(2.352222222222222 48.85666666666667),1950-01-01,Art d’Aujourd’hui,Avant-garde Journal,Adrien iaddb data iiif(UNIGE).csv
2,https://iiif.unige.ch/dhportal/ug27817654/mani...,London,United Kingdom,POINT(-0.1275 51.507222222222),1923-01-01,Commercial Art - Art and Industry,Illustrated Magazine,Adrien iaddb data iiif(UNIGE).csv
3,https://iiif.unige.ch/dhportal/ug27803680/mani...,Amsterdam,Netherlands,POINT(4.9 52.383333333333),1967-01-01,Art Directors Annual,Illustrated Magazine,Adrien iaddb data iiif(UNIGE).csv
4,https://iiif.unige.ch/dhportal/ug27811437/mani...,Geneva,Switzerland,POINT(6.15 46.2),1947-01-01,Publicité et Arts graphiques,Illustrated Magazine,Adrien iaddb data iiif(UNIGE).csv


In [35]:
def get_similarity_score(s1, s2):
    return SequenceMatcher(None, s1, s2).ratio()

def categorize_match(score):
    if score == 1:
        return "match"
    elif score > 0.9:
        return "very similar"
    elif score > 0.8:
        return "quite similar"
    else:
        return "no match"

matches = []

for index, row in df_merged.iterrows():
    title_merged = row['Titre']
    for _, curr_row in current.iterrows():
        title_current = curr_row['Title']
        similarity_score = get_similarity_score(title_merged, title_current)
        match_category = categorize_match(similarity_score)
        
        if match_category != "no match":
            matches.append((title_merged, title_current, match_category, similarity_score))

In [36]:
with open('matches.txt', 'w') as file:
    for match in matches:
        file.write(f"{match[0]} | {match[1]} | {match[2]} | {match[3]}\n")

In [37]:
matched_titles = [match[0] for match in matches]

# Find titles without matches
unmatched_df = df_merged[~df_merged['Titre'].isin(matched_titles)]

# Print and visualize the unmatched titles along with their source
print("Unmatched Titles:")
print(unmatched_df[['Titre', 'source', 'URL']])

Unmatched Titles:
                                    Titre        source  \
85                           Elle Finland    celine.csv   
144                          Elle Hungary    celine.csv   
159                        Elle Indonesia    celine.csv   
415                          Elle Vietnam    celine.csv   
3146                                Ecran    celine.csv   
...                                   ...           ...   
41191     Almanaque etimologico y poetico  celine_1.csv   
41192    Almanaque festivo (Madrid, 1876)  celine_1.csv   
41193               Almanaque franciscano  celine_1.csv   
41194     Almanaque literario y ilustrado  celine_1.csv   
41195  Almanaque literario (Madrid, 1935)  celine_1.csv   

                                                     URL  
85     https://coversarchive.wixsite.com/gallery/elle...  
144    https://coversarchive.wixsite.com/gallery/elle...  
159    https://coversarchive.wixsite.com/gallery/elle...  
415    https://coversarchive.wixsite.

In [38]:
unmatched_df.to_csv('unmatched_titles.csv', index=False)