In [1]:
# Setup
import os
import pandas as pd
import re
import json
import threading
import spacy
from tqdm import tqdm

# Project root: the directory the notebook is run from (dirname of a bare
# file name is '', whose absolute path is the current working directory).
basedir = os.path.abspath(os.path.dirname("processing_data.ipynb"))
# Pass each path component separately so os.path.join inserts the right
# separator on every platform instead of a hard-coded Windows backslash.
datadir = os.path.join(basedir, 'data', 'enwiki20230820-stripped-json')

In [6]:
# Function to write JSON files to a folder
def writeFile(data, name, basedir=basedir):
    """Write ``data`` as indented JSON to ``<basedir>/data/<name>``.

    Args:
        data: Object that ``json.dump`` can serialize.
        name: File name inside the ``data`` folder.
        basedir: Directory that contains the ``data`` folder.
    """
    # Join the components separately rather than embedding a Windows-only
    # backslash, so the same location is addressed on every platform.
    file_path = os.path.join(basedir, 'data', name)
    with open(file_path, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=2)

def loadFile(name, basedir=basedir):
    """Load and return the JSON content of ``<basedir>/data/<name>``.

    Args:
        name: File name inside the ``data`` folder.
        basedir: Directory that contains the ``data`` folder.

    Returns:
        The object produced by ``json.load`` for that file.
    """
    # Join the components separately rather than embedding a Windows-only
    # backslash, so the same location is addressed on every platform.
    file_path = os.path.join(basedir, 'data', name)
    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)

def process_file(file_path, dict_list, landmarks, embeddings_landmarks, threshold=0.97):
    """Collect articles from one JSON-lines file whose title resembles a landmark.

    Every non-blank line of the file is parsed as a JSON object. Objects whose
    ``title`` is longer than four characters are run through the module-level
    ``nlp`` pipeline and compared against the landmark documents; the object
    is appended to ``dict_list`` as soon as one similarity score exceeds
    ``threshold``. Objects already present in ``dict_list`` are never added
    twice.

    Args:
        file_path: Path of the JSON-lines file to scan.
        dict_list: List that matching objects are appended to (mutated in place).
        landmarks: Collection of landmark names; only used to pair up with
            ``embeddings_landmarks`` position by position.
        embeddings_landmarks: Sequence of documents produced by ``nlp``, one per
            entry of ``landmarks``.
        threshold: Similarity score a title must exceed to be collected.

    Returns:
        The same ``dict_list`` object that was passed in.
    """
    # Open as UTF-8 explicitly: the recorded run used the platform default
    # (cp1252). NOTE(review): assumes the dump is UTF-8 encoded JSON - confirm.
    with open(file_path, 'r', encoding='utf-8') as file:
        print("Handling file: ", file_path)
        for line in tqdm(file):
            if not line.strip():
                # A blank line would make json.loads raise.
                continue
            info_dict = json.loads(line)
            title = info_dict.get('title', '')
            if len(title) <= 4 or info_dict in dict_list:
                # Too short to be meaningful, or already collected.
                continue

            # Parse the title once and compare it against every landmark once.
            doc1 = nlp(title)
            for _, landmark_doc in zip(landmarks, embeddings_landmarks):
                if doc1.similarity(landmark_doc) > threshold:
                    dict_list.append(info_dict)
                    # Further comparisons cannot change the outcome.
                    break
    return dict_list

def process_folder(folder, dict_list, landmarks, embeddings_landmarks, debug, datadir=datadir):
    """Run ``process_file`` on every file of one sub-folder of ``datadir``.

    Args:
        folder: Name of the sub-folder inside ``datadir``.
        dict_list: List that collects the matching objects.
        landmarks: Collection of landmark names, forwarded to ``process_file``.
        embeddings_landmarks: Landmark documents, forwarded to ``process_file``.
        debug: When truthy, print a progress line after each file.
        datadir: Directory that contains ``folder``.

    Returns:
        The list returned by the last ``process_file`` call, or ``dict_list``
        unchanged when the folder is empty.
    """
    folder_path = os.path.join(datadir, folder)
    # List the directory once so the count and the iteration always agree;
    # sort because os.listdir returns entries in arbitrary order.
    filenames = sorted(os.listdir(folder_path))
    num_files = len(filenames)
    for file_nr, filename in enumerate(filenames, start=1):
        file_path = os.path.join(folder_path, filename)

        # Files are handled one after another in this process.
        dict_list = process_file(file_path, dict_list, landmarks, embeddings_landmarks)

        if debug:
            print(f"{file_nr}/{num_files} - Finished processing '{filename}' in folder '{folder}'")
    return dict_list



In [4]:
# Load the landmark names; path components are joined separately so the
# location resolves on every platform, not only with Windows backslashes.
df_unesco = pd.read_csv(os.path.join(basedir, 'data', 'unesco_names.csv'))
landmarks = set(df_unesco['name_en'].to_list())

# GPU support
# spacy.require_gpu()
# nlp = spacy.load("en_core_web_sm", disable=["ner"])

# CPU support
# NOTE(review): the recorded output shows a warning on the similarity call;
# presumably spaCy reporting that this small model has no word vectors, which
# makes similarity scores less reliable - confirm and consider a model that
# ships vectors.
nlp = spacy.load("en_core_web_sm")

# One parsed document per landmark, in the iteration order of `landmarks`.
embeddings_landmarks = [nlp(landmark) for landmark in tqdm(landmarks)]

100%|██████████| 1157/1157 [00:05<00:00, 209.57it/s]


In [7]:
debug = True

list_of_dicts = []

# Process the sub-folders of the dump one after another, in sorted order so
# that repeated runs visit them identically (os.listdir order is arbitrary).
for folder in sorted(os.listdir(datadir)):
    print(f"----------------------\nStart processing folder '{folder}'")
    # Pass the very set the embeddings were built from: a copy made with
    # set(...) is not guaranteed to iterate in the same order.
    list_of_dicts = process_folder(folder, list_of_dicts, landmarks, embeddings_landmarks, debug)
    print(f"----------------------\nFinished processing folder '{folder}'")

# Persist everything that was collected across all folders.
writeFile(list_of_dicts, name='embedding_test.json')

----------------------
Start processing folder 'AA'
Handling file:  <_io.TextIOWrapper name='c:\\Users\\Beheerder\\Documents\\Prive\\University\\Year2\\Q1\\Text Mining\\assignment\\Text-Mining\\data\\enwiki20230820-stripped-json\\AA\\wiki_00' mode='r' encoding='cp1252'>


  similarity_score = doc1.similarity(embeddings_landmarks[i])


In [15]:
import spacy
from tqdm import tqdm

# Function to write JSON files to a folder
def writeFile(data, name, basedir=basedir):
    """Write ``data`` as indented JSON to ``<basedir>/data/<name>``.

    Args:
        data: Object that ``json.dump`` can serialize.
        name: File name inside the ``data`` folder.
        basedir: Directory that contains the ``data`` folder.
    """
    # Join the components separately rather than embedding a Windows-only
    # backslash, so the same location is addressed on every platform.
    file_path = os.path.join(basedir, 'data', name)
    with open(file_path, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=2)

def loadFile(name, basedir=basedir):
    """Load and return the JSON content of ``<basedir>/data/<name>``.

    Args:
        name: File name inside the ``data`` folder.
        basedir: Directory that contains the ``data`` folder.

    Returns:
        The object produced by ``json.load`` for that file.
    """
    # Join the components separately rather than embedding a Windows-only
    # backslash, so the same location is addressed on every platform.
    file_path = os.path.join(basedir, 'data', name)
    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)

def process_folder(folder, dict_list, landmarks, debug, datadir=datadir, embeddings_landmarks=None):
    """Run ``process_file`` on every file of one sub-folder of ``datadir``.

    NOTE: this cell redefines ``process_folder`` with a different parameter
    list than the definition in the earlier cell of this notebook; running it
    replaces that definition.

    Args:
        folder: Name of the sub-folder inside ``datadir``.
        dict_list: List that collects the matching objects (mutated in place).
        landmarks: Collection of landmark names.
        debug: When truthy, print a progress line after each file.
        datadir: Directory that contains ``folder``.
        embeddings_landmarks: Parsed landmark documents, one per entry of
            ``landmarks``. When omitted they are built here with the
            module-level ``nlp`` pipeline.

    Returns:
        The ``dict_list`` object that was passed in.
    """
    if embeddings_landmarks is None:
        # process_file needs the parsed landmarks as its fourth argument;
        # build them in the iteration order of `landmarks` when not supplied.
        embeddings_landmarks = [nlp(landmark) for landmark in landmarks]

    folder_path = os.path.join(datadir, folder)
    # List the directory once so the count and the iteration always agree;
    # sort because os.listdir returns entries in arbitrary order.
    filenames = sorted(os.listdir(folder_path))
    num_files = len(filenames)
    for file_nr, filename in enumerate(filenames, start=1):
        file_path = os.path.join(folder_path, filename)

        # Call directly: a thread that is joined right after start() adds no
        # concurrency and keeps exceptions from reaching the caller.
        process_file(file_path, dict_list, landmarks, embeddings_landmarks)

        if debug:
            print(f"{file_nr}/{num_files} - Finished processing '{filename}' in folder '{folder}'")
    return dict_list

# Candidate articles produced by the first filtering stage.
data = loadFile(name='first_stage.json')
# Path components are joined separately so the location resolves on every
# platform, not only with Windows backslashes.
df_unesco = pd.read_csv(os.path.join(basedir, 'data', 'unesco_names.csv'))
landmarks = set(df_unesco['name_en'].to_list())

spacy.require_gpu()
# NOTE(review): the recorded output shows a warning on the similarity call;
# presumably spaCy reporting that this small model has no word vectors, which
# makes similarity scores less reliable - confirm.
nlp = spacy.load("en_core_web_sm", disable=["ner"])

interesting_items = []

# Only titles longer than four characters are compared.
to_test = [item for item in data if len(item.get('title', '')) > 4]

# One parsed document per landmark, in the iteration order of `landmarks`.
embeddings_landmarks = []
for landmark in tqdm(landmarks):
    doc2 = nlp(landmark)
    embeddings_landmarks.append(doc2)

for item in tqdm(to_test):
    if item in interesting_items:
        # Already accepted; comparing it again could not add anything.
        continue
    doc1 = nlp(item['title'])
    # Pair each name with its document directly instead of indexing by position.
    for landmark, landmark_doc in zip(landmarks, embeddings_landmarks):
        similarity_score = doc1.similarity(landmark_doc)

        if similarity_score > 0.97:
            interesting_items.append(item)
            if similarity_score < 1:
                print(f"Found significant similarity between {item['title']} and {landmark} with score {similarity_score}")
            # The item is stored; the remaining landmarks cannot change that.
            break
            

100%|██████████| 1157/1157 [00:08<00:00, 135.47it/s]
  similarity_score = doc1.similarity(embeddings_landmarks[i])
 16%|█▌        | 494/3134 [02:11<12:09,  3.62it/s]

Found significant similarity between Yosemite National Park and Lorentz National Park with score 0.9713661670684814


 18%|█▊        | 550/3134 [02:26<12:09,  3.54it/s]

Found significant similarity between Doñana National Park and Simien National Park with score 0.9738708734512329


 19%|█▊        | 581/3134 [02:34<11:17,  3.77it/s]


KeyboardInterrupt: 

In [12]:
# Display the full set of landmark names for inspection.
landmarks

{'Churches of Chiloé',
 'Archaeological Site of Mystras',
 'Sanganeb Marine National Park and Dungonab Bay – Mukkawar Island Marine National Park',
 'Seventeenth-Century Canal Ring Area of Amsterdam inside the Singelgracht',
 "Place Stanislas Place de la Carrière and Place d'Alliance in Nancy",
 'Town of Luang Prabang',
 'Prambanan Temple Compounds',
 'Jeju Volcanic Island and Lava Tubes',
 'Verla Groundwood and Board Mill',
 'Fanjingshan',
 'Archaeological Sites of Bat Al-Khutm and Al-Ayn',
 'Historic Centre of Lima',
 'Curonian Spit',
 'Imperial Palaces of the Ming and Qing Dynasties in Beijing and Shenyang',
 'Monticello and the University of Virginia in Charlottesville',
 'Wieliczka and Bochnia Royal Salt Mines',
 'Historic District of Old Québec',
 "Jewish Quarter and St Procopius' Basilica in Třebíč",
 'Jelling Mounds Runic Stones and Church',
 'Christiansfeld a Moravian Church Settlement',
 "Nelson's Dockyard",
 'Historic Mosque City of Bagerhat',
 'Red Bay Basque Whaling Statio

In [None]:
# Save the items accepted by the similarity filter as JSON in the data folder.
writeFile(name='interesting.json', data=interesting_items)

In [57]:
# count = 0
# title_split = item['title'].split()
# for word in title_split:
#     if word in landmark and len(word) > 1:
#         # print(f"Found {word} in {landmark}")
#         count += 1
# if (
#     (count > 3 or (count == 1 and len(landmark.split()) == 1) or (count == 2 and len(landmark.split()) == 2)) 
#     and item not in interesting_list
#     ):

In [59]:
# Print the title of every collected item.
# NOTE(review): `interesting_list` is not defined in the visible cells (the
# similarity cell builds `interesting_items`); presumably it comes from an
# earlier or deleted cell - confirm before relying on a fresh-kernel run.
for item in interesting_list:
    print(item['title'])

Ajanta Caves
Classical Gardens of Suzhou
Per
Phoenix Islands Protected Area
Shushtar Historical Hydraulic System
Ephesus
Central Highlands of Sri Lanka
Le Corbusier
Leonardo da Vinci
Leto
Cave of Altamira and Paleolithic Cave Art of Northern Spain
Masada
Pimachiowin Aki
Coro and its Port
Pet
Rio de Janeiro
Robben Island
Royal Botanic Gardens, Kew
Shire
San Marino
Santiago de Compostela
Zuojiang Huashan Rock Art
Xantho
Surtsey
Tax
M'banza-Kongo
Djoudj National Bird Sanctuary
Fan
Ur
Catalan Romanesque Churches of the Vall de Boí
Vatican City
Royal Palaces of Abomey
Namhan
Timbuk
Hidden Christian Sites in the Nagasaki Region
Choir
Petra
San Millán
Grand Canyon National Park
San Francisco
Saloum Delta
Residence of Bukovinian and Dalmatian Metropolitans
Taos Pueblo
San Antonio
Le Havre
Historic Monuments of Novgorod and Surroundings
Lake Baikal
Baal
Baalbek
Historic Centre of Naples
Susa
Church of the Nativity
Wadden Sea
Mammoth Cave National Park
San Juan
Mausoleum of the First Qin Emperor