In [45]:
import os
from tika import parser
import re
import pandas as pd
import requests

In [46]:
directory_path = '../../Tika_GeoTopic_Parser/Haunted_Places_Text_Files'
tsv_path = '../../data/haunted_places_entities.tsv'
output_path = '../../data/haunted_places_description.tsv'
gazetteer_url = "http://localhost:8765/api/search"

In [47]:
df_master_dataset = pd.read_csv(tsv_path, sep='\t', engine='python')
df_master_dataset

Unnamed: 0,city,country,description,location,state,state_abbrev,longitude,latitude,city_longitude,city_latitude,Entity Labels,Entity Texts
0,Ada,United States,Ada witch - Sometimes you can see a misty blue...,Ada Cemetery,Michigan,MI,-85.504893,42.962106,-85.495480,42.960727,"['ORG', 'QUANTITY', 'FAC', 'ORG', 'GPE', 'TIME...","['Ada witch -', '3-mile', 'the Ada Cemetery', ..."
1,Addison,United States,A little girl was killed suddenly while waitin...,North Adams Rd.,Michigan,MI,-84.381843,41.971425,-84.347168,41.986434,"['DATE', 'DATE']","['in.1 month later', 'this day']"
2,Adrian,United States,If you take Gorman Rd. west towards Sand Creek...,Ghost Trestle,Michigan,MI,-84.035656,41.904538,-84.037166,41.897547,"['FAC', 'GPE', 'CARDINAL', 'CARDINAL', 'TIME',...","['Gorman Rd', 'Sand Creek', 'one', 'one', 'Lat..."
3,Adrian,United States,"In the 1970's, one room, room 211, in the old ...",Siena Heights University,Michigan,MI,-84.017565,41.905712,-84.037166,41.897547,"['DATE', 'CARDINAL', 'CARDINAL', 'DATE', 'CARD...","['1970', 'one', '211', 'today', 'one', 'two', ..."
4,Albion,United States,Kappa Delta Sorority - The Kappa Delta Sororit...,Albion College,Michigan,MI,-84.745177,42.244006,-84.753030,42.243097,"['ORG', 'CARDINAL']",['Kappa Delta Sorority - The Kappa Delta Soror...
...,...,...,...,...,...,...,...,...,...,...,...,...
10987,Westminster,United States,at 12 midnight you can see a lady with two lit...,city hall,Colorado,CO,-105.048936,39.862610,-105.037205,39.836653,"['TIME', 'CARDINAL', 'PERSON']","['12 midnight', 'two', 'Sheridan St.']"
10988,Westminster,United States,Is haunted by the victims of a murder that hap...,Pillar of Fire,Colorado,CO,-105.032091,39.847237,-105.037205,39.836653,['DATE'],['years ago']
10989,Wheat Ridge,United States,The institution was for kids 18 years old and ...,Ridge Mental Institution,Colorado,CO,-105.063974,39.769726,-105.077206,39.766098,"['DATE', 'DATE', 'CARDINAL', 'CARDINAL']","['18 years old', '70', 'one', 'hundreds']"
10990,Wheat Ridge,United States,Gymnasium - their have been reports of a litt...,Wheat Ridge Middle School,Colorado,CO,-105.103613,39.764055,-105.077206,39.766098,,


In [48]:
# Loop through each row
for index, row in df_master_dataset.iterrows():
    description = str(row["description"]).strip()
    location = str(row["location"])

    filename = f"Haunted_Places_{index}.txt"
    filepath = os.path.join(directory_path, filename)

    # Write description to file
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(f"Location: {location}\n\n")
        f.write(f"Description:\n{description}")



    try:
        # Use Tika's GeoTopic parser
        parsed = parser.from_file(filepath, headers={"Content-Type": "application/geotopic"})
        metadata = parsed.get("metadata", {})

        name = metadata.get("Geographic_NAME", "NaN")
        lat = metadata.get("Geographic_LATITUDE", "NaN")
        lon = metadata.get("Geographic_LONGITUDE", "NaN")

        df_master_dataset.loc[index, "GeoTopic Name"] = name
        df_master_dataset.loc[index, "GeoTopic Latitude"] = lat
        df_master_dataset.loc[index, "GeoTopic Longitude"] = lon

        print(f"[✓] Processed index {index}: ({name}, {lat}, {lon})")

    except Exception as e:
        df_master_dataset.loc[index, "GeoTopic Name"] = "NaN"
        df_master_dataset.loc[index, "GeoTopic Latitude"] = "NaN"
        df_master_dataset.loc[index, "GeoTopic Longitude"] = "NaN"
        print(f"[X] Failed to process index {index}: {e}")

[✓] Processed index 0: (NaN, NaN, NaN)
[✓] Processed index 1: (NaN, NaN, NaN)
[✓] Processed index 2: (NaN, NaN, NaN)
[✓] Processed index 3: (Sterling Heights, 42.58031, -83.0302)
[✓] Processed index 4: (NaN, NaN, NaN)
[✓] Processed index 5: (NaN, NaN, NaN)
[✓] Processed index 6: (Rogue River, 42.43595, -123.172)
[✓] Processed index 7: (Harsens Island, 42.58948, -82.58852)
[✓] Processed index 8: (NaN, NaN, NaN)
[✓] Processed index 9: (NaN, NaN, NaN)
[✓] Processed index 10: (NaN, NaN, NaN)
[✓] Processed index 11: (NaN, NaN, NaN)
[✓] Processed index 12: (NaN, NaN, NaN)
[✓] Processed index 13: (NaN, NaN, NaN)
[✓] Processed index 14: (Parish of Saint Ann, 18.35, -77.26667)
[✓] Processed index 15: (NaN, NaN, NaN)
[✓] Processed index 16: (Kanton Basel-Landschaft, 47.50438, 7.70444)
[✓] Processed index 17: (NaN, NaN, NaN)
[✓] Processed index 18: (The Other Palace, 51.49889, -0.14216)
[✓] Processed index 19: (NaN, NaN, NaN)
[✓] Processed index 20: (NaN, NaN, NaN)
[✓] Processed index 21: (NaN, N

In [49]:
# Step 4: Save output to TSV
df_master_dataset.to_csv(output_path, sep="\t", index=False, encoding="utf-8")
print(f"\n Saved updated dataset to: {output_path}")


 Saved updated dataset to: ../../data/haunted_places_description.tsv


In [50]:
df_new = pd.read_csv(output_path, sep='\t', engine='python')
df_new

Unnamed: 0,city,country,description,location,state,state_abbrev,longitude,latitude,city_longitude,city_latitude,Entity Labels,Entity Texts,GeoTopic Name,GeoTopic Latitude,GeoTopic Longitude
0,Ada,United States,Ada witch - Sometimes you can see a misty blue...,Ada Cemetery,Michigan,MI,-85.504893,42.962106,-85.495480,42.960727,"['ORG', 'QUANTITY', 'FAC', 'ORG', 'GPE', 'TIME...","['Ada witch -', '3-mile', 'the Ada Cemetery', ...",,,
1,Addison,United States,A little girl was killed suddenly while waitin...,North Adams Rd.,Michigan,MI,-84.381843,41.971425,-84.347168,41.986434,"['DATE', 'DATE']","['in.1 month later', 'this day']",,,
2,Adrian,United States,If you take Gorman Rd. west towards Sand Creek...,Ghost Trestle,Michigan,MI,-84.035656,41.904538,-84.037166,41.897547,"['FAC', 'GPE', 'CARDINAL', 'CARDINAL', 'TIME',...","['Gorman Rd', 'Sand Creek', 'one', 'one', 'Lat...",,,
3,Adrian,United States,"In the 1970's, one room, room 211, in the old ...",Siena Heights University,Michigan,MI,-84.017565,41.905712,-84.037166,41.897547,"['DATE', 'CARDINAL', 'CARDINAL', 'DATE', 'CARD...","['1970', 'one', '211', 'today', 'one', 'two', ...",Sterling Heights,42.58031,-83.0302
4,Albion,United States,Kappa Delta Sorority - The Kappa Delta Sororit...,Albion College,Michigan,MI,-84.745177,42.244006,-84.753030,42.243097,"['ORG', 'CARDINAL']",['Kappa Delta Sorority - The Kappa Delta Soror...,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10987,Westminster,United States,at 12 midnight you can see a lady with two lit...,city hall,Colorado,CO,-105.048936,39.862610,-105.037205,39.836653,"['TIME', 'CARDINAL', 'PERSON']","['12 midnight', 'two', 'Sheridan St.']",,,
10988,Westminster,United States,Is haunted by the victims of a murder that hap...,Pillar of Fire,Colorado,CO,-105.032091,39.847237,-105.037205,39.836653,['DATE'],['years ago'],,,
10989,Wheat Ridge,United States,The institution was for kids 18 years old and ...,Ridge Mental Institution,Colorado,CO,-105.063974,39.769726,-105.077206,39.766098,"['DATE', 'DATE', 'CARDINAL', 'CARDINAL']","['18 years old', '70', 'one', 'hundreds']",,,
10990,Wheat Ridge,United States,Gymnasium - their have been reports of a litt...,Wheat Ridge Middle School,Colorado,CO,-105.103613,39.764055,-105.077206,39.766098,,,,,


In [51]:
# Total entries
total_entries = len(df_new)

# Count valid (non-NaN) lat/lon rows
known_count = (~df_new["GeoTopic Latitude"].isna() & ~df_new["GeoTopic Longitude"].isna()).sum()
unknown_count = total_entries - known_count

# Calculate percentages
known_percentage = (known_count / total_entries) * 100
unknown_percentage = (unknown_count / total_entries) * 100

# Print results
print(f"Entries with valid coordinates: {known_count} ({known_percentage:.2f}%)")
print(f"Entries with NaN coordinates: {unknown_count} ({unknown_percentage:.2f}%)")


Entries with valid coordinates: 1461 (13.29%)
Entries with NaN coordinates: 9531 (86.71%)
