In [1]:
import pandas as pd

# Location of the catalog_4489.tsv export, and the property ID kept alongside it
catalog_path = 'www/python/src/data/catalog_4489.tsv'
catalog_pid = "P7197"

# The export is tab-separated
df = pd.read_csv(catalog_path, sep='\t')

# Peek at the first rows to see which columns are available
print("First 5 rows of the dataset:")
print(df.head())

# One boolean mask per criterion: description mentions "country: UY",
# a "q" value is present, and mnm_user_id equals 0
mentions_uruguay = df['description'].str.contains('country: UY', na=False)
has_q_value = df['q'].notna()
user_id_is_zero = df['mnm_user_id'] == 0

# Keep only the rows that satisfy all three criteria
filtered_df = df[mentions_uruguay & has_q_value & user_id_is_zero]

# Show the selection and how many rows it contains
print("\nFiltered results (country: UY, with q value, and mnm_user_id == 0):")
print(filtered_df)
print(f"\nTotal matching entries: {len(filtered_df)}")

First 5 rows of the dataset:
   #entry_id  catalog  external_id                          external_url  \
0  113398375     4489          833   https://openweathermap.org/city/833   
1  113398376     4489         2960  https://openweathermap.org/city/2960   
2  113398377     4489         3245  https://openweathermap.org/city/3245   
3  113398378     4489         3530  https://openweathermap.org/city/3530   
4  113398379     4489         5174  https://openweathermap.org/city/5174   

            name                              description entry_type  \
0  Ḩeşār-e Sefīd  country: IR, coord: 34.330502,47.159401    Q618123   
1   ‘Ayn Ḩalāqīm  country: SY, coord: 34.940079,36.321911    Q618123   
2         Taglag   country: IR, coord: 38.450001,44.98333    Q618123   
3       Qabāghlū  country: IR, coord: 36.173302,46.168499    Q618123   
4        ‘Arīqah   country: SY, coord: 32.889809,36.48336    Q618123   

   mnm_user_id         q    matched_on        lat        lon  
0    9414042.0  Q5

In [3]:
from wdcuration import query_wikidata
def get_dict_from_list_of_wikidata_items(list_of_items,property):
    """
    Look up one property for a list of Wikidata items.

    Args:
        list_of_items: Wikidata item IDs without prefix (e.g. "Q42").
        property: Wikidata property ID without prefix (e.g. "P625").

    Returns:
        A dictionary with one key per requested item ID. The value is the
        property value reported by the query, or None when the query returned
        no row for that item. If the query returns several rows for one item,
        the last row wins. An empty input yields an empty dictionary without
        contacting the query service.
    """
    # Every requested item gets a key up front so items without a value map to None.
    final_dict = dict.fromkeys(list_of_items)

    # An empty VALUES clause ("wd:" with no ID) is not valid SPARQL, so do not
    # send a query at all when there is nothing to look up.
    if not final_dict:
        return final_dict

    query = """
    SELECT ?item ?property WHERE {
        VALUES ?item { %s }
        ?item wdt:%s ?property.
        }
    """ % (' '.join(f'wd:{item}' for item in final_dict), property)

    results = query_wikidata(query)

    for result in results:
        # Keep only the last path segment of the returned item value
        # (presumably an entity URI ending in the Q-ID).
        item = result['item'].split('/')[-1]
        final_dict[item] = result['property']
    return final_dict

# Property to retrieve for every item.
# NOTE: this module-level name shadows the builtin `property`.
property = 'P625'  # Example property (coordinates)

# Distinct Wikidata items referenced by the filtered DataFrame, in order of appearance
list_of_items = filtered_df['q'].drop_duplicates().tolist()

# Map each item to its value for the chosen property
property_dict = get_dict_from_list_of_wikidata_items(list_of_items, property)

# Show the resulting dictionary under a heading
print("\nProperty dictionary:", property_dict, sep="\n")


Property dictionary:
{'Q2733083': None, 'Q1022308': 'Point(-54.413888888 -33.252777777)', 'Q1110810': 'Point(-57.265277777 -34.338333333)', 'Q6754559': None, 'Q830390': 'Point(-54.382222222 -33.230833333)', 'Q5092478': 'Point(-83.5278 41.6526)', 'Q1727968': 'Point(-57.614484787 -34.265125956)', 'Q8773451': 'Point(-57.413 -33.2037)', 'Q833016': 'Point(-55.982777777 -31.714444444)', 'Q1514901': 'Point(-58.321666666 -33.398055555)', 'Q2096045': 'Point(-55.466666666 -34.6)', 'Q1783586': 'Point(-56.330277777 -33.725)', 'Q5780075': None, 'Q1508561': 'Point(-54.95 -32.916666666)', 'Q6120278': 'Point(-81.0083 22.9114)', 'Q1022299': 'Point(-56.516666666 -33.35)', 'Q983435': 'Point(-56.716666666 -34.333333333)', 'Q6118930': 'Point(-79.3951 -0.14672)', 'Q3892774': 'Point(-98.333333333 17.575)', 'Q5717175': 'Point(-67.6972 -30.3194)', 'Q4848757': 'Point(-24.3353 16.5238)', 'Q984390': 'Point(-56.159444444 -34.921888888)', 'Q530827': 'Point(-57.616666666 -32.533333333)', 'Q12221221': 'Point(-56.543

In [4]:
import numpy as np
from math import radians, cos, sin, asin, sqrt

# Function to calculate the Haversine distance between two points
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points on the earth.

    Note the argument order: longitude comes before latitude for each point.

    Args:
        lon1: Longitude of the first point, in decimal degrees.
        lat1: Latitude of the first point, in decimal degrees.
        lon2: Longitude of the second point, in decimal degrees.
        lat2: Latitude of the second point, in decimal degrees.

    Returns:
        The distance in kilometers, using an earth radius of 6371 km.
        NaN inputs propagate to a NaN result.
    """
    # Convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2

    # Floating-point rounding can push `a` marginally above 1 for (near-)antipodal
    # points, which would make asin() raise a math domain error. The comparison
    # is False for NaN, so NaN still propagates instead of being turned into 1.
    if a > 1.0:
        a = 1.0

    c = 2 * asin(sqrt(a))
    r = 6371  # Radius of earth in kilometers
    return c * r

# Extract Wikidata coordinates from the property dictionary.
# Result: {qid: (lat, lon)} for every item whose value could be parsed.
wikidata_coords = {}
for qid, coord_str in property_dict.items():
    # Items for which no value was found are stored as None; nothing to parse
    if coord_str is None:
        continue

    # Parse the Point format: Point(lon lat)
    try:
        # Remove 'Point(' and ')', then split by space
        lon_lat = coord_str.replace('Point(', '').replace(')', '').split()
        lon = float(lon_lat[0])
        lat = float(lon_lat[1])
    except (AttributeError, IndexError, TypeError, ValueError):
        # Only parse-related failures are reported and skipped; a bare `except`
        # would also swallow KeyboardInterrupt and SystemExit.
        print(f"Could not parse coordinates for {qid}: {coord_str}")
        continue

    # Stored as (lat, lon), i.e. the reverse of the order in the Point string
    wikidata_coords[qid] = (lat, lon)

# Work on a copy of the filtered rows and add empty result columns
results_df = filtered_df.copy()
for new_column in ('wikidata_lat', 'wikidata_lon', 'distance_km'):
    results_df[new_column] = np.nan

# Fill in the Wikidata coordinates and the distance to the catalog coordinates
# for every row whose item has parsed coordinates
for index, row in results_df.iterrows():
    coords = wikidata_coords.get(row['q'])
    if coords is None:
        # No Wikidata coordinates for this item: leave the NaN placeholders
        continue

    wd_lat, wd_lon = coords
    results_df.at[index, 'wikidata_lat'] = wd_lat
    results_df.at[index, 'wikidata_lon'] = wd_lon

    # haversine() takes longitude before latitude for each point
    results_df.at[index, 'distance_km'] = haversine(
        row['lon'], row['lat'], wd_lon, wd_lat
    )

# Columns shown in both comparison tables below
comparison_columns = ['name', 'q', 'lat', 'lon', 'wikidata_lat', 'wikidata_lon', 'distance_km']

# Order rows by ascending distance and show the 20 closest
results_df = results_df.sort_values('distance_km')
print("Results sorted by distance between OpenWeatherMap and Wikidata coordinates:")
print(results_df[comparison_columns].head(20))

# Rows whose distance exceeds the threshold are treated as large discrepancies
large_distance_threshold = 10  # km
exceeds_threshold = results_df['distance_km'] > large_distance_threshold
large_discrepancies = results_df[exceeds_threshold]
print(f"\nEntries with distance > {large_distance_threshold} km:")
print(large_discrepancies[comparison_columns].head(10))

# Descriptive statistics of the distance column
print("\nSummary statistics for distances (km):")
print(results_df['distance_km'].describe())

Results sorted by distance between OpenWeatherMap and Wikidata coordinates:
                          name         q        lat        lon  wikidata_lat  \
129855               Mariscala  Q1005301 -34.040852 -54.777321    -34.041000   
129826          Punta Carretas   Q984390 -34.922779 -56.159721    -34.921889   
129837        Paso de Carrasco  Q2055651 -34.860279 -56.052219    -34.859450   
129782          Treinta y Tres   Q830390 -33.233330 -54.383331    -33.230833   
129791           Villa Soriano  Q1514901 -33.400002 -58.316669    -33.398056   
129891      Dieciocho de Julio   Q204526 -33.683331 -53.549999    -33.683333   
129804        San José de Mayo   Q983435 -34.337502 -56.713612    -34.333333   
129886           Empalme Olmos   Q630016 -34.700001 -55.900002    -34.695833   
129898  Colonia del Sacramento    Q56064 -34.466671 -57.849998    -34.471389   
129925        Aguas Corrientes   Q397975 -34.521938 -56.393608    -34.516667   
132430       Barra de Carrasco   Q808781 -34

In [5]:
# Label every row with a match quality derived from its distance.
# Upper bounds (inclusive) of the Excellent / Good / Fair bands, in km
excellent_match = 1  # km
good_match = 5  # km
poor_match = 10  # km

distance_column = results_df['distance_km']

# (row mask, label) pairs; the masks do not overlap, NaN distances match only the last one
quality_rules = [
    (distance_column <= excellent_match, 'Excellent'),
    ((distance_column > excellent_match) & (distance_column <= good_match), 'Good'),
    ((distance_column > good_match) & (distance_column <= poor_match), 'Fair'),
    (distance_column > poor_match, 'Poor'),
    (distance_column.isna(), 'No Wikidata coordinates'),
]

# Start from a default label, then overwrite it band by band
results_df['match_quality'] = 'Unknown'
for row_mask, quality_label in quality_rules:
    results_df.loc[row_mask, 'match_quality'] = quality_label

# How many rows fall into each match-quality category
match_counts = results_df['match_quality'].value_counts()
print("Match quality distribution:")
print(match_counts)

# Up to five example rows per distance-based category
distance_categories = [
    ("\nExcellent matches (distance ≤ 1 km):", 'Excellent'),
    ("\nGood matches (1 km < distance ≤ 5 km):", 'Good'),
    ("\nFair matches (5 km < distance ≤ 10 km):", 'Fair'),
    ("\nPoor matches (distance > 10 km):", 'Poor'),
]
for heading, quality_label in distance_categories:
    in_category = results_df['match_quality'] == quality_label
    print(heading)
    print(results_df[in_category][['name', 'q', 'distance_km']].head(5))

# Rows without Wikidata coordinates have no distance, so that column is omitted
lacks_coordinates = results_df['match_quality'] == 'No Wikidata coordinates'
print("\nEntries without Wikidata coordinates:")
print(results_df[lacks_coordinates][['name', 'q']].head(5))

# Write the full table (without the index) to CSV for further analysis
output_csv = 'uruguay_openweather_wikidata_matches.csv'
results_df.to_csv(output_csv, index=False)
print(f"\nResults saved to '{output_csv}'")

Match quality distribution:
match_quality
Poor                       27
Good                       18
Excellent                  13
No Wikidata coordinates     8
Fair                        1
Name: count, dtype: int64

Excellent matches (distance ≤ 1 km):
                    name         q  distance_km
129855         Mariscala  Q1005301     0.033847
129826    Punta Carretas   Q984390     0.102137
129837  Paso de Carrasco  Q2055651     0.228290
129782    Treinta y Tres   Q830390     0.296152
129791     Villa Soriano  Q1514901     0.511944

Good matches (1 km < distance ≤ 5 km):
                    name         q  distance_km
132428      Villa García  Q7930332     1.008275
129865           Lascano  Q1013691     1.010829
129895            Cordón  Q1005742     1.087712
129845    Nueva Helvecia   Q999572     1.235388
129792  Solís de Mataojo  Q2096045     1.525171

Fair matches (5 km < distance ≤ 10 km):
              name         q  distance_km
129847  Montevideo  Q4863736     8.638956

Po

In [7]:
# Create a Quickstatements V2 file for results with <1 km distance.
# Each statement attaches the catalog's external ID to the matched Wikidata item.
# Filter for excellent matches (distance < 1 km)
excellent_matches = results_df[results_df['distance_km'] < 1]
print(f"Number of excellent matches (distance < 1 km): {len(excellent_matches)}")
catalog_pid = "P7197"

# Format for Quickstatements V2, one statement per line:
# QID|<catalog_pid>|"<external_id>"
quickstatements_lines = []

for _, row in excellent_matches.iterrows():
    qid = row['q']
    entry_id_on_catalog = row['external_id']

    # A row without an external ID would otherwise be written as "nan"
    if pd.isna(entry_id_on_catalog):
        continue

    # pandas stores an integer ID column as float when it contains gaps;
    # write 833.0 as 833 so the statement carries the real identifier
    if isinstance(entry_id_on_catalog, float) and entry_id_on_catalog.is_integer():
        entry_id_on_catalog = int(entry_id_on_catalog)

    # The external ID is quoted so it is emitted as a string value
    qs_line = f'{qid}|{catalog_pid}|"{entry_id_on_catalog}"'
    quickstatements_lines.append(qs_line)

# Write to file; explicit encoding keeps the output independent of the platform default
with open('uruguay_excellent_matches_quickstatements.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(quickstatements_lines))

print("Quickstatements V2 file created: uruguay_excellent_matches_quickstatements.txt")
print("\nSample of Quickstatements content:")
for line in quickstatements_lines[:5]:
    print(line)

Number of excellent matches (distance < 1 km): 13


KeyError: 'id'