In [1]:
import pandas as pd

from geopy.geocoders import Nominatim
from joblib import Parallel, delayed
from tqdm import tqdm
from argparse import Namespace

In [4]:
config = {
    # "fin": "../data/csv_performance_all_models/xlmt_inference_test_set.csv",
    "fin": "/data3/mmendieta/Violence_data/csv_files_global_scale/xlmt_inference_test_set.csv",
    # "fout": "../data/csv_performance_all_models/xlmt_inference_test_set_with_country.csv"
    "fout": "/data3/mmendieta/Violence_data/csv_files_global_scale/xlmt_inference_test_set_with_country.csv"
}

args = Namespace(**config)

In [5]:
# read the file
df = pd.read_csv(args.fin, engine='python', on_bad_lines='skip', encoding='utf-8')

In [6]:
df[1:3]

Unnamed: 0,tweetid,text,lang,geo_x,geo_y,post7geo10_true,post7geo30_true,post7geo50_true,pre7geo10_true,pre7geo30_true,pre7geo50_true,post7geo10,post7geo30,post7geo50,pre7geo10,pre7geo30,pre7geo50
1,472390379434938369,"Que a Kimberly la dejen ir, por favor y gracias.",es,-66.879189,10.48801,0.0,0.0,0.0,0.0,0.0,0.0,0.293337,0.432246,0.528523,0.447481,0.553607,0.688987
2,422031463429984256,it hurts right ? :D,en,67.082199,24.9056,1.0,1.0,1.0,1.0,1.0,1.0,0.387055,0.544625,0.583162,0.392447,0.559619,0.588813


In [7]:
# Cache for storing results
cache = {}

# Function to get country from coordinates with caching
def get_country_cached(geo_x, geo_y):
    # Instantiate geolocator inside the function to avoid pickling issues
    geolocator = Nominatim(user_agent="geo_locator")
    key = (geo_y, geo_x)  # Use latitude and longitude as key
    
    if key in cache:
        return cache[key]  # Return cached result if available
    else:
        try:
            # Reverse geocode to get the country
            location = geolocator.reverse(key, language='en')
            if location and 'country' in location.raw['address']:
                country = location.raw['address']['country']
            else:
                country = "Unknown"
        except Exception as e:
            country = "Unknown"
        # Store result in cache
        cache[key] = country
        return country

In [8]:
# This code takes approximately 40-50 mins to complete (for 10,000 samples)
# Enable progress bar for parallel processing
tqdm.pandas()

# Apply the function in parallel to infer country
df['country'] = Parallel(n_jobs=-1)(
    delayed(get_country_cached)(row['geo_x'], row['geo_y']) for _, row in tqdm(df.iterrows(), total=len(df))
)

100%|█████████████████████████████| 2331326/2331326 [34:57:30<00:00, 18.52it/s]


In [9]:
# Count the occurrences of each country
country_counts = df['country'].value_counts()

# Display the counts
print("Country Counts:")
print(country_counts)

Country Counts:
Unknown        2331317
Pakistan             3
Egypt                2
Venezuela            2
Lebanon              1
Philippines          1
Name: country, dtype: int64


In [10]:
# Filter rows where the country is "Unknown"
unknown_locations = df[df['country'] == "Unknown"]

# Display the filtered DataFrame
print(f"Number of observations with 'Unknown' country: {len(unknown_locations)}")
print(unknown_locations.head())

# Optionally, save the filtered observations to a CSV file
unknown_locations.to_csv("unknown_locations.csv", index=False)
print("Observations with 'Unknown' country saved to 'unknown_locations.csv'")

Number of observations with 'Unknown' country: 2331317
              tweetid                                               text lang  \
0  388328898662268928                             talking abt my case ☺️   en   
1  472390379434938369   Que a Kimberly la dejen ir, por favor y gracias.   es   
2  422031463429984256                                it hurts right ? :D   en   
3  469163284034519040                  (وتحبون المال حبا جما) [الفجر:20]   ar   
4  413132210666242048  La inflación anualizada saltó a 57% y en alime...   es   

       geo_x      geo_y  post7geo10_true  post7geo30_true  post7geo50_true  \
0  35.494419  33.888939              0.0              0.0              0.0   
1 -66.879189  10.488010              0.0              0.0              0.0   
2  67.082199  24.905600              1.0              1.0              1.0   
3  35.833328  32.500000              0.0              1.0              1.0   
4 -66.879189  10.488010              0.0              1.0           

In [11]:
# Calculate min and max for geo_x and geo_y
geo_x_min, geo_x_max = df['geo_x'].min(), df['geo_x'].max()
geo_y_min, geo_y_max = df['geo_y'].min(), df['geo_y'].max()

# Print results
print(f"geo_x: min={geo_x_min}, max={geo_x_max}")
print(f"geo_y: min={geo_y_min}, max={geo_y_max}")

geo_x: min=-110.9936981201172, max=126.56413269042967
geo_y: min=-22.999980926513672, max=50.95496368408203


In [12]:
# Save the updated DataFrame to a CSV 
df.to_csv(args.fout, index=False)