In [None]:
import pandas


In [None]:
from shapely.geometry import Point, Polygon
# Outline used to select Greenland rows, given as (latitude, longitude) vertices.
# The first vertex is repeated at the end so the ring is explicitly closed.
green_land_polygon = [
    (78.823, -72.484),
    (81.071, -64.823),
    (82.402, -59.148),
    (85.022, -56.529),
    (84.738, -6.328),
    (67.276, -24.837),
    (55.970, -45.616),
    (70.463, -61.593),
    (77.273, -74.602),
    (78.823, -72.484),
]
def get_filtered_df(df, lat_col, lon_col, polygon):
    """Return the rows of ``df`` whose coordinates lie inside ``polygon``.

    Args:
        df: DataFrame with one point per row.
        lat_col: Name of the column holding latitudes.
        lon_col: Name of the column holding longitudes.
        polygon: Sequence of ``(lat, lon)`` vertex pairs describing the outline.

    Returns:
        A new, independent DataFrame containing only the rows for which
        ``Polygon.contains`` is true. Rows with a missing latitude or
        longitude are treated as outside. Returning a copy lets callers add
        or overwrite columns on the result without pandas warning about
        writing to a slice of ``df``.
    """
    poly = Polygon([(lon, lat) for lat, lon in polygon])  # shapely uses (x=lon, y=lat)

    def _inside(lat, lon):
        # Never hand NaN coordinates to shapely; such rows cannot be inside.
        if pandas.isna(lat) or pandas.isna(lon):
            return False
        return poly.contains(Point(lon, lat))

    # Walk the two coordinate columns directly instead of DataFrame.apply(axis=1),
    # which builds a full Series for every row just to read two values.
    mask = pandas.Series(
        [_inside(lat, lon) for lat, lon in zip(df[lat_col], df[lon_col])],
        index=df.index,
        dtype=bool,
    )
    return df[mask].copy()

def get_bounded_df(df, lat_col, lon_col, lat_min, lat_max, lon_min, lon_max):
    """Keep the rows of ``df`` that fall inside a latitude/longitude box.

    Both the latitude range ``[lat_min, lat_max]`` and the longitude range
    ``[lon_min, lon_max]`` are inclusive at both ends.
    """
    lat_in_range = df[lat_col].between(lat_min, lat_max)
    lon_in_range = df[lon_col].between(lon_min, lon_max)
    return df[lat_in_range & lon_in_range]

def get_osm_processed_df(df):
    """Replace each raw ``osm_name`` value with the name extracted by ``get_name``.

    The ``osm_name`` column of ``df`` is overwritten in place and the same
    DataFrame object is returned.
    """
    # Only one column is involved, so map over that column rather than using a
    # row-wise DataFrame.apply, which materialises a Series for every row.
    # get_name expects something indexable by column name, hence the small dict.
    df['osm_name'] = df['osm_name'].map(
        lambda raw: get_name({'osm_name': raw}, 'osm_name')
    )
    return df

# Tag keys tried in priority order; the first key present in the tag text wins.
_NAME_TAG_KEYS = ('name', 'name:en', 'name:fr', 'name:da', 'name:de', 'name:')


def _read_quoted_value(text, start):
    """Return the quoted value starting at ``text[start]``, resolving backslash escapes."""
    pieces = []
    i = start
    end = len(text)
    while i < end:
        ch = text[i]
        if ch == '"':
            # First unescaped quote terminates the value.
            break
        if ch == '\\' and i + 1 < end:
            # An escaped character (e.g. \" or \\) is taken literally.
            pieces.append(text[i + 1])
            i += 2
            continue
        pieces.append(ch)
        i += 1
    return ''.join(pieces)


def get_name(row, col_name):
    """Extract a display name from the ``"key"=>"value"`` tag text in ``row[col_name]``.

    The text looks like PostgreSQL hstore output (assumption - confirm against
    the export that produced the CSV), where quotes inside a value are written
    as ``\\"``. Such escapes are resolved, so a value like
    ``"name"=>"Cafe \\"Nuuk\\""`` yields ``Cafe "Nuuk"`` instead of being cut
    off at the first embedded quote.

    Args:
        row: Mapping or Series indexable by ``col_name``.
        col_name: Key of the field that holds the tag text.

    Returns:
        - the missing value itself when the field is NaN/None;
        - the value of the first key from ``_NAME_TAG_KEYS`` found in the text;
        - the unchanged text (after printing a notice) when no listed key is
          present but the text still contains ``name:``;
        - ``None`` when the text carries no name information at all.
    """
    raw = row[col_name]
    if pandas.isna(raw):
        return raw
    for key in _NAME_TAG_KEYS:
        marker = f'"{key}"=>"'
        if marker in raw:
            return _read_quoted_value(raw, raw.index(marker) + len(marker))
    if 'name:' in raw:
        # Some other name:<suffix> tag is present; surface it for inspection
        # and leave the text untouched.
        print("Unexpected name format:", raw)
        return raw
    return None

def get_fsq_processed_df(df):
    """Blank out ``fsq_name`` values that are missing or at most 10 characters long.

    The column is overwritten in place (discarded values become ``None``) and
    the same DataFrame object is returned.
    """
    def _keep_long_name(value):
        if pandas.isna(value):
            return None
        if len(value) > 10:
            return value
        return None

    df['fsq_name'] = df['fsq_name'].apply(_keep_long_name)
    return df

In [None]:
# Load the OSM CSV, keep the rows inside the Greenland polygon, then turn the
# raw osm_name tag text into a plain name.
osm_data = pandas.read_csv("../../data/osm_green_land.csv")
osm_filtered = get_osm_processed_df(
    get_filtered_df(
        df=osm_data,
        lat_col='osm_latitude',
        lon_col='osm_longitude',
        polygon=green_land_polygon,
    )
)


In [None]:
# Load the FSQ CSV and keep only the rows inside the Greenland polygon.
fsq_data = pandas.read_csv("../../data/fsq_green_land.csv")
fsq_filtered = get_filtered_df(
    df=fsq_data,
    lat_col='fsq_latitude',
    lon_col='fsq_longitude',
    polygon=green_land_polygon,
)
# The short-name clean-up step is currently switched off:
# fsq_filtered = get_fsq_processed_df(fsq_filtered)

In [None]:
# Load the FSQ/OSM pairs, keep those whose name similarity score exceeds 0.5,
# then restrict them to the Greenland polygon using the FSQ coordinates.
fsq_osm_data = (
    pandas.read_csv("../../data/fsq_osm_green_land.csv", low_memory=False)
    .loc[lambda frame: frame['fsq_osm_name_similarity_score'] > 0.5]
)
fsq_osm_filtered = get_filtered_df(
    df=fsq_osm_data,
    lat_col='fsq_latitude',
    lon_col='fsq_longitude',
    polygon=green_land_polygon,
)

In [None]:
# Load the population CSV; its 'Y' column is passed as latitude and 'X' as longitude.
population_data = pandas.read_csv("../../data/population.csv")
population_data_filtered = get_filtered_df(
    df=population_data,
    lat_col='Y',
    lon_col='X',
    polygon=green_land_polygon,
)

In [None]:

# Characters with a special meaning in LaTeX text and their safe replacements.
_LATEX_REPLACEMENTS = {
    '\\': '\\textbackslash{}',
    '&': '\\&',
    '%': '\\%',
    '$': '\\$',
    '#': '\\#',
    '_': '\\_',
    '{': '\\{',
    '}': '\\}',
    '~': '\\textasciitilde{}',
    '^': '\\textasciicircum{}',
}


def _latex_escape(value):
    """Return ``str(value)`` with every LaTeX special character escaped."""
    # A single pass over the characters avoids re-escaping the backslashes
    # that the replacements themselves introduce.
    return ''.join(_LATEX_REPLACEMENTS.get(ch, ch) for ch in str(value))


def show_filtered_data(df,cols,limit=5,label=""):
    """Print a LaTeX ``table`` with up to ``limit`` sample rows of ``df``.

    Args:
        df: Source DataFrame.
        cols: Column names to show, in order. An empty "Note" column is
            appended to the table after them.
        limit: Maximum number of data rows to print; ``0`` prints none.
        label: Suffix for the table label. Underscores are replaced by spaces
            when it is used in the caption.

    Rows are skipped when any of ``cols`` is missing, or when any string value
    contains a non-ASCII character. All printed cell values and headers are
    LaTeX-escaped so characters such as ``%``, ``#`` or ``$`` in the data
    cannot break the generated table.
    """
    df = df.dropna(subset=cols)
    column_spec = ' | '.join(['c'] * (len(cols) + 1))
    print("\\begin{table}[h]")
    print(f"\\caption{{Greenland Sample {label.replace('_', ' ')}}}")
    print(f"\\label{{tab:greenland_sample_{label}}}")
    print(f"\\begin{{tabular}}{{| {column_spec} |}}")
    print("\\hline")
    print(' & '.join(_latex_escape(col) for col in cols) + ' & Note\\\\ \\hline')

    shown = 0
    # Plain tuples in ``cols`` order: no attribute lookup, so column names that
    # are not valid Python identifiers work as well.
    for values in df[cols].itertuples(index=False, name=None):
        # Check the budget before printing so that limit=0 emits no rows.
        if shown >= limit:
            break
        # ASCII-only filter (a rough stand-in for "is in English").
        has_non_ascii = any(
            isinstance(value, str) and not all(ord(c) < 128 for c in value)
            for value in values
        )
        if has_non_ascii:
            continue
        # Trailing ' & ' leaves the Note cell empty.
        print(' & '.join(_latex_escape(value) for value in values) + ' &  \\\\ \\hline')
        shown += 1

    print("\\end{tabular}")
    print("\\end{table}")
# Print one LaTeX sample table per dataset, each capped at the same row count.
l = 50
show_filtered_data(
    fsq_filtered,
    cols=['fsq_name', 'fsq_category_labels'],
    limit=l,
    label="fsq",
)
show_filtered_data(
    osm_filtered,
    cols=['osm_name', 'osm_class', 'osm_type'],
    limit=l,
    label="osm",
)
show_filtered_data(
    fsq_osm_filtered,
    cols=['fsq_name', 'osm_name', 'osm_type'],
    limit=l,
    label="fsq_osm",
)



In [None]:
# Preview the first 100 filtered FSQ rows: name, category labels and coordinates.
cols = [
    'fsq_name',
    'fsq_category_labels',
    'fsq_latitude',
    'fsq_longitude',
]
fsq_filtered.loc[:, cols].head(100)


In [None]:
# Preview the first 100 filtered OSM rows: name, class, type and coordinates.
cols = [
    'osm_name',
    'osm_class',
    'osm_type',
    'osm_latitude',
    'osm_longitude',
]
# To inspect every column instead:
# cols = osm_filtered.columns.tolist()
osm_filtered.loc[:, cols].head(100)

In [None]:
# Preview the first 100 matched FSQ/OSM rows with every column.
# A narrower column list used to be assigned here but was overwritten on the
# very next line, so it never took effect; it is kept for reference only:
# cols = ['fsq_name',  'osm_name','osm_type', 'fsq_latitude','fsq_longitude','osm_latitude','osm_longitude', 'fsq_category_labels']
cols = fsq_osm_filtered.columns.tolist()
data = fsq_osm_filtered[cols].head(100)
data
# print(','.join(cols))
# for row in data.itertuples():
#     print(f"{row.fsq_name}\t & {row.osm_name}\t & {row.osm_type}\t & \\\\ \hline\n{row.fsq_latitude},{row.fsq_longitude}\n{row.osm_latitude},{row.osm_longitude}\n{row.fsq_category_labels}\n")

In [None]:
# Write each filtered dataset next to its source file, without the index column.
osm_filtered.to_csv(
    path_or_buf="../../data/osm_green_land_filtered.csv",
    index=False,
)
fsq_filtered.to_csv(
    path_or_buf="../../data/fsq_green_land_filtered.csv",
    index=False,
)
fsq_osm_filtered.to_csv(
    path_or_buf="../../data/fsq_osm_green_land_filtered.csv",
    index=False,
)
population_data_filtered.to_csv(
    path_or_buf="../../data/population_green_land_filtered.csv",
    index=False,
)