In [1]:
from utils import *
from tqdm.notebook import tqdm
# from dowhy.causal_model import CausalModel
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from matplotlib.colors import TwoSlopeNorm

import warnings
# Suppress every warning for the rest of the notebook session.
warnings.filterwarnings("ignore")

# 1/120; the name suggests a ~1 km grid step expressed in degrees -- TODO confirm against the raster grid.
res_1km = 1 / 12 / 10

In [2]:
def get_ar_circle(radius):
    """Build a circular 0/1 mask on an integer pixel grid centred on (0, 0).

    Parameters
    ----------
    radius : int
        Nominal radius in pixels. Values above 2 must be one of the keys of
        the buffer table below (3..25). Values of 2 or less produce a
        single-pixel mask whose only value is 0.

    Returns
    -------
    (xarray.DataArray, numpy.ndarray)
        The uint8 mask with ``y``/``x`` coordinates and its raw values. For
        ``radius > 2`` the coordinates run from ``-radius`` to ``radius``.

    Raises
    ------
    KeyError
        If ``radius`` is above 2 but has no entry in the buffer table.
    """
    if radius > 2:
        # Buffer distance used for each nominal radius. Presumably tuned by
        # hand so the all_touched clip yields the intended pixel disc --
        # TODO confirm where these values come from.
        clip_by_radius = {3: 2.54, 4:3.53, 5: 4.49, 6: 5.52, 7: 6.52, 8: 7.49, 9: 8.52, 10: 9.49, 11: 10.52, 12: 11.49, 13: 12.49, 14: 13.51, 15: 14.49, 16: 15.52, 17: 16.46, 18: 17.49, 19: 18.49, 20: 19.47, 21: 20.49, 22: 21.49, 23: 22.49, 24: 23.49, 25: 24.43}
        try:
            radius_clip = clip_by_radius[radius]
        except KeyError:
            raise KeyError(
                f"get_ar_circle: unsupported radius {radius!r}; expected an integer from 3 to 25 "
                "(values <= 2 give the single-pixel zero mask)"
            ) from None

        # Circle polygon around the origin, in the same (pixel) units as the grid below.
        gdf_circle = gpd.GeoDataFrame({}, geometry=gpd.GeoDataFrame({}, geometry=gpd.points_from_xy([0], [0])).buffer(radius_clip), crs="epsg:4326")

        pixel_coords = np.arange(-radius, radius + 1)
        # dims are passed explicitly so construction does not depend on
        # xarray inferring dimension names from the dict-valued coords.
        da_circle = xr.DataArray(
            np.ones((radius * 2 + 1, radius * 2 + 1)),
            coords={"y": pixel_coords, "x": pixel_coords},
            dims=("y", "x"),
        )\
            .rio.write_crs("epsg:4326")\
            .rio.clip(gdf_circle.geometry, all_touched=True, drop=False)\
            .fillna(0).astype(np.uint8)
    else:
        # Degenerate case: a 1x1 mask holding 0, i.e. nothing is selected.
        da_circle = xr.DataArray([[0]], coords={"y": [0], "x": [0]}, dims=("y", "x")).astype(np.uint8)

    ar_circle = da_circle.values
    return da_circle, ar_circle

In [3]:
def get_ar_ring(radius):
    """Build a ring mask: the disc of ``radius`` minus the disc of ``radius - 5``.

    Both discs come from ``get_ar_circle``. The inner disc is padded with
    zeros onto the outer disc's grid before the subtraction.

    Returns
    -------
    (xarray.DataArray, numpy.ndarray)
        The ring mask on the outer disc's grid and its raw values.
    """
    outer_mask, _ = get_ar_circle(radius)
    inner_mask, _ = get_ar_circle(radius - 5)

    # Bring the smaller inner mask onto the outer grid; cells it does not cover become 0.
    inner_on_outer_grid = inner_mask.reindex_like(outer_mask, fill_value=0)

    ring_mask = outer_mask - inner_on_outer_grid
    return ring_mask, ring_mask.values

In [None]:
# Start from the largest disc (radius 25). ar_circle0 keeps the values of this
# first disc only; it is not updated by the loop below.
da_circle0, ar_circle0 = get_ar_circle(25)

# Add the radius 20, 15, 10 and 5 discs (largest first), each padded with zeros
# onto the radius-25 grid. Since every disc is a 0/1 mask, each cell of
# da_circle0 ends up holding the number of discs that cover it.
for radius_ in np.arange(5, 21, 5)[::-1]:
    _da_circle, _ar_circle = get_ar_circle(radius_)
    da_circle0 = da_circle0 + _da_circle.reindex_like(da_circle0).fillna(0)

In [5]:
from sklearn.neighbors import BallTree

def get_nearest(src_points, candidates, k_neighbors=1):
    """Find, for every source point, its single nearest candidate (haversine metric).

    Parameters
    ----------
    src_points, candidates : array-like of shape (n, 2)
        Coordinates in radians. scikit-learn's haversine metric reads the
        first column as latitude and the second as longitude.
    k_neighbors : int
        Number of neighbours requested from the tree; only the nearest one
        is returned.

    Returns
    -------
    (indices, distances)
        Row positions into ``candidates`` and the matching distances as
        returned by the tree, one entry per source point.
    """
    ball_tree = BallTree(candidates, leaf_size=15, metric='haversine')
    dist_matrix, idx_matrix = ball_tree.query(src_points, k=k_neighbors)
    # query() gives (n_src, k) arrays; column 0 is the first (nearest) neighbour.
    return (idx_matrix[:, 0], dist_matrix[:, 0])

def nearest_neighbor(left_gdf, right_gdf, return_dist=False):
    """For every point in ``left_gdf`` return the closest point of ``right_gdf``.

    Parameters
    ----------
    left_gdf, right_gdf : geopandas.GeoDataFrame
        Point layers. Coordinates are read as degrees, with ``geom.x`` as
        longitude and ``geom.y`` as latitude.
    return_dist : bool
        When True, add a ``distance`` column in metres (great-circle distance
        using an Earth radius of 6,371,000 m).

    Returns
    -------
    geopandas.GeoDataFrame
        One row of ``right_gdf`` per row of ``left_gdf``, in the same order as
        ``left_gdf`` and with a fresh 0..n-1 index.
    """
    right = right_gdf.copy().reset_index(drop=True)

    def _lat_lon_radians(geoms):
        # scikit-learn's haversine metric expects (latitude, longitude) in
        # radians, so y goes first. Passing (x, y) yields wrong distances and
        # can select a wrong neighbour.
        return np.radians(np.array([(geom.y, geom.x) for geom in geoms]))

    left_radians = _lat_lon_radians(left_gdf.geometry)
    right_radians = _lat_lon_radians(right.geometry)

    closest, dist = get_nearest(src_points=left_radians, candidates=right_radians)

    # `closest` holds row positions into `right`, hence positional indexing.
    closest_points = right.iloc[closest].reset_index(drop=True)
    if return_dist:
        earth_radius = 6371000  # meters
        # The tree returns angular distance in radians; scale to metres.
        closest_points['distance'] = dist * earth_radius
    return closest_points

In [None]:
def cal_zonal(
    gdf_conflict_1year, gdf_conflict_lastyear, 
    da_combine_no_coords_luc_lastyear, da_combine_no_coords_luc_currentyear, 
    da_combine_no_coords_npp_lastyear, da_combine_no_coords_npp_currentyear, 
    da_combine_no_coords_road_dis, da_combine_no_coords_boundary_dis,
    da_combine_no_coords_pop, 
):
    """Attach per-sample zonal statistics to a table of sample points.

    Every raster argument is reduced over its ``x`` and ``y`` dimensions and
    the remaining values are written positionally into new columns. They are
    therefore assumed to be ordered like the rows of ``gdf_conflict_1year``
    (TODO confirm against the code that writes the sample rasters).

    Parameters
    ----------
    gdf_conflict_1year : geopandas.GeoDataFrame
        Sample points that receive the statistics (copied, not modified).
    gdf_conflict_lastyear : geopandas.GeoDataFrame
        Points used for the nearest-distance column ``dis_c_ly``.
    da_combine_no_coords_luc_lastyear, da_combine_no_coords_luc_currentyear : xarray.DataArray
        Land-cover class rasters; NaN cells are excluded from the area total.
    da_combine_no_coords_npp_lastyear, da_combine_no_coords_npp_currentyear : xarray.DataArray
        NPP rasters.
    da_combine_no_coords_road_dis, da_combine_no_coords_boundary_dis : xarray.DataArray
        Rasters averaged into ``dis2road`` and ``dis2bound``.
    da_combine_no_coords_pop : xarray.DataArray
        Raster summed into ``pop``.

    Returns
    -------
    geopandas.GeoDataFrame
        Copy of ``gdf_conflict_1year`` with the added columns, in this order:
        ``pop``, ``dis_c_ly``, the ``*_ly`` / ``*_cy`` land-cover shares, the
        ``*_change`` share differences, ``dis2road``, ``dis2bound``,
        ``npp_cy`` and the per-class NPP means with their changes.
    """
    spatial_dims = ["x", "y"]
    forest_classes = [1, 2, 3, 4, 5]
    # Land-cover codes per group. The numbering looks like the MODIS IGBP
    # legend -- TODO confirm.
    lc_groups = {
        "crop": [12, 14],
        "built": [13],
        "forest": forest_classes,
        "shrubland": [6, 7],
        "grass": [10],
    }

    luc_ly = da_combine_no_coords_luc_lastyear
    luc_cy = da_combine_no_coords_luc_currentyear
    npp_ly = da_combine_no_coords_npp_lastyear
    npp_cy = da_combine_no_coords_npp_currentyear

    def _class_fractions(da_lc):
        # Share of each land-cover group among the non-NaN cells of every sample window.
        n_valid = (~np.isnan(da_lc)).sum(dim=spatial_dims)
        return {
            group: (da_lc.isin(classes).sum(dim=spatial_dims) / n_valid).values
            for group, classes in lc_groups.items()
        }

    def _class_mean_npp(da_lc, da_npp, classes):
        # Mean NPP over cells whose land cover is in `classes`; all other cells are set to NaN first.
        return xr.where(da_lc.isin(classes), da_npp, np.nan).mean(dim=spatial_dims).values

    frac_ly = _class_fractions(luc_ly)
    frac_cy = _class_fractions(luc_cy)

    # nearest_neighbor returns one row per input row, in input order, with a
    # fresh 0..n-1 index. Take the raw values so the column is written by
    # position instead of being index-aligned, which would produce NaN for
    # frames that do not carry a default index.
    dis_to_conflict_lastyear = nearest_neighbor(gdf_conflict_1year, gdf_conflict_lastyear, return_dist=True)
    dis_c_ly = dis_to_conflict_lastyear["distance"].to_numpy()

    gdf_conflict_1year_zonal = (
        gdf_conflict_1year.copy()
        .assign(pop=da_combine_no_coords_pop.sum(dim=spatial_dims).values)
        .assign(dis_c_ly=dis_c_ly)
        .assign(
            crop_ly=frac_ly["crop"],
            crop_cy=frac_cy["crop"],
            forest_ly=frac_ly["forest"],
            forest_cy=frac_cy["forest"],
            grass_ly=frac_ly["grass"],
            grass_cy=frac_cy["grass"],
            shrubland_ly=frac_ly["shrubland"],
            shrubland_cy=frac_cy["shrubland"],
            built_ly=frac_ly["built"],
            built_cy=frac_cy["built"],
        )
        .assign(
            crop_change=lambda _df: _df["crop_cy"] - _df["crop_ly"],
            forest_change=lambda _df: _df["forest_cy"] - _df["forest_ly"],
            grass_change=lambda _df: _df["grass_cy"] - _df["grass_ly"],
            shrubland_change=lambda _df: _df["shrubland_cy"] - _df["shrubland_ly"],
        )
        .assign(
            dis2road=da_combine_no_coords_road_dis.mean(dim=spatial_dims).values,
            dis2bound=da_combine_no_coords_boundary_dis.mean(dim=spatial_dims).values,
        )
        .assign(npp_cy=npp_cy.mean(dim=spatial_dims).values)
        # NOTE(review): crop NPP uses class 12 only, while the crop *share*
        # above uses classes 12 and 14 -- confirm this difference is intended.
        .assign(crop_npp_ly=_class_mean_npp(luc_ly, npp_ly, [12]))
        .assign(crop_npp_cy=_class_mean_npp(luc_cy, npp_cy, [12]))
        .assign(crop_npp_change=lambda _df: _df["crop_npp_cy"] - _df["crop_npp_ly"])
        .assign(forest_npp_ly=_class_mean_npp(luc_ly, npp_ly, forest_classes))
        .assign(forest_npp_cy=_class_mean_npp(luc_cy, npp_cy, forest_classes))
        .assign(forest_npp_change=lambda _df: _df["forest_npp_cy"] - _df["forest_npp_ly"])
        .assign(grass_npp_ly=_class_mean_npp(luc_ly, npp_ly, [10]))
        .assign(grass_npp_cy=_class_mean_npp(luc_cy, npp_cy, [10]))
        .assign(grass_npp_change=lambda _df: _df["grass_npp_cy"] - _df["grass_npp_ly"])
    )

    return gdf_conflict_1year_zonal

In [None]:
def clip_sample(da_, da_circle):
    """Cut ``da_`` down to the mask's grid and blank out everything outside the mask.

    ``da_`` is selected at the ``x``/``y`` coordinates of ``da_circle``; cells
    where the mask equals 1 keep their value, all others become NaN.
    """
    window = da_.sel(x=da_circle.x, y=da_circle.y)
    inside_mask = da_circle == 1
    return xr.where(inside_mask, window, np.nan)

def sel_sample_1country(gdf_conflict_1year_zonal, gdf_non_conflict_1year_zonal, sel_country):
    """Collect the conflict and non-conflict samples that belong to one country.

    Conflict samples are clipped to the rows of the global ``gdf_world`` whose
    ``name_long`` equals ``sel_country``. Non-conflict samples are clipped to
    the same geometry buffered by ``res_1km * 100``.

    Returns
    -------
    DataFrame
        Conflict rows followed by non-conflict rows, with a fresh 0..n-1
        index and a ``country`` column holding ``sel_country``.
    """
    # Look the country up once and reuse it for both clips.
    gdf_country = gdf_world.query('name_long == @sel_country')

    gdf_conflict_clip = gdf_conflict_1year_zonal.clip(gdf_country)
    gdf_non_conflict_clip = gdf_non_conflict_1year_zonal.clip(gdf_country.buffer(res_1km*100))

    # Store the actual country name; the previous code wrote the literal
    # string 'sel_country' into every row.
    df_sample_sel = pd.concat([gdf_conflict_clip, gdf_non_conflict_clip]).reset_index(drop=True).assign(country=sel_country)
    return df_sample_sel

def match_conflict_to_non_con(df_sample):
    """Pair every treated row (``c == 1``) with its nearest control (``c == 0``) by propensity score.

    A logistic regression on the standardized covariates ``pop``,
    ``dis_c_ly``, ``crop_ly``, ``built_ly``, ``dis2road`` and ``dis2bound``
    gives each row a propensity score ``ps``. Each treated row is then matched
    to the control with the closest score. Matching is with replacement, so
    one control may serve several treated rows.

    Parameters
    ----------
    df_sample : DataFrame
        Must contain the covariate columns and the 0/1 indicator ``c``.

    Returns
    -------
    DataFrame
        One row per treated sample. Treated columns keep their names and the
        matched control's columns carry the suffix ``_n``.
    """
    covariates = ['pop', 'dis_c_ly', 'crop_ly', 'built_ly', 'dis2road', 'dis2bound']

    X_scaled = StandardScaler().fit_transform(df_sample[covariates])
    logistic_model = LogisticRegression(solver='liblinear', random_state=0)
    logistic_model.fit(X_scaled, df_sample['c'])
    propensity_scores = logistic_model.predict_proba(X_scaled)[:, 1]

    df_sample = df_sample.assign(ps=propensity_scores)

    # Boolean masks are positional, so they line up with `propensity_scores`
    # whatever index `df_sample` carries.
    is_treated = (df_sample['c'] == 1).to_numpy()
    is_control = (df_sample['c'] == 0).to_numpy()
    df_treated = df_sample[is_treated]
    df_control = df_sample[is_control]

    nn = NearestNeighbors(n_neighbors=1, metric='euclidean')
    nn.fit(propensity_scores[is_control].reshape(-1, 1))

    # One batched query for all treated scores; column 0 is the position of
    # the nearest control within `df_control`.
    _, nearest_pos = nn.kneighbors(propensity_scores[is_treated].reshape(-1, 1))
    matched_df = df_control.iloc[nearest_pos[:, 0]]

    df_matched = df_treated.reset_index(drop=True).join(matched_df.reset_index(drop=True), rsuffix="_n")
    return df_matched

In [8]:
def match_df(df_zonal_c, df_zonal_non_c, df_index_matched):
    """Assemble the matched-pair table for a given set of statistics.

    Rows of ``df_zonal_c`` are looked up by ``df_index_matched["idx"]`` and
    rows of ``df_zonal_non_c`` by ``df_index_matched["idx_n"]``. The two
    selections are placed side by side (control columns get the suffix
    ``_n``) and ``df_index_matched`` is appended column-wise.
    """
    conflict_rows = (
        df_zonal_c
        .set_index("idx")
        .loc[df_index_matched["idx"]]
        .reset_index(drop=True)
    )
    control_rows = (
        df_zonal_non_c
        .set_index("idx")
        .loc[df_index_matched["idx_n"]]
        .reset_index(drop=True)
    )
    paired = conflict_rows.join(control_rows, rsuffix="_n")
    return pd.concat([paired, df_index_matched], axis=1)

In [9]:
def cal_diff_from_matched(df_matched):
    """Compute treated-minus-control differences from a matched-pair table.

    ``x`` and ``y`` are copied through unchanged. For each of the remaining
    output columns the value is ``df_matched[col] - df_matched[col + "_n"]``.
    """
    passthrough_cols = ("x", "y")
    paired_cols = (
        'pop',
        'crop_change', 'forest_change', 'grass_change',
        'crop_npp_change', 'forest_npp_change', 'grass_npp_change',
    )

    columns = {name: df_matched[name] for name in passthrough_cols}
    columns.update(
        {name: df_matched[name] - df_matched[f"{name}_n"] for name in paired_cols}
    )
    return pd.DataFrame(columns)


In [10]:
class Data1year:
    """Samples, zonal statistics and propensity-score matches for one year.

    Constructing an instance reads the year's point layers and sample rasters
    below ``path_data``, computes circle statistics, selects countries, runs
    the matching and builds the matched circle tables. ``cal_ring`` can then
    be called repeatedly to produce ring statistics for the same matches.

    Attributes set by ``__init__``
    ------------------------------
    year, psm_radius
        Constructor arguments.
    gdf_conflict_1year, gdf_conflict_lastyear, gdf_non_conflict_1year
        Point layers read from ``PSM/sample_point``.
    da_conflict_<name>, da_non_conflict_<name>
        One raster per entry of ``nc_files``, read from ``PSM25km``.
    zonal_conflict_circle_15km, zonal_non_conflict_circle_15km
        Circle statistics. The attribute names always say ``15km``, whatever
        ``psm_radius`` is.
    sel_countries
        Countries used for matching (see ``cal_90_countries``).
    match_idx
        Matched ``idx`` / ``idx_n`` pairs with their ``country``.
    df_matched_circle, df_matched_circle_diff
        Matched circle statistics and their treated-minus-control differences.
    """

    def __init__(self, year, psm_radius=15):
        self.year = year
        self.psm_radius = psm_radius
        self.gdf_conflict_1year = gpd.read_file(path_data / f"PSM/sample_point/conflict_{self.year}.shp")
        self.gdf_conflict_lastyear = gpd.read_file(path_data / f"PSM/sample_point/conflict_{self.year-1}.shp")

        self.gdf_non_conflict_1year = gpd.read_file(path_data / f"PSM/sample_point/non_conflict_{self.year}.shp")

        self.nc_files = ["luc_lastyear", "luc_currentyear", "road_dis", "boundary_dis", "npp_lastyear", "npp_currentyear", "pop",]

        for name_ in self.nc_files:
            setattr(self, f"da_conflict_{name_}", xr.open_dataarray(path_data / f"PSM25km/conflict_sample/{self.year}_{name_}.nc"))
            setattr(self, f"da_non_conflict_{name_}", xr.open_dataarray(path_data / f"PSM25km/non_conflict_sample/{self.year}_{name_}.nc"))

        # Order matters: matching needs the circle statistics and the country list.
        self.cal_circle(self.psm_radius)
        self.cal_90_countries()
        self.match_logistic()

        self.df_matched_circle = match_df(self.zonal_conflict_circle_15km, self.zonal_non_conflict_circle_15km, self.match_idx)\
            .assign(year=self.year)
        self.df_matched_circle_diff = cal_diff_from_matched(self.df_matched_circle)\
            .assign(year=self.year)

    def process_nc_files(self, prefix, da_mask):
        """Mask every raster ``<prefix>_<name>`` with ``da_mask`` and store it as ``da_combine_no_coords_<name>``.

        The ``da_combine_no_coords_*`` attributes are overwritten on every
        call, so they always describe the most recent prefix/mask pair.
        """
        for name_ in self.nc_files:
            da_ = getattr(self, f"{prefix}_{name_}")
            setattr(self, f"da_combine_no_coords_{name_}", clip_sample(da_, da_mask))

    def _zonal_stats(self, prefix, gdf_points, da_mask):
        """Mask the rasters of ``prefix`` with ``da_mask`` and run ``cal_zonal`` for ``gdf_points``."""
        self.process_nc_files(prefix, da_mask)
        return cal_zonal(
            gdf_points, self.gdf_conflict_lastyear,
            self.da_combine_no_coords_luc_lastyear, self.da_combine_no_coords_luc_currentyear,
            self.da_combine_no_coords_npp_lastyear, self.da_combine_no_coords_npp_currentyear,
            self.da_combine_no_coords_road_dis, self.da_combine_no_coords_boundary_dis,
            self.da_combine_no_coords_pop,
        )

    def cal_circle(self, psm_radius):
        """Compute zonal statistics inside a disc of ``psm_radius`` for both sample sets.

        Results are stored as ``zonal_conflict_circle_15km`` (``c = 1``) and
        ``zonal_non_conflict_circle_15km`` (``c = 0``).
        """
        self.da_circle = get_ar_circle(psm_radius)[0]

        self.zonal_conflict_circle_15km = self._zonal_stats(
            "da_conflict", self.gdf_conflict_1year, self.da_circle
        ).assign(c=1)

        self.zonal_non_conflict_circle_15km = self._zonal_stats(
            "da_non_conflict", self.gdf_non_conflict_1year, self.da_circle
        ).assign(c=0)

    def cal_ring(self, ring_radius):
        """Compute ring statistics for ``ring_radius`` and pair them using the existing matches.

        Stores ``zonal_conflict_ring_<r>km``, ``zonal_non_conflict_ring_<r>km``,
        ``df_matched_ring_<r>km`` and ``df_diff_ring_<r>km``. The two most
        recent results are also stored as ``df_matched_ring`` and
        ``df_diff_ring``.
        """
        self.da_ring = get_ar_ring(ring_radius)[0]

        _zonal_conflict_ring = self._zonal_stats("da_conflict", self.gdf_conflict_1year, self.da_ring)\
            .assign(c=1)\
            .assign(ring=ring_radius)
        setattr(self, f"zonal_conflict_ring_{ring_radius}km", _zonal_conflict_ring)

        _zonal_non_conflict_ring = self._zonal_stats("da_non_conflict", self.gdf_non_conflict_1year, self.da_ring)\
            .assign(c=0)\
            .assign(ring=ring_radius)
        setattr(self, f"zonal_non_conflict_ring_{ring_radius}km", _zonal_non_conflict_ring)

        # Reuse the pairs found on the circle statistics; no re-matching per ring.
        _df_matched_ring = match_df(_zonal_conflict_ring, _zonal_non_conflict_ring, self.match_idx)\
            .assign(year=self.year)
        setattr(self, f"df_matched_ring_{ring_radius}km", _df_matched_ring)
        self.df_matched_ring = _df_matched_ring

        _df_diff_ring = cal_diff_from_matched(_df_matched_ring)\
            .assign(year=self.year)
        setattr(self, f"df_diff_ring_{ring_radius}km", _df_diff_ring)
        self.df_diff_ring = _df_diff_ring

    def cal_90_countries(self):
        """Select the countries that hold the bulk of the conflict samples.

        Countries are ranked by number of conflict samples (spatial join with
        ``gdf_world``). A country is kept while the cumulative share,
        including that country, stays strictly below 0.9.
        """
        # NOTE(review): if the top-ranked country alone reaches 0.9 the
        # selection is empty and match_logistic will fail on pd.concat.
        df_conflict_count = gpd.sjoin(self.zonal_conflict_circle_15km, gdf_world[['name_long', "geometry"]]).groupby("name_long")["idx"].count().sort_values(ascending=False)
        count_all = df_conflict_count.sum()
        self.sel_countries = df_conflict_count[(df_conflict_count.cumsum() / count_all) < 0.9].index

    def match_logistic(self):
        """Run the propensity-score matching country by country and store the pairs in ``match_idx``."""
        _df_matched_lst = []
        for sel_country in self.sel_countries:
            df_sample = sel_sample_1country(self.zonal_conflict_circle_15km, self.zonal_non_conflict_circle_15km, sel_country)
            # Set the country explicitly on the matched rows; only the three
            # columns below are kept.
            df_matched = match_conflict_to_non_con(df_sample).assign(country=sel_country)
            _df_matched_lst.append(df_matched[["idx", "idx_n", "country"]])
        self.match_idx = pd.concat(_df_matched_lst).reset_index(drop=True)

In [None]:
# Make sure the output folder for the per-year CSV files exists.
(path_data / f"ring").mkdir(exist_ok=True, parents=True)
for year in range(2002, 2024):
    # Constructing the object reads the year's inputs, computes the circle
    # statistics and runs the matching.
    data1 = Data1year(year)
    data1.df_matched_circle.to_csv(path_data / f"ring/df_matched_circle_{data1.year}.csv")
    data1.df_matched_circle_diff.to_csv(path_data / f"ring/df_diff_circle_{data1.year}.csv")

    # Rings with outer radius 5, 10, 15, 20 and 25. Each cal_ring call
    # overwrites data1.df_matched_ring / data1.df_diff_ring, so they are
    # written out before the next radius is computed.
    for ring_radius in np.arange(5, 26, 5):
        data1.cal_ring(ring_radius)
        data1.df_matched_ring.to_csv(path_data / f"ring/df_matched_ring_{ring_radius}km_{data1.year}.csv")
        data1.df_diff_ring.to_csv(path_data / f"ring/df_diff_ring_{ring_radius}km_{data1.year}.csv")