In [1]:
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor
import datetime as dt


# Record the wall-clock start time; the last cell subtracts it from the end
# time to report how long the notebook took to run.
start_time = time.time()


In [2]:
# Load the candidate table (columns include PX_ID, CTR_CD, CTR_ID, CTR_TY,
# PRIMARY_ZIP, latitude and longitude, per the preview below).
# NOTE(review): the 'Unnamed: 0' column in the preview looks like a row index
# written out by an earlier to_csv call — confirm, and consider index_col=0.
cand_dataset = pd.read_csv("candidate_center_withlatitudeandlongitude.csv")

In [3]:
# Preview the freshly loaded candidate table (pandas truncates the display).
cand_dataset

Unnamed: 0,PX_ID,CTR_CD,CTR_ID,CTR_TY,PRIMARY_ZIP,latitude,longitude
0,1600263,NYNS,447,TX1,11030,40.7934,-73.6888
1,1652951,COUC,91,TX1,80045,39.7467,-104.8384
2,1634224,SCMU,595,TX1,29425,32.7862,-79.9471
3,1641765,OKMD,493,TX1,73104,35.4794,-97.5017
4,1617078,OKBC,487,TX1,73112,35.5184,-97.5746
...,...,...,...,...,...,...,...
58281,1655407,TXTX,675,TX1,75246,32.7948,-96.7697
58282,1626066,GAEH,139,TX1,30329,33.8236,-84.3214
58283,1643766,FLSL,128,TX1,32224,30.3031,-81.4404
58284,1585067,TXTC,671,TX1,77030,29.7041,-95.4010


In [4]:
# Give the coordinate columns candidate-specific names so they remain
# distinguishable from the donor coordinates (Don_lat / Don_long) later on.
cand_dataset = cand_dataset.rename(
    columns={
        "latitude": "Can_lat",
        "longitude": "Can_long",
    }
)


In [5]:
# Confirm the rename: the coordinate columns should now read Can_lat / Can_long.
cand_dataset

Unnamed: 0,PX_ID,CTR_CD,CTR_ID,CTR_TY,PRIMARY_ZIP,Can_lat,Can_long
0,1600263,NYNS,447,TX1,11030,40.7934,-73.6888
1,1652951,COUC,91,TX1,80045,39.7467,-104.8384
2,1634224,SCMU,595,TX1,29425,32.7862,-79.9471
3,1641765,OKMD,493,TX1,73104,35.4794,-97.5017
4,1617078,OKBC,487,TX1,73112,35.5184,-97.5746
...,...,...,...,...,...,...,...
58281,1655407,TXTX,675,TX1,75246,32.7948,-96.7697
58282,1626066,GAEH,139,TX1,30329,33.8236,-84.3214
58283,1643766,FLSL,128,TX1,32224,30.3031,-81.4404
58284,1585067,TXTC,671,TX1,77030,29.7041,-95.4010


In [6]:
# Count missing values in each candidate coordinate column; the Can_lat count
# is printed first, then the Can_long count, one per line.
print(*cand_dataset[["Can_lat", "Can_long"]].isna().sum(), sep="\n")


0
0


In [7]:
# Collect every candidate row in which at least one coordinate is missing.
missing_rows = cand_dataset.loc[cand_dataset[["Can_lat", "Can_long"]].isna().any(axis=1)]
print("Number of rows missing either Can_lat or Can_long:", missing_rows.shape[0])


Number of rows missing either Can_lat or Can_long: 0


In [8]:
# Load the donor table (columns include DONOR_ID, CTR_CD, CTR_ID, CTR_TY,
# PRIMARY_ZIP, latitude and longitude, per the preview below).
# NOTE(review): DONOR_ID and PRIMARY_ZIP display as floats in the preview
# (e.g. a ZIP shown as 2451.0), so any leading zero in a ZIP code is not
# preserved — confirm whether these should be read as strings/integers.
# NOTE(review): the 'Unnamed: 0' column looks like a saved row index — confirm.
donor_dataset = pd.read_csv("donor_hospital_withlatitudeandlongitude.csv")

In [9]:
# Preview the freshly loaded donor table (pandas truncates the display).
donor_dataset

Unnamed: 0,DONOR_ID,CTR_CD,CTR_ID,CTR_TY,PRIMARY_ZIP,latitude,longitude
0,504553.0,WALC,730,OP1,98006.0,47.5614,-122.1552
1,507628.0,TXGC,637,OP1,77054.0,29.6852,-95.4017
2,507859.0,WIDN,832,OP1,53233.0,43.0407,-87.9357
3,507734.0,MAOB,302,OP1,2451.0,42.3986,-71.2451
4,507747.0,TXSB,661,OP1,75231.0,32.8756,-96.7495
...,...,...,...,...,...,...,...
107530,716945.0,KYDA,207,OP1,40223.0,38.2651,-85.5582
107531,716901.0,NYRT,450,OP1,11101.0,40.7446,-73.9345
107532,716751.0,NYRT,450,OP1,11101.0,40.7446,-73.9345
107533,717147.0,INOP,200,OP1,46222.0,39.7890,-86.2136


In [10]:
# Give the coordinate columns donor-specific names so they remain
# distinguishable from the candidate coordinates (Can_lat / Can_long).
donor_dataset = donor_dataset.rename(
    columns={
        "latitude": "Don_lat",
        "longitude": "Don_long",
    }
)

In [11]:
# Count missing values in each donor coordinate column; the Don_lat count is
# printed first, then the Don_long count, one per line.
print(*donor_dataset[["Don_lat", "Don_long"]].isna().sum(), sep="\n")

37
37


In [12]:
# Collect every donor row in which at least one coordinate is missing.
# (This reuses the `missing_rows` name from the candidate check above.)
missing_rows = donor_dataset.loc[donor_dataset[["Don_lat", "Don_long"]].isna().any(axis=1)]
print("Number of rows missing either don_lat or don_long:", missing_rows.shape[0])


Number of rows missing either don_lat or don_long: 37


In [13]:
# Inspect the donor rows that lack coordinates. In the rows shown below they
# all carry CTR_CD 'ZCAN' / CTR_TY 'FOP' and have no PRIMARY_ZIP either.
missing_rows 

Unnamed: 0,DONOR_ID,CTR_CD,CTR_ID,CTR_TY,PRIMARY_ZIP,Don_lat,Don_long
165,509916.0,ZCAN,754,FOP,,,
1854,513174.0,ZCAN,754,FOP,,,
4509,517051.0,ZCAN,754,FOP,,,
5724,519674.0,ZCAN,754,FOP,,,
6483,520806.0,ZCAN,754,FOP,,,
12247,532987.0,ZCAN,754,FOP,,,
13515,535809.0,ZCAN,754,FOP,,,
15923,540659.0,ZCAN,754,FOP,,,
16209,541545.0,ZCAN,754,FOP,,,
18340,545679.0,ZCAN,754,FOP,,,


In [14]:
# Keep only donors that have both coordinates; rows missing either Don_lat or
# Don_long cannot be used in a distance calculation. The original index labels
# of the surviving rows are kept.
donor_dataset = donor_dataset.dropna(subset=["Don_lat", "Don_long"])

In [15]:
# Verify the filter: report how many donors remain and confirm that neither
# coordinate column contains missing values any more.
print(f"Remaining rows: {len(donor_dataset)}")
print(f"Missing Don_lat: {donor_dataset['Don_lat'].isna().sum()}")
print(f"Missing Don_long: {donor_dataset['Don_long'].isna().sum()}")


Remaining rows: 107498
Missing Don_lat: 0
Missing Don_long: 0


In [16]:
# Sanity-check the haversine distance on a small, reproducible sample of
# candidate/donor pairs before attempting the full computation.
# pandas (pd) and numpy (np) are already imported in the first cell, so they
# are not re-imported here.


def haversine_distance(lat1_deg, lon1_deg, lat2_deg, lon2_deg, radius):
    """Return the great-circle distance between paired points (haversine formula).

    Parameters
    ----------
    lat1_deg, lon1_deg : scalar, numpy array or pandas Series
        Latitude / longitude of the first points, in decimal degrees.
    lat2_deg, lon2_deg : scalar, numpy array or pandas Series
        Latitude / longitude of the second points, in decimal degrees.
    radius : float
        Radius of the sphere; the result is expressed in the same unit.

    Returns
    -------
    scalar, numpy array or pandas Series
        Element-wise distance along the surface of the sphere.
    """
    lat1_r, lon1_r = np.radians(lat1_deg), np.radians(lon1_deg)
    lat2_r, lon2_r = np.radians(lat2_deg), np.radians(lon2_deg)

    lat_diff = lat2_r - lat1_r
    lon_diff = lon2_r - lon1_r

    a = np.sin(lat_diff / 2) ** 2 + np.cos(lat1_r) * np.cos(lat2_r) * np.sin(lon_diff / 2) ** 2
    # Floating-point rounding can push `a` marginally above 1 for nearly
    # antipodal points, which would turn sqrt(1 - a) into NaN; clamp it.
    a = np.minimum(a, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return radius * c


# randomly sample 10 candidates and 10 donors (fixed seed => reproducible)
cand_sample = cand_dataset.sample(10, random_state=42).reset_index(drop=True)
donor_sample = donor_dataset.sample(10, random_state=42).reset_index(drop=True)

# Pair the i-th sampled candidate with the i-th sampled donor, side by side.
# Column labels present in both tables (e.g. CTR_CD) appear twice in `pairs`,
# so only the uniquely named columns are used below.
pairs = pd.concat([cand_sample, donor_sample], axis=1)

# Earth radius, in nautical miles per the variable name.
# NOTE(review): the commonly quoted mean Earth radius is about 3440.07 nm
# (6371 km) — confirm that 3435.265 is the intended value.
radius_nm = 3435.265
pairs["cand_don_distance_nm"] = haversine_distance(
    pairs["Can_lat"],
    pairs["Can_long"],
    pairs["Don_lat"],
    pairs["Don_long"],
    radius=radius_nm,
)

print(pairs[["PX_ID", "DONOR_ID", "Can_lat", "Can_long", "Don_lat", "Don_long", "cand_don_distance_nm"]])


     PX_ID  DONOR_ID    Can_lat    Can_long   Don_lat  Don_long  \
0  1370601  713400.0  30.303100  -81.440400  39.96600 -75.15100   
1  1658842  578488.0  40.744300  -73.978100  39.78900 -86.21360   
2  1539627  620464.0  41.897100  -87.622300  38.61850 -90.25640   
3  1502893  533419.0  40.838200  -73.942000  18.40609 -66.10123   
4  1466379  691294.0  29.704100  -95.401000  39.02780 -94.65580   
5  1190788  696875.0  41.085600  -73.777600  41.44160 -81.54860   
6  1442088  644184.0  18.382981  -66.064729  39.98740 -83.04560   
7  1498761  611440.0  44.968200  -93.242900  36.19010 -86.80530   
8  1426892  528146.0  37.763100 -122.458600  40.70040 -74.40230   
9  1456785  567080.0  38.618500  -90.256400  29.68520 -95.40170   

   cand_don_distance_nm  
0            655.867435  
1            562.228818  
2            230.551542  
3           1404.140878  
4            560.226964  
5            350.758130  
6           1564.703925  
7            601.962807  
8           2210.909600  
9 

In [17]:
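# NOTE: the next cell materialises the full candidate x donor cross product (roughly 58k x 107k rows, i.e. billions of pairs, per the table sizes shown above). Expect a very long run time and a very large memory footprint.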

In [None]:
# Create all candidate–donor pairs (full Cartesian product of the two tables)
# and compute the haversine distance for every pair.
#
# WARNING: the result has len(cand_dataset) * len(donor_dataset) rows, so this
# cell needs a very large amount of memory and time.
#
# Degrees are converted to radians once per source row *before* the join
# rather than once per pair: the values are identical, but the conversion
# touches far fewer rows. The helper columns cand_lat_r / cand_lon_r /
# don_lat_r / don_lon_r are still present in the result; they now sit next to
# their source table's columns instead of at the end of the frame.
# `how="cross"` (pandas >= 1.2) builds the Cartesian product directly, so no
# dummy join key has to be added and dropped again.
cand_don_pairs = cand_dataset.assign(
    cand_lat_r=lambda df: np.radians(df["Can_lat"]),
    cand_lon_r=lambda df: np.radians(df["Can_long"]),
).merge(
    donor_dataset.assign(
        don_lat_r=lambda df: np.radians(df["Don_lat"]),
        don_lon_r=lambda df: np.radians(df["Don_long"]),
    ),
    how="cross",
)

# Haversine formula: `a` is the squared half-chord length between the two
# points and `c` the angular distance in radians.
lat_diff = cand_don_pairs["don_lat_r"] - cand_don_pairs["cand_lat_r"]
lon_diff = cand_don_pairs["don_lon_r"] - cand_don_pairs["cand_lon_r"]

a = np.sin(lat_diff / 2) ** 2 + np.cos(cand_don_pairs["cand_lat_r"]) * np.cos(cand_don_pairs["don_lat_r"]) * np.sin(lon_diff / 2) ** 2
c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

# Earth radius, in nautical miles per the variable name; the distance column
# is therefore in the same unit.
# NOTE(review): the commonly quoted mean Earth radius is about 3440.07 nm
# (6371 km) — confirm that 3435.265 is the intended value.
radius_nm = 3435.265
cand_don_pairs["cand_don_distance"] = radius_nm * c


print(cand_don_pairs[["PX_ID", "DONOR_ID", "cand_don_distance"]].head())


In [18]:
# Report the wall-clock seconds elapsed since `start_time` was recorded in the
# first cell.
# NOTE(review): the cross-join cell above is marked In [None] (not executed),
# so the recorded figure presumably excludes it — confirm before quoting it.
end_time = time.time()
time_diff = end_time - start_time
time_diff

57.20306134223938