In [None]:
import os

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
import matplotlib.pyplot as plt
from joblib import parallel_backend

In [None]:
# Read the flights CSV into a DataFrame; the path is relative to the working directory.
flights_csv_path = 'data-p4/Flights1_2019_1.csv'
df = pd.read_csv(flights_csv_path)

Unnamed: 0,YEAR,DAY_OF_WEEK,FL_DATE,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN_CITY_NAME,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST_CITY_MARKET_ID,DEST_CITY_NAME,DEST_STATE_ABR,DEP_DELAY,ARR_TIME,ARR_DELAY,ARR_DELAY_NEW,ARR_DEL15,Unnamed: 17
0,2019,6,2019-01-19,13487,1348702,31650,"Minneapolis, MN",11193,1119302,33105,"Cincinnati, OH",KY,-10.0,1832.0,-25.0,0.0,0.0,
1,2019,7,2019-01-20,13487,1348702,31650,"Minneapolis, MN",11193,1119302,33105,"Cincinnati, OH",KY,-4.0,1825.0,-37.0,0.0,0.0,
2,2019,1,2019-01-21,13487,1348702,31650,"Minneapolis, MN",11193,1119302,33105,"Cincinnati, OH",KY,-9.0,1845.0,-17.0,0.0,0.0,
3,2019,2,2019-01-22,13487,1348702,31650,"Minneapolis, MN",11193,1119302,33105,"Cincinnati, OH",KY,-4.0,1839.0,-23.0,0.0,0.0,
4,2019,3,2019-01-23,13487,1348702,31650,"Minneapolis, MN",11193,1119302,33105,"Cincinnati, OH",KY,-6.0,1850.0,-12.0,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
583980,2019,3,2019-01-30,14057,1405702,34057,"Portland, OR",13930,1393007,30977,"Chicago, IL",IL,-9.0,1225.0,-27.0,0.0,0.0,
583981,2019,3,2019-01-30,13930,1393007,30977,"Chicago, IL",14908,1490803,32575,"Santa Ana, CA",CA,35.0,2230.0,17.0,17.0,1.0,
583982,2019,3,2019-01-30,14635,1463502,31714,"Fort Myers, FL",13930,1393007,30977,"Chicago, IL",IL,132.0,2035.0,113.0,113.0,1.0,
583983,2019,3,2019-01-30,11618,1161802,31703,"Newark, NJ",13930,1393007,30977,"Chicago, IL",IL,32.0,841.0,37.0,37.0,1.0,


In [None]:
# Raw delay column -> name of the column that receives its rescaled values.
delay_to_scaled = {
    'DEP_DELAY': 'DEP_DELAY_scaled',
    'ARR_DELAY': 'ARR_DELAY_scaled',
}

# Discard rows that are missing either delay value (mutates df in place).
df.dropna(subset=list(delay_to_scaled), inplace=True)


def _min_max_to_signed_unit(series):
    """Linearly map `series` so its minimum becomes -1 and its maximum becomes +1."""
    lowest = series.min()
    highest = series.max()
    return 2 * (series - lowest) / (highest - lowest) - 1


for raw_column, scaled_column in delay_to_scaled.items():
    df[scaled_column] = _min_max_to_signed_unit(df[raw_column])

In [None]:
# Stack the two rescaled delay columns into an (n_rows, 2) float32 array,
# the input for the pairwise-distance computation.
scaled_columns = ['DEP_DELAY_scaled', 'ARR_DELAY_scaled']
scaled_frame = df[scaled_columns]
vals = scaled_frame.values.astype(np.float32)

In [None]:
# Side length of each square tile of the pairwise-distance matrix.
batch_size = 25000  # can adjust based on available RAM
# Number of points (rows of `vals`).
n = vals.shape[0]
# NOTE: a dense (n, n) array is deliberately NOT allocated here. np.zeros((n, n))
# would request n * n * 8 bytes of float64, which is far beyond RAM for a frame of
# this size, and the tile loop rebinds `distances` to each tile anyway.

In [None]:
# Compute the pairwise Euclidean distance matrix tile by tile and persist each
# (batch_size x batch_size) tile as float32, so the full n x n matrix never has
# to be held in memory. Every (i, j) tile is written, i.e. n * n * 4 bytes on
# disk in total.
output_dir = "data-p4/pairwise_distances"  # change to your path
# np.save does not create missing directories; make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

# Enter the joblib backend context once rather than once per tile.
with parallel_backend('threading', n_jobs=-1):
    for i in range(0, n, batch_size):
        i_end = min(i + batch_size, n)  # clamp so the last tile reports its true extent
        for j in range(0, n, batch_size):
            j_end = min(j + batch_size, n)
            print(f"Processing block ({i}:{i_end}, {j}:{j_end})...")

            # Distances between rows i:i_end and rows j:j_end of `vals`.
            distances = pairwise_distances(
                vals[i:i_end],
                vals[j:j_end],
                metric='euclidean'
            ).astype(np.float32)
            # File name encodes the tile's starting row/column offsets.
            np.save(f"{output_dir}/distances_{i}_{j}.npy", distances)

Processing block (0:25000, 0:25000)...
Processing block (0:25000, 25000:50000)...
Processing block (0:25000, 50000:75000)...
Processing block (0:25000, 75000:100000)...
Processing block (0:25000, 100000:125000)...
Processing block (0:25000, 125000:150000)...
Processing block (0:25000, 150000:175000)...
Processing block (0:25000, 175000:200000)...
Processing block (0:25000, 200000:225000)...
Processing block (0:25000, 225000:250000)...
Processing block (0:25000, 250000:275000)...
Processing block (0:25000, 275000:300000)...
Processing block (0:25000, 300000:325000)...
Processing block (0:25000, 325000:350000)...
Processing block (0:25000, 350000:375000)...
Processing block (0:25000, 375000:400000)...
Processing block (0:25000, 400000:425000)...
Processing block (0:25000, 425000:450000)...
Processing block (0:25000, 450000:475000)...
Processing block (0:25000, 475000:500000)...
Processing block (0:25000, 500000:525000)...
Processing block (0:25000, 525000:550000)...
Processing block (0:2

In [None]:
# Distance To All Points (DTAP): for every point, the sum of its distances to
# all points. After the tile loop `distances` only holds the last tile, so the
# row sums are accumulated from the tiles saved on disk instead.
dtap = np.zeros(n, dtype=np.float64)  # float64 accumulator limits rounding error
for i in range(0, n, batch_size):
    for j in range(0, n, batch_size):
        # Memory-map the tile read-only; it is only streamed through once.
        tile = np.load(f"data-p4/pairwise_distances/distances_{i}_{j}.npy", mmap_mode="r")
        # Tile rows correspond to points i .. i + tile.shape[0] - 1.
        dtap[i:i + tile.shape[0]] += tile.sum(axis=1, dtype=np.float64)

# Positional assignment: `dtap` has one entry per row of df, in row order.
df['dtap'] = dtap

In [None]:
# Scatter each row's DTAP value against its DataFrame index label.
fig, ax = plt.subplots()
ax.scatter(df.index, df['dtap'])
ax.set_title("Distance to All Points")
ax.set_xlabel("Index")
ax.set_ylabel("DTAP")
plt.show()

In [None]:
# Index labels of the 10 rows with the largest DTAP, largest first.
top_dtap = df['dtap'].nlargest(10)
top_dtap.index

In [None]:
# Distance To Nearest Neighbour (DTNN): for every point, the smallest distance
# to any OTHER point. After the tile loop `distances` only holds the last tile,
# so a running row-wise minimum is taken over the tiles saved on disk instead.
dtnn = np.full(n, np.inf, dtype=np.float32)
for i in range(0, n, batch_size):
    for j in range(0, n, batch_size):
        # Load an in-memory copy so the diagonal can be masked without touching the file.
        tile = np.load(f"data-p4/pairwise_distances/distances_{i}_{j}.npy")
        if i == j:
            # Only diagonal tiles contain self-distances (point k vs. point k);
            # mask them so a point is never its own nearest neighbour.
            np.fill_diagonal(tile, np.inf)
        rows = slice(i, i + tile.shape[0])
        dtnn[rows] = np.minimum(dtnn[rows], tile.min(axis=1))

# Positional assignment: `dtnn` has one entry per row of df, in row order.
df['dtnn'] = dtnn

In [None]:
# Scatter each row's DTNN value against its DataFrame index label.
fig, ax = plt.subplots()
ax.scatter(df.index, df['dtnn'])
ax.set_title("Distance to Nearest Neighbour")
ax.set_xlabel("Index")
ax.set_ylabel("DTNN")
plt.show()

In [None]:
# Index labels of the 10 rows with the largest DTNN, largest first.
top_dtnn = df['dtnn'].nlargest(10)
top_dtnn.index