## Create Data Generating Process (DGP)

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import NearestNeighbors

# Seed NumPy's global RNG so that every draw below is reproducible
np.random.seed(0)

# Sample sizes: y and x1 share one set of locations, x2 lives on another
num_locations_y_x1 = 50
num_locations_x2 = 50

# Coefficients used to construct y
true_beta_x1 = 2
true_beta_x2 = 4

# NOTE: the random draws below must stay in this order; with a fixed seed,
# reordering them would change every simulated value.

# Locations on a 100 x 100 square: first for y/x1, then for x2
coordinates_y_x1 = 100 * np.random.rand(num_locations_y_x1, 2)
coordinates_x2 = 100 * np.random.rand(num_locations_x2, 2)

# Predictors, uniform on [0, 10), stored as column vectors of shape (n, 1)
X1 = 10 * np.random.rand(num_locations_y_x1, 1)
X2 = 10 * np.random.rand(num_locations_x2, 1)

# Standard-normal noise, one value per y location
noise = np.random.normal(loc=0, scale=1, size=num_locations_y_x1)

# Bring x2 onto the y locations by taking the value at the single nearest
# x2 location (nearest-neighbour interpolation, for simplicity)
nbrs_x2_y = NearestNeighbors(n_neighbors=1).fit(coordinates_x2)
_, nearest_x2_indices = nbrs_x2_y.kneighbors(coordinates_y_x1)
X2_at_y_locations = X2[nearest_x2_indices[:, 0]]

# y = beta_x1 * x1 + beta_x2 * (x2 at nearest location) + noise
y = (
    (X1[:, 0] * true_beta_x1)
    + (X2_at_y_locations[:, 0] * true_beta_x2)
    + noise
)

# Table for the y/x1 support
data_y_x1 = pd.DataFrame(
    {
        'x1': X1.flatten(),
        'y': y,
        'latitude': coordinates_y_x1[:, 0],
        'longitude': coordinates_y_x1[:, 1],
    }
)

# Table for the x2 support
data_x2 = pd.DataFrame(
    {
        'x2': X2.flatten(),
        'latitude': coordinates_x2[:, 0],
        'longitude': coordinates_x2[:, 1],
    }
)

- `data_y_x1` holds `y` and `x1`, which share the same support (the house locations). It represents housing data, where `y` is the price and `x1` is the number of bedrooms.

- `data_x2` represents point-of-interest (POI) data, simulated on a different support (a separate set of locations) from the houses.

In [2]:
# Preview the first five rows of the y/x1 table
data_y_x1.head(n=5)

Unnamed: 0,x1,y,latitude,longitude
0,3.117959,8.950635,54.88135,71.518937
1,6.963435,44.105596,60.276338,54.488318
2,3.777518,7.614906,42.36548,64.589411
3,1.796037,38.414362,43.758721,89.1773
4,0.246787,38.169319,96.366276,38.344152


In [3]:
# Preview the first five rows of the x2 table
data_x2.head(n=5)

Unnamed: 0,x2,latitude,longitude
0,3.556127,67.781654,27.000797
1,9.404319,73.519402,96.218855
2,7.653253,24.875314,57.615733
3,7.486636,59.204193,57.225191
4,9.037197,22.308163,95.274901


### Build change of support into GWR from scratch 

<h5>Basic Idea</h5>
 
- Establish a borrowing threshold (bandwidth) - 10 nearest neighbors

- While iterating over each calibration point i, select the X observations near each location j of y (using k-nearest neighbours).

- Using the same threshold, weight each collection of X, with the weights based on distances measured from i.
  
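- Calibrate a linear regression on the weighted X and the weighted y. Note that the code below computes kernel-weighted local averages of x1, x2 and y at each location and then fits a single global regression to those smoothed values, rather than a separate local regression at each calibration point.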

In [4]:
# Number of neighbours borrowed around every location
k_neighbors = 10

# k nearest y/x1 locations for each y/x1 location. The query points are the
# same points the index was fitted on.
nbrs_y_x1 = NearestNeighbors(n_neighbors=k_neighbors)
nbrs_y_x1.fit(coordinates_y_x1)
distances_y_x1, indices_y_x1 = nbrs_y_x1.kneighbors(coordinates_y_x1)

# k nearest x2 locations for each y/x1 location: the index is built on the
# x2 coordinates but queried at the y/x1 coordinates.
nbrs_x2 = NearestNeighbors(n_neighbors=k_neighbors)
nbrs_x2.fit(coordinates_x2)
distances_x2, indices_x2 = nbrs_x2.kneighbors(coordinates_y_x1)

# Gaussian kernel function for weights
def gaussian_kernel(distances, bandwidth):
    """Return Gaussian kernel weights ``exp(-0.5 * (d / h) ** 2)``.

    Parameters
    ----------
    distances : array-like or float
        Distances ``d``. Lists and tuples are converted to a NumPy array.
    bandwidth : float or array-like
        Kernel bandwidth ``h``. Every value must be strictly positive.
        Array bandwidths are broadcast against ``distances``.

    Returns
    -------
    numpy.ndarray or NumPy scalar
        Weights with the broadcast shape of the inputs. A distance of 0 gives
        a weight of 1, and weights shrink towards 0 as distance grows.

    Raises
    ------
    ValueError
        If any bandwidth value is zero, negative, or NaN.
    """
    # Accept plain Python sequences as well as arrays.
    distances = np.asarray(distances)
    # A zero or NaN bandwidth would silently produce inf/NaN weights, so it is
    # rejected up front (NaN > 0 is False, so NaN is caught by the same test).
    # The bandwidth itself is left unconverted so dtype promotion in the
    # division below is unchanged.
    if not np.all(np.asarray(bandwidth) > 0):
        raise ValueError("bandwidth must be strictly positive")
    return np.exp(-0.5 * (distances / bandwidth) ** 2)

# Kernel bandwidth that the neighbour distances are divided by
bandwidth = 10

# Convert the neighbour distances into Gaussian weights (one row per y location)
weights_y_x1 = gaussian_kernel(distances_y_x1, bandwidth)
weights_x2 = gaussian_kernel(distances_x2, bandwidth)

# Buffers for the kernel-weighted local means at each y location
X1_smoothed = np.zeros(num_locations_y_x1)
X2_smoothed = np.zeros(num_locations_y_x1)
y_smoothed = np.zeros(num_locations_y_x1)

# Walk over the locations, pairing each one with its neighbour indices and
# weights on both supports
for i, (same_idx, same_w, poi_idx, poi_w) in enumerate(
    zip(indices_y_x1, weights_y_x1, indices_x2, weights_x2)
):
    # x1 and y are averaged over the neighbouring y/x1 locations
    X1_smoothed[i] = np.average(X1[same_idx, 0], weights=same_w)
    y_smoothed[i] = np.average(y[same_idx], weights=same_w)

    # x2 is averaged over the neighbouring x2 locations
    X2_smoothed[i] = np.average(X2[poi_idx, 0], weights=poi_w)

# Collect the smoothed variables together with the y/x1 coordinates
smoothed_data = pd.DataFrame(
    {
        'x1_smoothed': X1_smoothed,
        'x2_smoothed': X2_smoothed,
        'y_smoothed': y_smoothed,
        'latitude': coordinates_y_x1[:, 0],
        'longitude': coordinates_y_x1[:, 1],
    }
)

print(smoothed_data.head())

# Ordinary least squares of smoothed y on the two smoothed predictors
feature_columns = ['x1_smoothed', 'x2_smoothed']
model = LinearRegression()
model.fit(smoothed_data[feature_columns], smoothed_data['y_smoothed'])

# Compare the fitted slopes with the coefficients used to simulate y
estimated_beta = model.coef_
print(f"True coefficients: {true_beta_x1, true_beta_x2}")
print("Estimated coefficients:", estimated_beta)

   x1_smoothed  x2_smoothed  y_smoothed   latitude  longitude
0     3.775710     3.052816   17.670302  54.881350  71.518937
1     5.250277     4.694772   32.780088  60.276338  54.488318
2     3.842698     2.538297   17.166403  42.365480  64.589411
3     3.757006     5.039907   33.430279  43.758721  89.177300
4     2.009947     8.041933   40.639347  96.366276  38.344152
True coefficients: (2, 4)
Estimated coefficients: [2.4725273  4.38312055]
