In [1]:
# This notebook applies K-nearest-neighbours (KNN) regression to the weather station data and uses the
# distance-weighted average of each row's nearest neighbours to fill in the missing values in that data

In [9]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor

In [10]:
# Load the weather-station observations from the CSV file into a DataFrame
weather_csv_path = "weather_station_data_updated.csv"
data = pd.read_csv(weather_csv_path)

In [11]:
# Summarise column dtypes and non-null counts to see which columns have gaps
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 453138 entries, 0 to 453137
Data columns (total 12 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   tavg       342866 non-null  float64
 1   tmin       354263 non-null  float64
 2   tmax       354009 non-null  float64
 3   prcp       283351 non-null  float64
 4   wspd       295828 non-null  float64
 5   pres       202943 non-null  float64
 6   tsun       2229 non-null    float64
 7   latitude   453138 non-null  float64
 8   longitude  453138 non-null  float64
 9   elevation  453138 non-null  float64
 10  state      453103 non-null  object 
 11  time       453138 non-null  object 
dtypes: float64(10), object(2)
memory usage: 41.5+ MB


In [18]:
# Impute missing 'pres' values with a distance-weighted K-nearest-neighbours
# regressor whose only features are the station coordinates (latitude, longitude).
#
# NOTE(review): 'time' is not used as a feature, so a missing reading is filled
# from the spatially nearest rows regardless of when they were recorded -
# confirm this is the intended behaviour.
# NOTE(review): with the default metric, neighbour distance is Euclidean on raw
# degrees rather than great-circle distance - confirm this is acceptable.

# Work on a copy so the original DataFrame stays untouched
data_imputed = data.copy()

# Boolean mask of rows whose 'pres' value is missing; computed once and reused
# for both the train/predict split and the final assignment
pres_missing_mask = data['pres'].isna()

# Separate rows with and without missing 'pres' values
non_missing_data = data[~pres_missing_mask]
missing_data = data[pres_missing_mask]

# Define features (latitude and longitude) and target ('pres') for training the KNN model
X_non_missing = non_missing_data[['latitude', 'longitude']]
y_non_missing = non_missing_data['pres']
X_missing = missing_data[['latitude', 'longitude']]

# Each prediction is the inverse-distance-weighted mean of the 4 nearest training rows
knn_imputer = KNeighborsRegressor(n_neighbors=4, weights='distance')
knn_imputer.fit(X_non_missing, y_non_missing)

if pres_missing_mask.any():
    # Predict (impute) missing 'pres' values and write them into the copy
    imputed_values = knn_imputer.predict(X_missing)
    data_imputed.loc[pres_missing_mask, 'pres'] = imputed_values
else:
    # Nothing to impute: predict() raises ValueError on a zero-row input, so
    # skip it (keeps the cell re-runnable on data that is already complete)
    imputed_values = np.empty(0, dtype=float)

# Sanity check: every 'pres' gap must now be filled
assert data_imputed['pres'].notna().all(), "'pres' still contains missing values after imputation"




In [20]:
# Re-inspect non-null counts after imputation to check that the 'pres' gaps were filled
data_imputed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 453138 entries, 0 to 453137
Data columns (total 12 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   tavg       342866 non-null  float64
 1   tmin       354263 non-null  float64
 2   tmax       354009 non-null  float64
 3   prcp       283351 non-null  float64
 4   wspd       295828 non-null  float64
 5   pres       453138 non-null  float64
 6   tsun       2229 non-null    float64
 7   latitude   453138 non-null  float64
 8   longitude  453138 non-null  float64
 9   elevation  453138 non-null  float64
 10  state      453103 non-null  object 
 11  time       453138 non-null  object 
dtypes: float64(10), object(2)
memory usage: 41.5+ MB


In [19]:
# Write the imputed dataset to disk; index=False keeps the row index out of the CSV
data_imputed.to_csv(path_or_buf='ws_4n.csv', index=False)