In [1]:
#!/usr/bin/env python
# # -*- coding: utf-8 -*-
# """
# Created on Sun Jun 23 18:44:53 2024
# @author: Mazhar
# """

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
pwd

'/content'

In [4]:
try:
    import geopandas as gpd
    import pandas as pd
    import numpy as np
    from pykrige.ok import OrdinaryKriging
    from sklearn.neighbors import KNeighborsClassifier
    from tqdm import tqdm
    import sqlite3
    import os
    import sys
except ModuleNotFoundError:
    if 'google.colab' in str(get_ipython()):
        !apt install libspatialindex-dev -qq
        !pip install fiona shapely pyproj rtree --quiet
        !pip install geopandas --quiet
        !pip install pykrige --quiet
        !pip install scikit-learn
        !pip install tqdm --quiet
        !pip install pysqlite3 --quiet
        !pip install pandas --quiet
        !pip install numpy --quiet
    else:
        print('geopandas not found, please install via conda in your environment')

libspatialindex-dev is already the newest version (1.9.3-2).
0 upgraded, 0 newly installed, 0 to remove and 0 not upgraded.
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m535.2/535.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 kB[0m [31m670.9 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for pysqlite3 (setup.py) ... [?25l[?25hdone


In [5]:
import geopandas as gpd
import pandas as pd
import numpy as np
from pykrige.ok import OrdinaryKriging
from sklearn.neighbors import KNeighborsClassifier
from tqdm import tqdm
import sqlite3
import os
import sys

#### VARS

In [6]:
ZONE = 9

# SAMPLE_FRACTION = 0.05
# SAMPLE_FRACTION = 0.0025
# SAMPLE_FRACTION = 0.0038
SAMPLE_FRACTION = 0.0033

LAYER_NAME = "spatial_Joins"

FILE_NAME = f"/content/drive/MyDrive/IPAUA_Maz/dataset/zone_{ZONE}_v2.csv"

SAVE_FILE_CSV_V2 = f"/content/drive/MyDrive/IPAUA_Maz/dataset/zone_{ZONE}_v2.csv"
SAVE_FILE_GPKG_V2 = f"/content/drive/MyDrive/IPAUA_Maz/dataset/zone_{ZONE}_v2.gpkg"

#### FUNCTIONS

In [7]:
# # HANDLE MISSING VALUES
# def kriging_interpolation_subset(df, column, x_col, y_col, sample_fraction=0.1):
#     # Check if the column is categorical
#     if df[column].dtype == 'object' or df[column].dtype.name == 'category':
#         # Handle categorical data
#         return fill_categorical_values(df, column, x_col, y_col)
#     else:
#         # Handle numerical data using Kriging
#         return fill_numerical_values(df, column, x_col, y_col, sample_fraction)

# HANDLE MISSING VALUES
def kriging_interpolation_subset(df, column, x_col, y_col, sample_fraction=0.1):
    # Subset the data to a manageable size
    sample_df = df.dropna(subset=[column]).sample(frac=sample_fraction, random_state=1)

    known_values = sample_df[column].values
    known_coords = sample_df[[x_col, y_col]].values
    missing_coords = df[df[column].isna()][[x_col, y_col]].values

    # Perform Ordinary Kriging
    kriging = OrdinaryKriging(known_coords[:, 0], known_coords[:, 1], known_values,
                              variogram_model='linear', verbose=False, enable_plotting=False)
    interpolated_values, ss = kriging.execute('points', missing_coords[:, 0], missing_coords[:, 1])

    # Fill the missing values in the DataFrame
    df.loc[df[column].isna(), column] = interpolated_values
    return df

def fill_numerical_values(df, column, x_col, y_col, sample_fraction):
    # Subset the data to a manageable size
    sample_df = df.dropna(subset=[column]).sample(frac=sample_fraction, random_state=1)

    known_values = sample_df[column].values
    known_coords = sample_df[[x_col, y_col]].values
    missing_coords = df[df[column].isna()][[x_col, y_col]].values

    # Perform Ordinary Kriging
    kriging = OrdinaryKriging(known_coords[:, 0], known_coords[:, 1], known_values,
                              variogram_model='linear', verbose=False, enable_plotting=False)
    interpolated_values, ss = kriging.execute('points', missing_coords[:, 0], missing_coords[:, 1])

    # Fill the missing values in the DataFrame
    df.loc[df[column].isna(), column] = interpolated_values
    return df

def fill_categorical_values(df, column, x_col, y_col):
    # Prepare data for KNN
    complete_data = df.dropna(subset=[column])
    missing_data = df[df[column].isna()]

    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(complete_data[[x_col, y_col]], complete_data[column])

    # Predict missing values
    predicted_values = knn.predict(missing_data[[x_col, y_col]])

    # Fill the missing values in the DataFrame
    df.loc[df[column].isna(), column] = predicted_values
    return df

# Usage
# df = pd.read_csv('data.csv')
# df = kriging_interpolation_subset(df, 'column', 'Longitude', 'Latitude', sample_fraction=SAMPLE_FRACTION)

In [8]:
# def kriging_interpolation_subset(df, column, x_col, y_col, sample_fraction=SAMPLE_FRACTION):
#     """
#     Apply Kriging interpolation to a subset of a DataFrame.

#     Args:
#         df: The DataFrame containing the data.
#         column: The name of the column to interpolate.
#         x_col: The name of the column containing x-coordinates (e.g., 'Longitude').
#         y_col: The name of the column containing y-coordinates (e.g., 'Latitude').
#         sample_fraction: Fraction of data to use for Kriging (default: 0.0048).

#     Returns:
#         The DataFrame with the interpolated column.
#     """

#     # Handle potential missing column names
#     if x_col not in df.columns:
#         raise ValueError(f"Column '{x_col}' not found in DataFrame.")
#     if y_col not in df.columns:
#         raise ValueError(f"Column '{y_col}' not found in DataFrame.")

#     # Extract relevant data
#     df_subset = df[[column, x_col, y_col]].dropna()  # Drop rows with missing values

#     # Check if there are valid data points for Kriging
#     if df_subset.empty:
#         print(f"Warning: No valid data points for Kriging in column '{column}'. Skipping.")
#         return df

#     # Sample data if needed
#     if sample_fraction < SAMPLE_FRACTION:
#         df_subset = df_subset.sample(frac=sample_fraction)

#     # Prepare data for Kriging
#     known_coords = df_subset[[x_col, y_col]].values
#     known_values = df_subset[column].values
#     missing_coords = df.loc[df[column].isna(), [x_col, y_col]].values

#     # Perform Ordinary Kriging
#     kriging = OrdinaryKriging(known_coords[:, 0], known_coords[:, 1], known_values,
#                               variogram_model='linear', verbose=False, enable_plotting=False)
#     interpolated_values, ss = kriging.execute('points', missing_coords[:, 0], missing_coords[:, 1])

#     # Update DataFrame with interpolated values
#     df.loc[df[column].isna(), column] = interpolated_values

#     return df

#### LOADING .CSV FILE

In [9]:
gdf = pd.read_csv(FILE_NAME)

In [10]:
df = gdf.copy()

In [11]:
df.head(2)

Unnamed: 0,soil_moisture,NDBI,BU,Roughness,Slope,NDVI,LST,UHI,UTFVI,NDWI,...,GHI (kWh/m2),CH4_column_volume_mixing_ratio_dry_air,CO_column_number_density,tropospheric_HCHO_column_number_density,NO2_column_number_density,O3_column_number_density,SO2_column_number_density,Longitude,Latitude,geometry
0,45.511588,-0.03436,-0.088197,52,89.990234,0.20503,32.807213,-0.024135,-0.001928,-0.197248,...,1407.3,,,,,,,9.19237,45.480355,POINT (9.192370471 45.48035535999999)
1,41.233572,-0.03436,-0.088197,52,89.990234,0.20503,32.807213,-0.024135,-0.001928,-0.273191,...,1407.3,,,,,,,9.19255,45.480355,POINT (9.192550134 45.48035536)


#### Drop Columns

In [12]:
# For Zone 4
# df = df.drop(columns=['Unnamed: 0'])

# For Zone 9
# df = df.drop(columns=['Unnamed: 0_left', 'Unnamed: 0_right'])

In [13]:
df.columns.to_list()

['soil_moisture',
 'NDBI',
 'BU',
 'Roughness',
 'Slope',
 'NDVI',
 'LST',
 'UHI',
 'UTFVI',
 'NDWI',
 'SAVI',
 'lulc_classes',
 'Amenity',
 'Land-Use',
 'GHI (kWh/m2)',
 'CH4_column_volume_mixing_ratio_dry_air',
 'CO_column_number_density',
 'tropospheric_HCHO_column_number_density',
 'NO2_column_number_density',
 'O3_column_number_density',
 'SO2_column_number_density',
 'Longitude',
 'Latitude',
 'geometry']

#### Rename Columns

In [14]:
# Rename Cols
df.columns = ['soil_moisture', 'NDBI', 'BU', 'Roughness', 'Slope', 'NDVI', 'LST', 'UHI', 'UTFVI',
 'NDWI', 'SAVI', 'lulc_classes', 'Amenity', 'LandUse', 'GHI', 'CH4',
 'CO', 'HCHO', 'NO2', 'O3', 'SO2', 'Longitude', 'Latitude', 'geometry']

In [15]:
df.head(2)

Unnamed: 0,soil_moisture,NDBI,BU,Roughness,Slope,NDVI,LST,UHI,UTFVI,NDWI,...,GHI,CH4,CO,HCHO,NO2,O3,SO2,Longitude,Latitude,geometry
0,45.511588,-0.03436,-0.088197,52,89.990234,0.20503,32.807213,-0.024135,-0.001928,-0.197248,...,1407.3,,,,,,,9.19237,45.480355,POINT (9.192370471 45.48035535999999)
1,41.233572,-0.03436,-0.088197,52,89.990234,0.20503,32.807213,-0.024135,-0.001928,-0.273191,...,1407.3,,,,,,,9.19255,45.480355,POINT (9.192550134 45.48035536)


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1012174 entries, 0 to 1012173
Data columns (total 24 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   soil_moisture  1012174 non-null  float64
 1   NDBI           1012174 non-null  float64
 2   BU             1012174 non-null  float64
 3   Roughness      1012174 non-null  int64  
 4   Slope          1012174 non-null  float64
 5   NDVI           1012174 non-null  float64
 6   LST            1012174 non-null  float64
 7   UHI            1012174 non-null  float64
 8   UTFVI          1012174 non-null  float64
 9   NDWI           1012174 non-null  float64
 10  SAVI           1012174 non-null  float64
 11  lulc_classes   1012174 non-null  int64  
 12  Amenity        1011790 non-null  object 
 13  LandUse        1012174 non-null  object 
 14  GHI            1005960 non-null  float64
 15  CH4            136911 non-null   float64
 16  CO             136911 non-null   float64
 17  HCHO    

In [17]:
# Summarize NaN values for each column
nan_summary = df.isna().sum()
nan_summary

soil_moisture         0
NDBI                  0
BU                    0
Roughness             0
Slope                 0
NDVI                  0
LST                   0
UHI                   0
UTFVI                 0
NDWI                  0
SAVI                  0
lulc_classes          0
Amenity             384
LandUse               0
GHI                6214
CH4              875263
CO               875263
HCHO             875263
NO2              875263
O3               875263
SO2              875263
Longitude             0
Latitude              0
geometry              0
dtype: int64

In [18]:
# # Move the geometry column to the last position
# columns = [col for col in df.columns if col != 'geometry'] + ['geometry']
# df = df[columns]

#### Coumns with Missing Values

In [19]:
cols = df.columns.to_list()
cols = cols[14:-3]
cols

['GHI', 'CH4', 'CO', 'HCHO', 'NO2', 'O3', 'SO2']

#### Fill Missing Values in a loop

In [20]:
# Apply Kriging interpolation to each column
for col in tqdm(cols, colour="blue", desc="Handling Missing Values"):
    df = kriging_interpolation_subset(df, col, 'Longitude', 'Latitude', sample_fraction=SAMPLE_FRACTION)

Handling Missing Values: 100%|[34m██████████[0m| 7/7 [02:09<00:00, 18.49s/it]


#### Fill Categorical Columns

In [33]:
# For Zone 9
df = fill_categorical_values(df, 'Amenity', 'Longitude', 'Latitude')

ValueError: Found array with 0 sample(s) (shape=(0, 2)) while a minimum of 1 is required by KNeighborsClassifier.

In [21]:
# sample_fraction=SAMPLE_FRACTION
# df = kriging_interpolation_subset(df, cols[0], 'Longitude', 'Latitude', sample_fraction=SAMPLE_FRACTION)

In [23]:
print("Wait..........Finishing UP")
# Reproject the merged GeoDataFrame back to geographic CRS
# df = df.to_crs(epsg=4326)

# Drop the geometry column if no longer needed
# df = df.drop(columns='geometry')

# Drop duplicates only if all values in the row are duplicates
df = df.drop_duplicates()

# Drop rows where 'Value' in column is null
df = df.dropna(how="all", subset=df.columns[:-3].to_list())

# Extract the Longitude and Latitude from the geometry column
# df['Longitude'] = df.geometry.x
# df['Latitude'] = df.geometry.y

# Move the geometry column to the last position
columns = [col for col in df.columns if col != 'geometry'] + ['geometry']
df = df[columns]

Wait..........Finishing UP


In [30]:
# Summarize NaN values for each column
nan_summary = df.isna().sum()
nan_summary

soil_moisture    0
NDBI             0
BU               0
Roughness        0
Slope            0
NDVI             0
LST              0
UHI              0
UTFVI            0
NDWI             0
SAVI             0
lulc_classes     0
Amenity          0
LandUse          0
GHI              0
CH4              0
CO               0
HCHO             0
NO2              0
O3               0
SO2              0
Longitude        0
Latitude         0
geometry         0
dtype: int64

#### Saving Data to 'CSV' and 'GPKG' Files

In [31]:
# Save to CSV file
print(f"DATA IS BEING SAVED TO .CSV FILE ........Please Wait")
df.to_csv(SAVE_FILE_CSV_V2, index=False)
print(f"DATA SAVE TO {SAVE_FILE_CSV_V2}")

# Get the file size in bytes
file_size_bytes = os.path.getsize(SAVE_FILE_CSV_V2)
# Convert the file size to megabytes
file_size_mb = file_size_bytes / (1024 * 1024)
print(f"File Size of {SAVE_FILE_CSV_V2}: {file_size_mb:.2f} MB")

DATA IS BEING SAVED TO .CSV FILE ........Please Wait
DATA SAVE TO /content/drive/MyDrive/IPAUA_Maz/dataset/zone_9_v2.csv
File Size of /content/drive/MyDrive/IPAUA_Maz/dataset/zone_9_v2.csv: 356.46 MB


In [32]:
# Save to GPKG file
print(f"DATA IS BEING SAVED TO .GPKG FILE ........Please Wait")
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df["Longitude"], df["Latitude"]), crs="EPSG:4326")
gdf.to_file(driver='GPKG', filename=SAVE_FILE_GPKG_V2, layer=LAYER_NAME)
print(f"DATA SAVED TO {SAVE_FILE_GPKG_V2}")

# Get the file size in bytes
file_size_bytes = os.path.getsize(SAVE_FILE_GPKG_V2)
# Convert the file size to megabytes
file_size_mb = file_size_bytes / (1024 * 1024)
print(f"File Size of {SAVE_FILE_GPKG_V2}: {file_size_mb:.2f} MB")

DATA IS BEING SAVED TO .GPKG FILE ........Please Wait
DATA SAVED TO /content/drive/MyDrive/IPAUA_Maz/dataset/zone_9_v2.gpkg
File Size of /content/drive/MyDrive/IPAUA_Maz/dataset/zone_9_v2.gpkg: 283.09 MB


In [35]:
# df.head()

In [27]:
# gdf.head()