In [None]:
!pip install pykrige

Collecting pykrige
  Downloading PyKrige-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (909 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m909.7/909.7 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pykrige
Successfully installed pykrige-1.7.2


In [None]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl (297.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.1/297.1 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xgboost
Successfully installed xgboost-2.0.3


In [None]:
#importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')


from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler, OneHotEncoder,LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold,KFold,cross_val_score,learning_curve

from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, accuracy_score,recall_score,precision_score, confusion_matrix, classification_report,roc_curve,auc

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from pykrige.ok import OrdinaryKriging

In [None]:
# Load the raw zone 4 (2014) grid data, target column included.
# NOTE(review): the absolute /content path presumably assumes a Colab session - confirm.
df_zone4_2014 = pd.read_csv('/content/Final_combined_data_zone4_with_target.csv')

In [None]:
# Preview the first rows of the raw zone 4 frame
df_zone4_2014.head()

Unnamed: 0,Latitude,Longitude,Zone,NDVI,landuse,LST,NDBI,NDWI,Roughness,SAVI,Slope,SMI,solar_radiation,Suitable_Areas
0,45.47236,9.202701,zone4,,,121.681648,,,0,,0.0,,,0
1,45.47236,9.202971,zone4,,,121.681648,,,0,,0.0,,,0
2,45.47236,9.20324,zone4,,,121.681648,,,0,,0.0,,,0
3,45.47236,9.20351,zone4,,,121.681648,,,0,,0.0,,,0
4,45.47236,9.203779,zone4,,,121.681648,,,0,,0.0,,,0


In [None]:
# Column dtypes and non-null counts, to see which columns need imputation
df_zone4_2014.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52076 entries, 0 to 52075
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Latitude         52076 non-null  float64
 1   Longitude        52076 non-null  float64
 2   Zone             52076 non-null  object 
 3   NDVI             33199 non-null  float64
 4   landuse          12728 non-null  object 
 5   LST              52076 non-null  float64
 6   NDBI             35676 non-null  float64
 7   NDWI             33199 non-null  float64
 8   Roughness        52076 non-null  int64  
 9   SAVI             33199 non-null  float64
 10  Slope            52076 non-null  float64
 11  SMI              33199 non-null  float64
 12  solar_radiation  33199 non-null  float64
 13  Suitable_Areas   52076 non-null  int64  
dtypes: float64(10), int64(2), object(2)
memory usage: 5.6+ MB


In [None]:
# Missing-value count per column in the raw zone 4 data
df_zone4_2014.isna().sum()

Latitude               0
Longitude              0
Zone                   0
NDVI               18877
landuse            39348
LST                    0
NDBI               16400
NDWI               18877
Roughness              0
SAVI               18877
Slope                  0
SMI                18877
solar_radiation    18877
Suitable_Areas         0
dtype: int64

There are missing values in NDVI,landuse,NDBI,NDWI,SAVI,SMI,solar radiation columns

 NDVI,NDBI,NDWI,SAVI,SMI,solar radiation - numerical columns

 landuse - categorical column

 Kriging interpolation is used to fill null values in the numerical columns;
  the data is split into chunks before filling to manage memory usage effectively

 NearestNeighbors is used for filling null values in categorical columns

In [None]:


# Function to perform Kriging interpolation taking data as a chunk
def kriging_interpolation(chunk, column):
    """Fill the NaN entries of ``chunk[column]`` by ordinary kriging.

    An ``OrdinaryKriging`` model with a linear variogram is fitted on the rows
    where ``column`` is known (``Longitude`` as x, ``Latitude`` as y) and then
    evaluated at the rows where ``column`` is NaN.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Frame holding ``Longitude``, ``Latitude`` and ``column``.
        It is modified in place.
    column : str
        Name of the numeric column to fill.

    Returns
    -------
    pandas.DataFrame
        The same ``chunk`` object. It is returned untouched when the column
        has no missing values (nothing to predict) or no known values
        (nothing to fit on).
    """
    # Preparing the data for Kriging
    coordinates = chunk[['Longitude', 'Latitude']].values
    values = chunk[column].values

    # Compute the NaN mask once; it selects both the known and the missing rows
    missing_indices = np.isnan(values)

    # Skip the kriging fit when there is nothing to predict, and bail out
    # when there are no known values to fit the model on.
    if not missing_indices.any() or missing_indices.all():
        return chunk

    known_coordinates = coordinates[~missing_indices]
    known_values = values[~missing_indices]

    # Initializing OrdinaryKriging on the known points only
    ok = OrdinaryKriging(
        known_coordinates[:, 0],
        known_coordinates[:, 1],
        known_values,
        variogram_model='linear',
        verbose=False,
    )

    # Interpolating at the coordinates of the missing rows
    missing_coordinates = coordinates[missing_indices]
    z, _ = ok.execute('points', missing_coordinates[:, 0], missing_coordinates[:, 1])

    # The boolean mask follows row order, so this write is positional and
    # does not depend on the chunk's index labels.
    chunk.loc[missing_indices, column] = z

    return chunk

# Columns with missing values
columns_with_missing_values = ['NDVI', 'NDBI', 'NDWI', 'SAVI', 'SMI', 'solar_radiation']

# Number of chunks
n_chunks = 10

# Dividing the DataFrame into chunks of consecutive rows.
# The split is done on row positions (same partition np.array_split would give
# on the frame itself) because np.array_split on a DataFrame goes through the
# deprecated DataFrame.swapaxes; each chunk is an explicit copy so the fill
# never writes into a slice of the source frame.
row_blocks = np.array_split(np.arange(len(df_zone4_2014)), n_chunks)
chunks = [df_zone4_2014.iloc[rows].copy() for rows in row_blocks]

# Filling every column of a chunk before moving on to the next chunk.
# The returned frame is used, so this does not rely on in-place mutation.
filled_chunks = []
for chunk in chunks:
    for column in columns_with_missing_values:
        chunk = kriging_interpolation(chunk, column)
    filled_chunks.append(chunk)

# Combining the filled chunks back into a single DataFrame (done once)
df_zone4_2014 = pd.concat(filled_chunks).reset_index(drop=True)

# Checking for any remaining missing values
print(df_zone4_2014.isnull().sum())


Latitude               0
Longitude              0
Zone                   0
NDVI                   0
landuse            39348
LST                    0
NDBI                   0
NDWI                   0
Roughness              0
SAVI                   0
Slope                  0
SMI                    0
solar_radiation        0
Suitable_Areas         0
dtype: int64


In [None]:
# Per-column null counts after the kriging fill
df_zone4_2014.isna().sum()

Latitude               0
Longitude              0
Zone                   0
NDVI                   0
landuse            39211
LST                    0
NDBI                   0
NDWI                   0
Roughness              0
SAVI                   0
Slope                  0
SMI                    0
solar_radiation        0
Suitable_Areas         0
dtype: int64

In [None]:
# Distinct landuse categories; NaN marks grid points without a label
df_zone4_2014['landuse'].unique()

array([nan, 'grass', 'military', 'industrial', 'farmland', 'residential',
       'allotments', 'farmyard', 'meadow', 'forest', 'construction',
       'commercial', 'village_green', 'railway', 'retail'], dtype=object)

In [None]:
from sklearn.neighbors import NearestNeighbors
# Filling missing landuse values using nearest neighbors
def fill_missing_landuse(df_zone4_2014, default_k=5):
    """Fill missing ``landuse`` labels with the mode of the nearest labelled points.

    Neighbours are searched in (``Longitude``, ``Latitude``) space among the
    rows whose ``landuse`` is known. When several labels tie for the mode,
    the first one in sorted order is used (``Series.mode()[0]``).

    Parameters
    ----------
    df_zone4_2014 : pandas.DataFrame
        Frame with ``Longitude``, ``Latitude`` and ``landuse`` columns.
        Its ``landuse`` column is updated in place.
    default_k : int, default 5
        Number of neighbours to vote; capped at the number of labelled rows.

    Returns
    -------
    pandas.DataFrame
        The same frame object. It is returned untouched when no row has a
        label to borrow from, or when no label is missing.
    """
    # Preparing coordinates and mask for missing values
    coordinates = df_zone4_2014[['Longitude', 'Latitude']].to_numpy()
    landuse = df_zone4_2014['landuse'].to_numpy(dtype=object)
    missing_mask = pd.isnull(landuse)

    n_known_samples = int((~missing_mask).sum())

    # Nothing to learn from, or nothing to fill: skip building the model
    if n_known_samples == 0 or not missing_mask.any():
        return df_zone4_2014

    known_coordinates = coordinates[~missing_mask]
    known_landuse = landuse[~missing_mask]

    # Adjusting k if there are fewer known samples than the default k
    k = min(default_k, n_known_samples)

    nbrs = NearestNeighbors(n_neighbors=k, algorithm='ball_tree').fit(known_coordinates)

    # One batched query for all missing rows instead of one query per row
    _, indices = nbrs.kneighbors(coordinates[missing_mask])

    # Majority vote among each row's neighbours
    filled_landuse = landuse.copy()
    filled_landuse[missing_mask] = [
        pd.Series(known_landuse[neighbor_rows]).mode()[0] for neighbor_rows in indices
    ]

    # Write back by position: assigning the whole column avoids treating row
    # positions as index labels, so a non-default index is handled correctly.
    df_zone4_2014['landuse'] = filled_landuse

    return df_zone4_2014

# Impute landuse from the nearest labelled grid points
df_filled = fill_missing_landuse(df_zone4_2014)

# Number of landuse gaps left after the fill
print(df_filled['landuse'].isna().sum())

# Show the filled DataFrame
print(df_filled)

# Per-column null counts, to confirm the other columns were not affected
print(df_filled.isna().sum())

0
        Latitude  Longitude   Zone      NDVI landuse         LST      NDBI  \
0      45.472360   9.202701  zone4  0.360761   grass  121.681648 -0.130789   
1      45.472360   9.202971  zone4  0.361043   grass  121.681648 -0.130598   
2      45.472360   9.203240  zone4  0.361316   grass  121.681648 -0.130393   
3      45.472360   9.203510  zone4  0.361578   grass  121.681648 -0.130170   
4      45.472360   9.203779  zone4  0.361827   grass  121.681648 -0.129930   
...          ...        ...    ...       ...     ...         ...       ...   
52071  45.420078   9.271692  zone4  0.420701   grass  121.681648 -0.121223   
52072  45.420078   9.271961  zone4  0.420451   grass  121.681648 -0.121055   
52073  45.420078   9.272231  zone4  0.420207   grass  121.681648 -0.120892   
52074  45.420078   9.272500  zone4  0.419969   grass  121.681648 -0.120733   
52075  45.420078   9.272770  zone4  0.419738   grass  121.681648 -0.120577   

           NDWI  Roughness      SAVI  Slope       SMI  solar_

In [None]:
# Per-column null counts of the fully filled zone 4 frame
df_filled.isna().sum()

Latitude           0
Longitude          0
Zone               0
NDVI               0
landuse            0
LST                0
NDBI               0
NDWI               0
Roughness          0
SAVI               0
Slope              0
SMI                0
solar_radiation    0
Suitable_Areas     0
dtype: int64

In [None]:
# Save the filled zone 4 frame as CSV (relative path, index not written)
df_filled.to_csv('filled_data_zone4_2014.csv', index=False)

In [None]:
# Reload the filled zone 4 data from the same relative path it was saved to,
# so the round trip does not depend on the working directory being /content.
df_zone4 = pd.read_csv('filled_data_zone4_2014.csv')

In [None]:
# Preview the reloaded, filled zone 4 data
df_zone4.head()

Unnamed: 0,Latitude,Longitude,Zone,NDVI,landuse,LST,NDBI,NDWI,Roughness,SAVI,Slope,SMI,solar_radiation,Suitable_Areas
0,45.47236,9.202701,zone4,0.360761,grass,121.681648,-0.130789,-0.381272,0,0.541073,0.0,0.214375,467.293863,0
1,45.47236,9.202971,zone4,0.361043,grass,121.681648,-0.130598,-0.381346,0,0.541497,0.0,0.215051,467.363286,0
2,45.47236,9.20324,zone4,0.361316,grass,121.681648,-0.130393,-0.381412,0,0.541907,0.0,0.215769,467.438801,0
3,45.47236,9.20351,zone4,0.361578,grass,121.681648,-0.13017,-0.381469,0,0.542301,0.0,0.216531,467.520987,0
4,45.47236,9.203779,zone4,0.361827,grass,121.681648,-0.12993,-0.381516,0,0.542676,0.0,0.217339,467.610465,0


In [None]:
# Per-column null counts of the reloaded zone 4 data
df_zone4.isna().sum()

Latitude           0
Longitude          0
Zone               0
NDVI               0
landuse            0
LST                0
NDBI               0
NDWI               0
Roughness          0
SAVI               0
Slope              0
SMI                0
solar_radiation    0
Suitable_Areas     0
dtype: int64

In [None]:
# Load the raw zone 9 (2014) grid data, target column included.
# NOTE(review): the absolute /content path presumably assumes a Colab session - confirm.
df_zone9_2014 = pd.read_csv('/content/Final_combined_data_zone9_with_target.csv')

In [None]:
# Preview the first rows of the raw zone 9 frame
df_zone9_2014.head()

Unnamed: 0,Latitude,Longitude,Zone,NDVI,landuse,LST,NDBI,NDWI,Roughness,SAVI,Slope,SMI,solar_radiation,Suitable_Areas
0,45.5365,9.143143,zone9,,,,,,0,,0.0,,,0
1,45.5365,9.143412,zone9,,,,,,0,,0.0,,,0
2,45.5365,9.143682,zone9,,,,,,0,,0.0,,,0
3,45.5365,9.143951,zone9,,,,,,0,,0.0,,,0
4,45.5365,9.144221,zone9,,,,,,0,,0.0,,,0


In [None]:
# Missing-value count per column in the raw zone 9 data
df_zone9_2014.isna().sum()

Latitude               0
Longitude              0
Zone                   0
NDVI               28160
landuse            46662
LST                62370
NDBI               25289
NDWI               28160
Roughness              0
SAVI               28160
Slope                  0
SMI                28160
solar_radiation    28160
Suitable_Areas         0
dtype: int64

There are missing values in NDVI,landuse,LST,NDBI,NDWI,SAVI,SMI,solar radiation columns

 NDVI, LST, NDBI, NDWI, SAVI, SMI, solar radiation - numerical columns (LST has no known values in zone 9, so it cannot be interpolated and is left as is)

 landuse - categorical column

 Kriging interpolation is used to fill null values in the numerical columns;
  the data is split into chunks before filling to manage memory usage effectively

 NearestNeighbors is used for filling null values in categorical columns

In [None]:
# Column dtypes and non-null counts, to see which columns need imputation
df_zone9_2014.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62370 entries, 0 to 62369
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Latitude         62370 non-null  float64
 1   Longitude        62370 non-null  float64
 2   Zone             62370 non-null  object 
 3   NDVI             34210 non-null  float64
 4   landuse          15708 non-null  object 
 5   LST              0 non-null      float64
 6   NDBI             37081 non-null  float64
 7   NDWI             34210 non-null  float64
 8   Roughness        62370 non-null  int64  
 9   SAVI             34210 non-null  float64
 10  Slope            62370 non-null  float64
 11  SMI              34210 non-null  float64
 12  solar_radiation  34210 non-null  float64
 13  Suitable_Areas   62370 non-null  int64  
dtypes: float64(10), int64(2), object(2)
memory usage: 6.7+ MB


In [None]:


# Function to perform Kriging interpolation
def kriging_interpolation(chunk, column):
    """Fill the NaN entries of ``chunk[column]`` by ordinary kriging.

    NOTE(review): this re-defines the function of the same name from the
    zone 4 section of this notebook; the two definitions should stay in sync.

    An ``OrdinaryKriging`` model with a linear variogram is fitted on the rows
    where ``column`` is known (``Longitude`` as x, ``Latitude`` as y) and then
    evaluated at the rows where ``column`` is NaN.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Frame holding ``Longitude``, ``Latitude`` and ``column``.
        It is modified in place.
    column : str
        Name of the numeric column to fill.

    Returns
    -------
    pandas.DataFrame
        The same ``chunk`` object. It is returned untouched when the column
        has no missing values (nothing to predict) or no known values
        (nothing to fit on).
    """
    # Preparing the data for Kriging
    coordinates = chunk[['Longitude', 'Latitude']].values
    values = chunk[column].values

    # Compute the NaN mask once; it selects both the known and the missing rows
    missing_indices = np.isnan(values)

    # Skip the kriging fit when there is nothing to predict, and bail out
    # when there are no known values to fit the model on.
    if not missing_indices.any() or missing_indices.all():
        return chunk

    known_coordinates = coordinates[~missing_indices]
    known_values = values[~missing_indices]

    # Initialize OrdinaryKriging on the known points only
    ok = OrdinaryKriging(
        known_coordinates[:, 0],
        known_coordinates[:, 1],
        known_values,
        variogram_model='linear',
        verbose=False,
    )

    # Interpolating at the coordinates of the missing rows
    missing_coordinates = coordinates[missing_indices]
    z, _ = ok.execute('points', missing_coordinates[:, 0], missing_coordinates[:, 1])

    # The boolean mask follows row order, so this write is positional and
    # does not depend on the chunk's index labels.
    chunk.loc[missing_indices, column] = z

    return chunk

# Columns with missing values.
# LST is left out on purpose: it has no known values in zone 9, so there is
# nothing to fit a kriging model on.
columns_with_missing_values = ['NDVI', 'NDBI', 'NDWI', 'SAVI', 'SMI', 'solar_radiation']

# Number of chunks
n_chunks = 10

# Dividing the DataFrame into chunks of consecutive rows.
# The split is done on row positions (same partition np.array_split would give
# on the frame itself) because np.array_split on a DataFrame goes through the
# deprecated DataFrame.swapaxes; each chunk is an explicit copy so the fill
# never writes into a slice of the source frame.
row_blocks = np.array_split(np.arange(len(df_zone9_2014)), n_chunks)
chunks = [df_zone9_2014.iloc[rows].copy() for rows in row_blocks]

# Filling every column of a chunk before moving on to the next chunk.
# The returned frame is used, so this does not rely on in-place mutation.
filled_chunks = []
for chunk in chunks:
    for column in columns_with_missing_values:
        chunk = kriging_interpolation(chunk, column)
    filled_chunks.append(chunk)

# Combining the filled chunks back into a single DataFrame (done once)
df_zone9_2014 = pd.concat(filled_chunks).reset_index(drop=True)

# Checking for any remaining missing values
print(df_zone9_2014.isnull().sum())


Latitude               0
Longitude              0
Zone                   0
NDVI                   0
landuse            46662
LST                62370
NDBI                   0
NDWI                   0
Roughness              0
SAVI                   0
Slope                  0
SMI                    0
solar_radiation        0
Suitable_Areas         0
dtype: int64


In [None]:
from sklearn.neighbors import NearestNeighbors
# Filling the missing landuse values using nearest neighbors
def fill_missing_landuse(df_zone9_2014, default_k=5):
    """Fill missing ``landuse`` labels with the mode of the nearest labelled points.

    NOTE(review): this re-defines the function of the same name from the
    zone 4 section of this notebook; only the parameter name differs.

    Neighbours are searched in (``Longitude``, ``Latitude``) space among the
    rows whose ``landuse`` is known. When several labels tie for the mode,
    the first one in sorted order is used (``Series.mode()[0]``).

    Parameters
    ----------
    df_zone9_2014 : pandas.DataFrame
        Frame with ``Longitude``, ``Latitude`` and ``landuse`` columns.
        Its ``landuse`` column is updated in place.
    default_k : int, default 5
        Number of neighbours to vote; capped at the number of labelled rows.

    Returns
    -------
    pandas.DataFrame
        The same frame object. It is returned untouched when no row has a
        label to borrow from, or when no label is missing.
    """
    # Preparing coordinates and mask for missing values
    coordinates = df_zone9_2014[['Longitude', 'Latitude']].to_numpy()
    landuse = df_zone9_2014['landuse'].to_numpy(dtype=object)
    missing_mask = pd.isnull(landuse)

    n_known_samples = int((~missing_mask).sum())

    # Nothing to learn from, or nothing to fill: skip building the model
    if n_known_samples == 0 or not missing_mask.any():
        return df_zone9_2014

    known_coordinates = coordinates[~missing_mask]
    known_landuse = landuse[~missing_mask]

    # Adjusting k if there are fewer known samples than the default k
    k = min(default_k, n_known_samples)

    nbrs = NearestNeighbors(n_neighbors=k, algorithm='ball_tree').fit(known_coordinates)

    # One batched query for all missing rows instead of one query per row
    _, indices = nbrs.kneighbors(coordinates[missing_mask])

    # Majority vote among each row's neighbours
    filled_landuse = landuse.copy()
    filled_landuse[missing_mask] = [
        pd.Series(known_landuse[neighbor_rows]).mode()[0] for neighbor_rows in indices
    ]

    # Write back by position: assigning the whole column avoids treating row
    # positions as index labels, so a non-default index is handled correctly.
    df_zone9_2014['landuse'] = filled_landuse

    return df_zone9_2014

# Impute landuse from the nearest labelled grid points
df_filled_zone9 = fill_missing_landuse(df_zone9_2014)

# Number of landuse gaps left after the fill
print(df_filled_zone9['landuse'].isna().sum())

# Show the filled DataFrame
print(df_filled_zone9)

# Per-column null counts, to confirm the other columns were not affected
print(df_filled_zone9.isna().sum())

0
        Latitude  Longitude   Zone      NDVI landuse  LST      NDBI      NDWI  \
0      45.536500   9.143143  zone9  0.199839  meadow  NaN  0.012082 -0.228503   
1      45.536500   9.143412  zone9  0.199015  meadow  NaN  0.012169 -0.227788   
2      45.536500   9.143682  zone9  0.198166  meadow  NaN  0.012258 -0.227050   
3      45.536500   9.143951  zone9  0.197290  meadow  NaN  0.012349 -0.226290   
4      45.536500   9.144221  zone9  0.196386  meadow  NaN  0.012442 -0.225505   
...          ...        ...    ...       ...     ...  ...       ...       ...   
62365  45.480445   9.219679  zone9  0.186945   grass  NaN -0.000255 -0.221953   
62366  45.480445   9.219949  zone9  0.186945   grass  NaN -0.000262 -0.221953   
62367  45.480445   9.220218  zone9  0.186945   grass  NaN -0.000270 -0.221953   
62368  45.480445   9.220488  zone9  0.186945   grass  NaN -0.000278 -0.221953   
62369  45.480445   9.220757  zone9  0.186945   grass  NaN -0.000286 -0.221953   

       Roughness      SAV

In [None]:
# Save the filled zone 9 frame as CSV (relative path, index not written).
# LST is still entirely NaN at this point.
df_filled_zone9.to_csv('filled_data_zone9_2014.csv', index=False)

In [None]:
# Reload the filled zone 9 data from the same relative path it was saved to,
# so the round trip does not depend on the working directory being /content.
df_zone9 = pd.read_csv('filled_data_zone9_2014.csv')

In [None]:
# Preview the reloaded, filled zone 9 data
df_zone9.head()

Unnamed: 0,Latitude,Longitude,Zone,NDVI,landuse,LST,NDBI,NDWI,Roughness,SAVI,Slope,SMI,solar_radiation,Suitable_Areas
0,45.5365,9.143143,zone9,0.199839,meadow,,0.012082,-0.228503,0,0.299694,0.0,0.324271,471.151147,0
1,45.5365,9.143412,zone9,0.199015,meadow,,0.012169,-0.227788,0,0.298458,0.0,0.324271,471.140292,0
2,45.5365,9.143682,zone9,0.198166,meadow,,0.012258,-0.22705,0,0.297184,0.0,0.324271,471.129071,0
3,45.5365,9.143951,zone9,0.19729,meadow,,0.012349,-0.22629,0,0.295871,0.0,0.324271,471.117467,0
4,45.5365,9.144221,zone9,0.196386,meadow,,0.012442,-0.225505,0,0.294517,0.0,0.324271,471.105458,0


In [None]:
# Dtypes and non-null counts after filling; only LST should still be empty
df_zone9.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62370 entries, 0 to 62369
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Latitude         62370 non-null  float64
 1   Longitude        62370 non-null  float64
 2   Zone             62370 non-null  object 
 3   NDVI             62370 non-null  float64
 4   landuse          62370 non-null  object 
 5   LST              0 non-null      float64
 6   NDBI             62370 non-null  float64
 7   NDWI             62370 non-null  float64
 8   Roughness        62370 non-null  int64  
 9   SAVI             62370 non-null  float64
 10  Slope            62370 non-null  float64
 11  SMI              62370 non-null  float64
 12  solar_radiation  62370 non-null  float64
 13  Suitable_Areas   62370 non-null  int64  
dtypes: float64(10), int64(2), object(2)
memory usage: 6.7+ MB


In [None]:
# Concatenating zone4 and zone9 filled data.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels of both zones overlap and label-based lookups become ambiguous.
df_2014 = pd.concat([df_zone4, df_zone9], ignore_index=True)

In [None]:
# Preview the combined zone 4 + zone 9 frame
df_2014.head()

Unnamed: 0,Latitude,Longitude,Zone,NDVI,landuse,LST,NDBI,NDWI,Roughness,SAVI,Slope,SMI,solar_radiation,Suitable_Areas
0,45.47236,9.202701,zone4,0.360761,grass,121.681648,-0.130789,-0.381272,0,0.541073,0.0,0.214375,467.293863,0
1,45.47236,9.202971,zone4,0.361043,grass,121.681648,-0.130598,-0.381346,0,0.541497,0.0,0.215051,467.363286,0
2,45.47236,9.20324,zone4,0.361316,grass,121.681648,-0.130393,-0.381412,0,0.541907,0.0,0.215769,467.438801,0
3,45.47236,9.20351,zone4,0.361578,grass,121.681648,-0.13017,-0.381469,0,0.542301,0.0,0.216531,467.520987,0
4,45.47236,9.203779,zone4,0.361827,grass,121.681648,-0.12993,-0.381516,0,0.542676,0.0,0.217339,467.610465,0


In [None]:
# Dtypes and non-null counts of the combined frame
df_2014.info()

<class 'pandas.core.frame.DataFrame'>
Index: 114446 entries, 0 to 62369
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Latitude         114446 non-null  float64
 1   Longitude        114446 non-null  float64
 2   Zone             114446 non-null  object 
 3   NDVI             114446 non-null  float64
 4   landuse          114446 non-null  object 
 5   LST              52076 non-null   float64
 6   NDBI             114446 non-null  float64
 7   NDWI             114446 non-null  float64
 8   Roughness        114446 non-null  int64  
 9   SAVI             114446 non-null  float64
 10  Slope            114446 non-null  float64
 11  SMI              114446 non-null  float64
 12  solar_radiation  114446 non-null  float64
 13  Suitable_Areas   114446 non-null  int64  
dtypes: float64(10), int64(2), object(2)
memory usage: 13.1+ MB


In [None]:
# Per-column null counts of the combined frame
df_2014.isna().sum()

Latitude               0
Longitude              0
Zone                   0
NDVI                   0
landuse                0
LST                62370
NDBI                   0
NDWI                   0
Roughness              0
SAVI                   0
Slope                  0
SMI                    0
solar_radiation        0
Suitable_Areas         0
dtype: int64

In [None]:
# LST has no values at all for zone 9, so the column is dropped from the
# combined frame. Reassigning (instead of inplace=True) with errors='ignore'
# keeps this cell safe to re-run after LST is already gone.
df_2014 = df_2014.drop(columns='LST', errors='ignore')

In [None]:
# Final check: no column of the combined frame should contain nulls
df_2014.isna().sum()

Latitude           0
Longitude          0
Zone               0
NDVI               0
landuse            0
NDBI               0
NDWI               0
Roughness          0
SAVI               0
Slope              0
SMI                0
solar_radiation    0
Suitable_Areas     0
dtype: int64