In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import numpy as np
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import h5py
from tqdm import tqdm
warnings.filterwarnings('ignore')

In [3]:
train_path = "/content/drive/MyDrive/human_settlement_mapping/train_data.h5"
test_path = "/content/drive/MyDrive/human_settlement_mapping/test_data.h5"

In [None]:

class Filtering(BaseEstimator, TransformerMixin):
    def __init__(self, features=None):
        if features is None:
            self.features = {
                'mean': np.mean,
                'median': np.median,
                'std': np.std,
                'min': np.min,
                'max': np.max,
            }
        else:
            self.features = features

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        num_samples, height, width, num_channels = X.shape
        output = np.zeros((num_samples, num_channels * len(self.features)))

        for i, sample in tqdm(enumerate(X), total=num_samples, desc="Processing samples"):
            feature_vector = []
            for channel in range(num_channels):
                channel_data = sample[:, :, channel]
                for feature_name, feature_func in self.features.items():
                    feature_value = feature_func(channel_data)
                    feature_vector.append(feature_value)
            output[i] = feature_vector

        return output

##LOAD DATA

In [None]:
with h5py.File(train_path, 'r') as hdf:
    X = np.array(hdf['images'])
    y = np.array(hdf['labels'])

num_ones = np.sum(y == 1)
ones_indices = np.where(y == 1)[0]
zeros_indices = np.where(y == 0)[0]

balanced_zero_indices = np.random.choice(zeros_indices, num_ones, replace=False)
balanced_indices = np.concatenate([ones_indices, balanced_zero_indices])

X_balanced = X[balanced_indices]
y_balanced = y[balanced_indices]


##Apply Filtering

In [None]:
scf = Filtering()
arr = scf.transform(X_balanced)

col = [f"{feature}_{i}" for i in range(X.shape[3]) for feature in scf.features.keys()]
data = pd.DataFrame(arr, columns=col)
data["Target"] = y_balanced


##LOAD TEST DATA

In [7]:
with h5py.File(test_path, 'r') as hdf:
    test_x = np.array(hdf['images'])

arr_test = scf.transform(test_x)
test_data = pd.DataFrame(arr_test, columns=col)
test_data.head()

##SPLIT DATA FOR TRAINING

In [13]:
data = data.sample(frac=1).reset_index(drop=True) #shuffle data

X_train = data.drop("Target", axis=1)
y_train = data["Target"]

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

##TRAIN MODEL

In [22]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

##TEST MODEL

In [25]:
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

train_accuracy_rf = accuracy_score(y_train, y_pred_train)
print(f'Baseline RF model training accuracy: {train_accuracy_rf:.4f}')


Baseline RF model training accuracy: 1.0000


##PREDICT ON TEST DATA

In [26]:
prediction = pd.DataFrame(clf.predict(test_data))
prediction.head()

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0


##SUBMISSION

In [27]:
sub = pd.read_csv("/content/drive/MyDrive/human_settlement_mapping/SampleSubmission.csv")
sub["class"] = prediction
sub.to_csv("first_submission_4.csv", index=False)
sub.head()

Unnamed: 0,id,class
0,id_f5l6neabz0,0
1,id_7u688i5j5v,0
2,id_wm2bkde80r,0
3,id_0e6spuo284,0
4,id_dm71xazj94,0
