In [6]:
import pandas as pd
import os

root_dir = r"Z:\Shared drives\TREEO BD Supply\satellite_verification\2025_notebooks_Satver\20250205_Koop_Bali\arcgis\obia_segment_shift\data_analysis.gdb"  # Change to your workspace

# do this outside arcgis pro - i dont know why arcgis resource have the error here
df_training_fix_labeled = pd.read_csv(os.path.join(os.path.dirname(root_dir), 'training_ml_lu.csv'))

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier  # Example: Random Forest
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV  # For hyperparameter tuning

# 1. Separate features (X) and labels (y)
X = df_training_fix_labeled.drop('code_lu', axis=1)  # Features (all columns except 'code_lu')
y = df_training_fix_labeled['code_lu']  # Labels ('code_lu' column)

# 2. Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # 80% train, 20% test

# 3. Scale the features (important for many models)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)  # Use the same scaler fitted on the training data

# 4. Choose a model (Random Forest is a good starting point)
model = RandomForestClassifier(random_state=42, class_weight='balanced')

# 5. Hyperparameter Tuning (GridSearchCV) - Optional but highly recommended
param_grid = {
    'n_estimators': [50, 100, 200],  # Test different numbers of trees
    'max_depth': [None, 10, 20],  # Test different tree depths
    'min_samples_split': [2, 5, 10],  # Test different minimum samples per split
    'min_samples_leaf': [1, 2, 4]  # Test different minimum samples per leaf
}

grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy', n_jobs=-1)  # cv=3 for 3-fold cross-validation, n_jobs=-1 to use all cores
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_  # Get the best model from grid search

# 6. Train the model (using the best hyperparameters from GridSearch)
best_model.fit(X_train, y_train)

# 7. Make predictions on the test set
y_pred = best_model.predict(X_test)

# 8. Evaluate the model
print(classification_report(y_test, y_pred))  # Print classification report (precision, recall, F1-score)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# 9. (Optional) Save the trained model
import joblib
model_filename = "trained_model.pkl"  # Or any name you prefer
joblib.dump(best_model, model_filename)
print(f"Model saved to: {model_filename}")



              precision    recall  f1-score   support

         1.0       0.80      0.80      0.80         5
         2.0       0.00      0.00      0.00         1
         4.0       0.00      0.00      0.00         1
         7.0       0.50      1.00      0.67         3
         9.0       0.80      0.67      0.73         6

    accuracy                           0.69        16
   macro avg       0.42      0.49      0.44        16
weighted avg       0.64      0.69      0.65        16

Accuracy: 0.6875
Model saved to: trained_model.pkl


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

y = np.array([1.0, 2.0, 3.0, 4.0, 7.0, 9.0])

# Step 1: Create a mapping from original labels to normalized labels
unique_classes = np.unique(y)
label_mapping = {original: normalized for normalized, original in enumerate(unique_classes)}
print("Label Mapping:", label_mapping)

# Step 2: Transform the target variable `y` using the mapping
y_normalized = np.array([label_mapping[val] for val in y])
print("Original y:", y)
print("Normalized y:", y_normalized)

# Step 3: Store the mapping for later use
# You can save this dictionary to a file or use it in your code
# Example: Save to a JSON file
import json
with open("label_mapping.json", "w") as f:
    json.dump(label_mapping, f)

# Step 4: Reverse the transformation (if needed)
# Load the mapping (if saved to a file)
with open("label_mapping.json", "r") as f:
    loaded_mapping = json.load(f)

# Reverse the normalization
y_original = np.array([list(loaded_mapping.keys())[list(loaded_mapping.values()).index(val)] for val in y_normalized])
print("Reversed y:", y_original)

Label Mapping: {np.float64(1.0): 0, np.float64(2.0): 1, np.float64(3.0): 2, np.float64(4.0): 3, np.float64(7.0): 4, np.float64(9.0): 5}
Original y: [1. 2. 3. 4. 7. 9.]
Normalized y: [0 1 2 3 4 5]
Reversed y: ['1.0' '2.0' '3.0' '4.0' '7.0' '9.0']


In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.class_weight import compute_sample_weight

# Example dataset
# Assuming df_training_fix_labeled is your DataFrame
X = df_training_fix_labeled.drop('code_lu', axis=1)  # Features
y = df_training_fix_labeled['code_lu']  # Original labels

# Step 1: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Encode y_train and y_test
encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)  # Fit on y_train and transform y_train
y_test_encoded = encoder.transform(y_test)  # Transform y_test using the same encoder
class_weights = compute_sample_weight(class_weight='balanced', y=y_train_encoded)

# Train the XGBoost model with class weights
model = XGBClassifier(random_state=42)
model.fit(X_train, y_train_encoded, sample_weight=class_weights)

# Step 4: Make predictions
y_pred_encoded = model.predict(X_test)

# Step 5: Reverse the encoding for evaluation (if needed)
y_pred_original = encoder.inverse_transform(y_pred_encoded)

# Step 6: Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred_original))  # Use original labels for evaluation
print(f"Accuracy: {accuracy_score(y_test_encoded, y_pred_encoded)}")  # Use encoded labels for accuracy

Classification Report:
              precision    recall  f1-score   support

         1.0       0.80      0.80      0.80         5
         2.0       0.00      0.00      0.00         1
         3.0       0.00      0.00      0.00         0
         4.0       0.00      0.00      0.00         1
         7.0       0.50      1.00      0.67         3
         9.0       0.75      0.50      0.60         6

    accuracy                           0.62        16
   macro avg       0.34      0.38      0.34        16
weighted avg       0.62      0.62      0.60        16

Accuracy: 0.625


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier  # Import DecisionTreeClassifier (CART)
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV

# Step 1: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Step 2: Encode y_train and y_test
encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)  # Fit on y_train and transform y_train
y_test_encoded = encoder.transform(y_test)  # Transform y_test using the same encoder
class_weights = compute_sample_weight(class_weight='balanced', y=y_train_encoded)

# 3. Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 4. Use Decision Tree Classifier (CART)
model = DecisionTreeClassifier(random_state=42)

# 5. Hyperparameter Tuning (GridSearchCV)
param_grid = {
    'criterion': ['gini', 'entropy'],  # Test different split criteria
    'max_depth': [None, 5, 10, 15],  # Test different tree depths
    'min_samples_split': [2, 5, 10],  # Test different minimum samples per split
    'min_samples_leaf': [1, 2, 4]  # Test different minimum samples per leaf
}

grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train_encoded)

best_model = grid_search.best_estimator_

# 6. Train the model
best_model.fit(X_train, y_train_encoded)

# 7. Make predictions
y_pred_encoded = best_model.predict(X_test)

# Step 5: Reverse the encoding for evaluation (if needed)
y_pred_original = encoder.inverse_transform(y_pred_encoded)


# Step 6: Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred_original))  # Use original labels for evaluation
print(f"Accuracy: {accuracy_score(y_test_encoded, y_pred_encoded)}")  # Use encoded labels for accuracy

#... (Optional: Save the model)



Classification Report:
              precision    recall  f1-score   support

         1.0       0.80      0.80      0.80         5
         2.0       0.00      0.00      0.00         1
         4.0       0.00      0.00      0.00         1
         7.0       0.50      1.00      0.67         3
         9.0       1.00      0.50      0.67         6

    accuracy                           0.62        16
   macro avg       0.46      0.46      0.43        16
weighted avg       0.72      0.62      0.62        16

Accuracy: 0.625


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE

# !pip install imbalanced-from collections import Counter
from collections import Counter

class_counts = Counter(y)
print(class_counts)

# 2. Encode the labels (y)
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

# 3. Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# 4. Identify minority classes in the training set
class_counts = Counter(y_train)  # Count occurrences in the training set
minority_classes = [key for key, value in class_counts.items() if value < 10]  # Define minority classes (adjust threshold as needed)

# # 5. Apply SMOTE to oversample minority classes in the training set
# smote = SMOTE(random_state=42, k_neighbors=1)  # Use k_neighbors=2 to avoid the previous error
# X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# 6. Scale the features (use the resampled training set)
scaler = StandardScaler()
# X_train_resampled = scaler.fit_transform(X_train_resampled)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 7. Use SVM Classifier (SVC)
model = SVC(random_state=42)

# 6. Hyperparameter Tuning (GridSearchCV)
param_grid = {
    'C': [0.1, 1, 10, 100],  # Expanded range for C
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],  # Added sigmoid kernel
    'gamma': ['scale', 'auto', 0.1, 1, 10]  # Expanded range for gamma
}

grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
# grid_search.fit(X_train_resampled, y_train_resampled)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

# 9. Train the model (using the best model from GridSearchCV and resampled data)
# best_model.fit(X_train_resampled, y_train_resampled)
best_model.fit(X_train, y_train)

# 10. Make predictions
y_pred_encoded = best_model.predict(X_test)

# 11. Evaluate the model
print(classification_report(y_test, y_pred_encoded))
accuracy = accuracy_score(y_test, y_pred_encoded)
print(f"Accuracy: {accuracy}")

# 12. (Optional) Decode predictions
y_pred_original = encoder.inverse_transform(y_pred_encoded)

Counter({1.0: 29, 9.0: 27, 7.0: 11, 2.0: 4, 3.0: 3, 4.0: 3})
              precision    recall  f1-score   support

           0       0.83      1.00      0.91         5
           1       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         1
           4       0.60      1.00      0.75         3
           5       0.80      0.67      0.73         6

    accuracy                           0.75        16
   macro avg       0.45      0.53      0.48        16
weighted avg       0.67      0.75      0.70        16

Accuracy: 0.75


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow import keras
from sklearn.metrics import classification_report, accuracy_score

# 2. Encode the labels (y)
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

# 3. Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# 4. Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 5. Define the deep learning model
model = keras.Sequential([
    keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),  # Correct input_shape
    keras.layers.Dense(64, activation='relu'),  # Hidden layer
    keras.layers.Dense(len(np.unique(y_encoded)), activation='softmax')  # Output layer
])

# 6. Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# 7. Train the model
model.fit(X_train, y_train, epochs=200, batch_size=32, validation_split=0.2)  # Add validation_split

# 8. Make predictions
y_pred_encoded = np.argmax(model.predict(X_test), axis=1)  # Use argmax for multi-class

# 9. Evaluate the model
print(classification_report(y_test, y_pred_encoded))
accuracy = accuracy_score(y_test, y_pred_encoded)
print(f"Accuracy: {accuracy}")

# 10. (Optional) Decode predictions
y_pred_original = encoder.inverse_transform(y_pred_encoded)

Epoch 1/200
Cause: Unable to locate the source code of <function Model.make_train_function.<locals>.train_function at 0x000001A26FC01DA0>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code
Cause: Unable to locate the source code of <function Model.make_train_function.<locals>.train_function at 0x000001A26FC01DA0>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code
Cause: Unable to locate the source code of <function

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


In [9]:
tf.test.is_built_with_cuda()

False

In [12]:
# !pip install tensorflow

In [7]:
# !pip install xgboost