In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from sklearn.preprocessing import StandardScaler


In [2]:
def format_data_header(data : pd.DataFrame, feature_name: str):
    columns_name = [f'{feature_name}_{i}' for i in range(1, len(data.columns))]
    data.columns = columns_name + ['label']
    
    return data


# Load LBP feature dataset
lbp_data = pd.read_csv(f'./datasets/Fitur_LBPuniform_Cassava Leaf.csv')
lbp_data = format_data_header(lbp_data, 'lbp')

# Load wavelet feature dataset
wavelet_data = pd.read_csv(f'./datasets/Fitur_Wavelet_1234_Cassava Leaf.csv')
wavelet_data = format_data_header(wavelet_data, 'wavelet')

# Round label value
lbp_data['label'] = lbp_data['label'].apply(lambda x: round(x, 0))
wavelet_data['label'] = lbp_data['label'].apply(lambda x: round(x, 0))

In [3]:
# Fusion the feature
merged_data = pd.concat([lbp_data.drop('label', axis=1), wavelet_data], axis=1)
merged_data.columns

# Check data imbalance
# Count the number of samples in each class
counts = merged_data['label'].value_counts()

# Split the dataset into training and testing sets
X = merged_data.drop('label', axis=1)
y = merged_data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# Scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [5]:
# Define the number of principal components to keep
n_components = 20

# Fit the PCA model to the training data
pca = PCA(n_components=n_components)
X_train = pca.fit_transform(X_train)

# Transform the testing data using the trained PCA model
X_test = pca.transform(X_test)

In [29]:
# Using RandomOverSampler for handling imbalance

# Print the class distribution
print('Class distribution:')
for label, count in counts.items():
    print(f'{label}: {count}')

# Create a SMOTE oversampler
over = RandomOverSampler(random_state=42)

# Fit the oversampler on the training data and oversample the minority classes
X_train_resampled, y_train_resampled = over.fit_resample(X_train, y_train)

# Train an SVM classifier on the fused feature vector
clf = SVC()
clf.fit(X_train_resampled, y_train_resampled)

# Evaluate the classifier on the test set
accuracy = clf.score(X_test, y_test)
print('Accuracy:', accuracy)

Class distribution:
3: 13158
4: 2577
2: 2386
1: 2189
0: 1086
Accuracy: 0.46869158878504674


In [13]:
# Using ADASYN for handling imbalance

# Print the class distribution
print('Class distribution:')
for label, count in counts.items():
    print(f'{label}: {count}')

# Create a SMOTE oversampler
ada = ADASYN(random_state=42)

# Fit the oversampler on the training data and oversample the minority classes
X_train_resampled, y_train_resampled = ada.fit_resample(X_train, y_train)

# Train an SVM classifier on the fused feature vector
clf = SVC()
clf.fit(X_train_resampled, y_train_resampled)

# Evaluate the classifier on the test set
accuracy = clf.score(X_test, y_test)
print('Accuracy:', accuracy)

Class distribution:
3: 13158
4: 2577
2: 2386
1: 2189
0: 1086
Accuracy: 0.49929906542056074


In [10]:
# Using SMOTE for handling imbalance

# Print the class distribution
print('Class distribution:')
for label, count in counts.items():
    print(f'{label}: {count}')

# Create a SMOTE oversampler
smote = SMOTE(random_state=42)

# Fit the oversampler on the training data and oversample the minority classes
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Train an SVM classifier on the fused feature vector
clf = SVC()
clf.fit(X_train_resampled, y_train_resampled)

# Evaluate the classifier on the test set
accuracy = clf.score(X_test, y_test)
print('Accuracy:', accuracy)

Class distribution:
3: 13158
4: 2577
2: 2386
1: 2189
0: 1086
Accuracy: 0.4133177570093458


In [30]:
# Define the XGBoost model with desired hyperparameters
model = xgb.XGBClassifier(learning_rate=0.1, max_depth=3, n_estimators=100)

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = model.predict(X_test)

# Evaluate the accuracy of the predictions
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.6439252336448598


In [6]:
# Define the input shape of the model
input_shape = (X_train.shape[1],)

# Define the DNN model with desired architecture
model = keras.Sequential([
    keras.layers.Dense(64, activation='relu', input_shape=input_shape),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(10, activation='softmax')
])

# Compile the model with the desired loss function, optimizer, and evaluation metrics
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Convert the labels to one-hot encoded format
y_train_encoded = keras.utils.to_categorical(y_train, num_classes=10)
y_test_encoded = keras.utils.to_categorical(y_test, num_classes=10)

# Train the model on the training data
model.fit(X_train, y_train_encoded, epochs=10, batch_size=32)

# Make predictions on the testing data
y_pred_encoded = model.predict(X_test)

# Convert the predicted probabilities back to class labels
y_pred = np.argmax(y_pred_encoded, axis=1)

# Evaluate the accuracy of the predictions
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Metal device set to: Apple M1 Pro

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB

Epoch 1/10


2023-05-08 20:50:25.760424: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.6474299065420561


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Define the base classifiers to be used in the ensemble
classifiers = [
    # ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('gb', GradientBoostingClassifier(random_state=42)),
    # ('et', ExtraTreesClassifier(n_estimators=100, random_state=42)),
    # ('knn', KNeighborsClassifier(n_neighbors=5)),
    # ('mlp', MLPClassifier(hidden_layer_sizes=(64,), activation='relu', solver='adam', random_state=42)),
    # ('svm', make_pipeline(StandardScaler(), SVC(kernel='rbf', C=10, gamma=0.1)))
]

# Create the ensemble model using a VotingClassifier with 'hard' voting
model = VotingClassifier(classifiers, voting='hard')

# Train the ensemble model on the training data using cross-validation
scores = cross_val_score(model, X_train, y_train, cv=5)
print('Cross-validation scores:', scores)
print('Mean cross-validation score:', np.mean(scores))

# Fit the ensemble model to the full training data
model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = model.predict(X_test)

# Evaluate the accuracy of the predictions
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)