Loading the Data

In [1]:
import pandas as pd

# Load the dataset
file_path = r"data/data file 2b/data_1_1_der.csv"
data = pd.read_csv(file_path)

# Display basic information about the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 940 entries, 0 to 939
Columns: 245 entries, thnoth_name to 8240
dtypes: float64(243), object(2)
memory usage: 1.8+ MB


In [2]:
# Display column names
column_names = data.columns.tolist()
print(column_names)

['thnoth_name', 'prov_char', '4680', '4676', '4672', '4668', '4664', '4660', '4656', '4652', '4648', '4644', '5508', '5504', '5500', '5496', '5492', '5488', '5484', '5480', '5476', '5472', '5468', '5464', '5460', '5456', '5452', '5448', '5444', '5440', '5436', '5432', '5428', '5424', '5420', '5416', '5412', '5408', '5404', '5400', '5396', '5392', '5388', '5384', '5380', '5376', '5372', '5368', '5364', '5360', '5356', '5352', '5348', '5344', '5340', '5336', '5332', '5328', '5324', '5320', '5316', '5312', '5308', '5304', '5300', '5296', '5292', '5288', '5284', '5280', '5276', '5272', '5268', '5264', '5260', '5256', '5252', '5248', '5244', '5240', '5236', '5232', '5228', '5224', '5220', '5216', '5212', '5208', '5204', '5200', '5196', '5192', '5188', '5184', '5180', '5176', '5172', '5168', '5164', '5160', '5156', '5152', '5148', '5144', '5140', '5136', '5132', '5128', '5124', '5120', '5116', '5112', '5108', '5104', '5100', '5096', '5092', '5088', '5084', '5080', '5076', '5072', '5068', '50

In [3]:
import pandas as pd
import numpy as np
from scipy.fftpack import fft

# Define regions of interest
regions_of_interest = [
    (4641, 4681), (4867, 5510), (5657, 5826), (7057, 7097),
    (7169, 7209), (8238, 8278)
]

# Function to extract features from regions of interest
def extract_features(data, regions):
    features = pd.DataFrame()
    for start, end in regions:
        # Identify columns that fall within the specified range
        region_columns = [col for col in data.columns if col.isdigit() and start <= int(col) <= end]
        if region_columns:
            region_data = data[region_columns]
            # Statistical features
            features[f'{start}_{end}_mean'] = region_data.mean(axis=1)
            features[f'{start}_{end}_std'] = region_data.std(axis=1)
            features[f'{start}_{end}_skew'] = region_data.skew(axis=1)
            features[f'{start}_{end}_kurtosis'] = region_data.kurt(axis=1)
            # Aggregated features
            features[f'{start}_{end}_sum'] = region_data.sum(axis=1)
            features[f'{start}_{end}_median'] = region_data.median(axis=1)
            # Derivative features
            features[f'{start}_{end}_first_derivative'] = region_data.diff(axis=1).mean(axis=1)
            features[f'{start}_{end}_second_derivative'] = region_data.diff(axis=1).diff(axis=1).mean(axis=1)
            # Fourier Transform features
            fft_features = fft(region_data, axis=1)
            features[f'{start}_{end}_fft_real'] = np.real(fft_features).mean(axis=1)
            features[f'{start}_{end}_fft_imag'] = np.imag(fft_features).mean(axis=1)
    return features

# Extract features
X = extract_features(data, regions_of_interest)

# Display the extracted features
X.head()


Unnamed: 0,4641_4681_mean,4641_4681_std,4641_4681_skew,4641_4681_kurtosis,4641_4681_sum,4641_4681_median,4641_4681_first_derivative,4641_4681_second_derivative,4641_4681_fft_real,4641_4681_fft_imag,...,8238_8278_mean,8238_8278_std,8238_8278_skew,8238_8278_kurtosis,8238_8278_sum,8238_8278_median,8238_8278_first_derivative,8238_8278_second_derivative,8238_8278_fft_real,8238_8278_fft_imag
0,-0.000664,0.00294,-3.158411,9.98208,-0.00664,0.000259,-0.001032,-0.001181,0.000259,-8.673617e-20,...,-0.000934,0.002773,-3.112939,9.770092,-0.009337,-0.000124,0.001005,-0.001038,-0.008792,1.0842019999999999e-20
1,-5.4e-05,0.000407,-1.036213,1.110674,-0.000539,-2.4e-05,-0.000135,-9.1e-05,0.00028,0.0,...,-0.000658,0.001937,-3.060245,9.526792,-0.006576,-0.000145,0.000714,-0.000706,-0.006123,0.0
2,0.000702,0.002824,3.102302,9.722214,0.007021,-9.9e-05,0.000952,0.001141,0.000128,1.734723e-19,...,-5.1e-05,0.000275,0.290498,-1.05502,-0.000511,-0.000138,6.4e-05,3.3e-05,-0.000196,0.0
3,0.000996,0.004076,3.128186,9.841952,0.009965,-0.000199,0.00139,0.001634,5.5e-05,0.0,...,0.000179,0.000739,2.351724,6.343585,0.001787,-4.1e-05,-0.000191,0.000327,0.002124,0.0
4,0.000911,0.003818,3.128708,9.844599,0.009114,-0.000187,0.001304,0.001531,5e-06,-1.734723e-19,...,1e-05,0.000341,0.09299,-1.463339,9.8e-05,-4.3e-05,-1.1e-05,0.000132,0.000506,-5.4210109999999996e-21


In [4]:
data

Unnamed: 0,thnoth_name,prov_char,4680,4676,4672,4668,4664,4660,4656,4652,...,8276,8272,8268,8264,8260,8256,8252,8248,8244,8240
0,Non-Thai,KAX,0.000259,0.000282,0.000293,0.000232,0.000153,0.000160,0.000259,0.000356,...,-0.008792,-0.000241,-0.000542,-0.000147,0.000269,0.000194,-0.000101,-0.000236,0.000005,0.000254
1,Non-Thai,KBX,0.000280,0.000331,0.000372,0.000278,0.000077,-0.000126,-0.000258,-0.000302,...,-0.006123,-0.000196,-0.000510,-0.000191,0.000250,0.000220,-0.000099,-0.000252,0.000024,0.000300
2,Non-Thai,BBX,0.000128,0.000159,0.000154,0.000012,-0.000210,-0.000407,-0.000519,-0.000536,...,-0.000196,-0.000143,-0.000464,-0.000232,0.000249,0.000254,-0.000132,-0.000293,0.000064,0.000383
3,Non-Thai,SUX,0.000055,0.000089,0.000081,-0.000076,-0.000323,-0.000546,-0.000668,-0.000664,...,0.002124,-0.000161,-0.000466,-0.000230,0.000262,0.000252,-0.000165,-0.000313,0.000078,0.000407
4,Non-Thai,SUX,0.000005,0.000047,0.000055,-0.000078,-0.000297,-0.000512,-0.000648,-0.000658,...,0.000506,-0.000228,-0.000492,-0.000171,0.000308,0.000224,-0.000214,-0.000324,0.000084,0.000405
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
935,Thai,KBI,0.000176,0.000185,0.000179,0.000140,0.000060,-0.000087,-0.000252,-0.000330,...,-0.001759,-0.000263,-0.000510,-0.000133,0.000296,0.000145,-0.000253,-0.000316,0.000053,0.000334
936,Thai,KBI,0.000041,-0.000006,-0.000068,-0.000056,0.000001,0.000031,0.000013,0.000001,...,-0.002138,-0.000398,-0.000639,-0.000079,0.000252,-0.000026,-0.000318,-0.000296,-0.000039,0.000152
937,Thai,SNI,-0.000012,-0.000018,-0.000057,-0.000106,-0.000114,-0.000095,-0.000073,-0.000039,...,-0.002284,-0.000174,-0.000267,0.000044,0.000294,0.000082,-0.000342,-0.000470,-0.000207,0.000016
938,Thai,TRG,0.000018,0.000039,0.000042,-0.000003,-0.000061,-0.000132,-0.000203,-0.000236,...,-0.001338,-0.000106,-0.000385,-0.000189,0.000302,0.000252,-0.000234,-0.000348,0.000131,0.000491


In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Load the dataset
file_path = r"data/data file 2b/data_1_1_der.csv"
data = pd.read_csv(file_path)

# Define the target variable and feature columns
target_v00 = 'thnoth_name'
features_v00 = data.columns.difference(['thnoth_name', 'prov_char'])
X = data[features_v00]
y = data[target_v00]
groups = data['prov_char']

# Encode the target variable
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Standardize the data (mean=0, variance=1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Initialize Logistic Regression model with a different solver
logreg_model = LogisticRegression(random_state=42, max_iter=10000, solver='liblinear')

# Initialize Leave-One-Group-Out Cross-Validation
logo = LeaveOneGroupOut()

# Initialize lists to store results
test_accuracies_logreg = []
y_true_all_logreg = []
y_pred_all_logreg = []

# Apply LOGO-CV
for train_index, test_index in logo.split(X_scaled, y_encoded, groups):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]
    
    # Fit the model
    logreg_model.fit(X_train, y_train)
    
    # Predict the test samples
    y_test_pred = logreg_model.predict(X_test)
    
    # Calculate and store test accuracies
    test_accuracies_logreg.append(accuracy_score(y_test, y_test_pred))
    
    # Store the prediction and actual value
    y_true_all_logreg.extend(y_test)
    y_pred_all_logreg.extend(y_test_pred)

# Flip the predictions (numerically encoded)
y_pred_lg_flipped = [1 - pred for pred in y_pred_all_logreg]

# Generate the classification report for the flipped predictions
report_logreg_flipped = classification_report(y_true_all_logreg, y_pred_lg_flipped, target_names=le.classes_, output_dict=True)
print("Leave-One-Group-Out CV Logistic Regression Model (Flipped) Classification Report")
print(classification_report(y_true_all_logreg, y_pred_lg_flipped, target_names=le.classes_))

# Generate and display the confusion matrix for the flipped predictions
cm_logreg_flipped = confusion_matrix(y_true_all_logreg, y_pred_lg_flipped)
df_cm_logreg_flipped = pd.DataFrame(cm_logreg_flipped, index=le.classes_, columns=le.classes_)
print("Confusion Matrix (Flipped):")
print(df_cm_logreg_flipped)

# Display the detailed results for the flipped predictions
for class_name, metrics in report_logreg_flipped.items():
    if isinstance(metrics, dict):
        print(f"Class: {class_name}")
        for metric_name, score in metrics.items():
            print(f"{metric_name}: {score}")
    else:
        print(f"{class_name}: {metrics}")
    print()


Leave-One-Group-Out CV Logistic Regression Model (Flipped) Classification Report
              precision    recall  f1-score   support

    Non-Thai       0.92      0.88      0.90       470
        Thai       0.89      0.93      0.91       470

    accuracy                           0.90       940
   macro avg       0.90      0.90      0.90       940
weighted avg       0.90      0.90      0.90       940

Confusion Matrix (Flipped):
          Non-Thai  Thai
Non-Thai       415    55
Thai            35   435
Class: Non-Thai
precision: 0.9222222222222223
recall: 0.8829787234042553
f1-score: 0.9021739130434783
support: 470.0

Class: Thai
precision: 0.8877551020408163
recall: 0.925531914893617
f1-score: 0.90625
support: 470.0

accuracy: 0.9042553191489362

Class: macro avg
precision: 0.9049886621315193
recall: 0.9042553191489362
f1-score: 0.9042119565217391
support: 940.0

Class: weighted avg
precision: 0.9049886621315193
recall: 0.9042553191489362
f1-score: 0.9042119565217391
support: 940.0

In [7]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Initialize the LDA Classifier
model_lda = LinearDiscriminantAnalysis()

# Initialize Leave-One-Group-Out Cross-Validation
logo = LeaveOneGroupOut()

# Initialize lists to store results
accuracies_lda = []
y_true_all_lda = []
y_pred_all_lda = []

# Apply LOGO-CV
for train_index, test_index in logo.split(X, y_encoded, groups):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]
    
    # Fit the model
    model_lda.fit(X_train, y_train)
    
    # Predict the test samples
    y_test_pred = model_lda.predict(X_test)
    
    # Calculate and store the accuracy
    accuracies_lda.append(accuracy_score(y_test, y_test_pred))
    
    # Store the prediction and actual values
    y_true_all_lda.extend(y_test)
    y_pred_all_lda.extend(y_test_pred)

# Reverse the predicted classes (numerically encoded)
y_pred_all_lda_reversed = [1 - y for y in y_pred_all_lda]

# Calculate the overall accuracy for the reversed predictions
overall_accuracy_lda_reversed = accuracy_score(y_true_all_lda, y_pred_all_lda_reversed)

print(f'Leave-One-Group-Out CV - Average Test Accuracy (Reversed Classes): {overall_accuracy_lda_reversed}')
print("Leave-One-Group-Out CV LDA Model Classification Report (Reversed Classes)")
report_lda_logo_reversed = classification_report(y_true_all_lda, y_pred_all_lda_reversed, target_names=le.classes_, output_dict=True)
print(classification_report(y_true_all_lda, y_pred_all_lda_reversed, target_names=le.classes_))

# Generate and display the confusion matrix for the reversed predictions
cm_logo_lda_reversed = confusion_matrix(y_true_all_lda, y_pred_all_lda_reversed)
df_cm_lda_reversed = pd.DataFrame(cm_logo_lda_reversed, index=le.classes_, columns=le.classes_)
print("Confusion Matrix (Reversed Classes):")
print(df_cm_lda_reversed)

# Display the detailed results for the reversed predictions
for class_name, metrics in report_lda_logo_reversed.items():
    if isinstance(metrics, dict):
        print(f"Class: {class_name}")
        for metric_name, score in metrics.items():
            print(f"{metric_name}: {score}")
    else:
        print(f"{class_name}: {metrics}")
    print()


Leave-One-Group-Out CV - Average Test Accuracy (Reversed Classes): 0.8776595744680851
Leave-One-Group-Out CV LDA Model Classification Report (Reversed Classes)
              precision    recall  f1-score   support

    Non-Thai       0.90      0.85      0.87       470
        Thai       0.86      0.90      0.88       470

    accuracy                           0.88       940
   macro avg       0.88      0.88      0.88       940
weighted avg       0.88      0.88      0.88       940

Confusion Matrix (Reversed Classes):
          Non-Thai  Thai
Non-Thai       400    70
Thai            45   425
Class: Non-Thai
precision: 0.898876404494382
recall: 0.851063829787234
f1-score: 0.8743169398907104
support: 470.0

Class: Thai
precision: 0.8585858585858586
recall: 0.9042553191489362
f1-score: 0.8808290155440415
support: 470.0

accuracy: 0.8776595744680851

Class: macro avg
precision: 0.8787311315401203
recall: 0.8776595744680851
f1-score: 0.877572977717376
support: 940.0

Class: weighted avg
pre

In [8]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Standardize the features
scaler = StandardScaler()
X_scaled_svm = scaler.fit_transform(X)

# Initialize the SVM classifier
svm_model = SVC(kernel='rbf', random_state=42)

# Initialize LOGO-CV
logo = LeaveOneGroupOut()

# Initialize lists to store results
y_true_svm_all = []
y_pred_svm_all = []

# Perform LOGO-CV
for train_index, test_index in logo.split(X_scaled_svm, y_encoded, groups):
    X_train, X_test = X_scaled_svm[train_index], X_scaled_svm[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]
    
    # Fit the model
    svm_model.fit(X_train, y_train)
    
    # Predict the test samples
    y_test_pred = svm_model.predict(X_test)
    
    # Store the prediction and actual value
    y_true_svm_all.extend(y_test)
    y_pred_svm_all.extend(y_test_pred)

# Calculate the overall accuracy
overall_accuracy_svm = accuracy_score(y_true_svm_all, y_pred_svm_all)

print(f'Leave-One-Group-Out CV - Overall Test Accuracy: {overall_accuracy_svm}')

# Generate the classification report for the overall test predictions
report_svm_logo = classification_report(y_true_svm_all, y_pred_svm_all, target_names=le.classes_, output_dict=True)
print("Leave-One-Group-Out CV SVM Model Classification Report")
print(classification_report(y_true_svm_all, y_pred_svm_all, target_names=le.classes_))

# Print the confusion matrix in text format
conf_matrix_svm = confusion_matrix(y_true_svm_all, y_pred_svm_all)
conf_matrix_df_svm = pd.DataFrame(conf_matrix_svm, index=le.classes_, columns=le.classes_)
print("Confusion Matrix:")
print(conf_matrix_df_svm)

# Print the detailed results
for class_name, metrics in report_svm_logo.items():
    if isinstance(metrics, dict):
        print(f"Class: {class_name}")
        for metric_name, score in metrics.items():
            print(f"{metric_name}: {score}")
    else:
        print(f"{class_name}: {metrics}")
    print()


Leave-One-Group-Out CV - Overall Test Accuracy: 0.8414893617021276
Leave-One-Group-Out CV SVM Model Classification Report
              precision    recall  f1-score   support

    Non-Thai       0.93      0.74      0.82       470
        Thai       0.78      0.94      0.86       470

    accuracy                           0.84       940
   macro avg       0.86      0.84      0.84       940
weighted avg       0.86      0.84      0.84       940

Confusion Matrix:
          Non-Thai  Thai
Non-Thai       347   123
Thai            26   444
Class: Non-Thai
precision: 0.9302949061662198
recall: 0.7382978723404255
f1-score: 0.8232502965599051
support: 470.0

Class: Thai
precision: 0.783068783068783
recall: 0.9446808510638298
f1-score: 0.8563162970106075
support: 470.0

accuracy: 0.8414893617021276

Class: macro avg
precision: 0.8566818446175014
recall: 0.8414893617021277
f1-score: 0.8397832967852563
support: 940.0

Class: weighted avg
precision: 0.8566818446175014
recall: 0.8414893617021276
f

In [9]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import RFE

# Create polynomial features
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

# Perform Recursive Feature Elimination
logreg = LogisticRegression(random_state=42, max_iter=10000, solver='liblinear')
selector = RFE(logreg, n_features_to_select=10, step=1)
selector = selector.fit(X_poly, y_encoded)

# Get selected features
X_selected = selector.transform(X_poly)

# Standardize the selected features
scaler = StandardScaler()
X_selected_scaled = scaler.fit_transform(X_selected)

# Apply LOGO-CV with the selected features
logo = LeaveOneGroupOut()

test_accuracies_logreg = []
y_true_all_logreg = []
y_pred_all_logreg = []

for train_index, test_index in logo.split(X_selected_scaled, y_encoded, groups):
    X_train, X_test = X_selected_scaled[train_index], X_selected_scaled[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]
    
    logreg.fit(X_train, y_train)
    y_test_pred = logreg.predict(X_test)
    
    test_accuracies_logreg.append(accuracy_score(y_test, y_test_pred))
    y_true_all_logreg.extend(y_test)
    y_pred_all_logreg.extend(y_test_pred)

y_pred_lg_flipped = [1 - pred for pred in y_pred_all_logreg]

report_logreg_flipped = classification_report(y_true_all_logreg, y_pred_lg_flipped, target_names=le.classes_, output_dict=True)
print("Leave-One-Group-Out CV Logistic Regression Model (Flipped) Classification Report")
print(classification_report(y_true_all_logreg, y_pred_lg_flipped, target_names=le.classes_))

cm_logreg_flipped = confusion_matrix(y_true_all_logreg, y_pred_lg_flipped)
df_cm_logreg_flipped = pd.DataFrame(cm_logreg_flipped, index=le.classes_, columns=le.classes_)
print("Confusion Matrix (Flipped):")
print(df_cm_logreg_flipped)


Leave-One-Group-Out CV Logistic Regression Model (Flipped) Classification Report
              precision    recall  f1-score   support

    Non-Thai       0.65      0.74      0.69       470
        Thai       0.70      0.60      0.65       470

    accuracy                           0.67       940
   macro avg       0.67      0.67      0.67       940
weighted avg       0.67      0.67      0.67       940

Confusion Matrix (Flipped):
          Non-Thai  Thai
Non-Thai       348   122
Thai           188   282


In [10]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga', 'newton-cg', 'lbfgs', 'sag']
}

# Initialize Grid Search
grid_search = GridSearchCV(LogisticRegression(random_state=42, max_iter=10000), param_grid, cv=5, scoring='accuracy')

# Fit Grid Search
grid_search.fit(X_selected_scaled, y_encoded)

# Best parameters
print("Best parameters found: ", grid_search.best_params_)

# Apply LOGO-CV with the best parameters
best_logreg = grid_search.best_estimator_

test_accuracies_logreg = []
y_true_all_logreg = []
y_pred_all_logreg = []

for train_index, test_index in logo.split(X_selected_scaled, y_encoded, groups):
    X_train, X_test = X_selected_scaled[train_index], X_selected_scaled[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]
    
    best_logreg.fit(X_train, y_train)
    y_test_pred = best_logreg.predict(X_test)
    
    test_accuracies_logreg.append(accuracy_score(y_test, y_test_pred))
    y_true_all_logreg.extend(y_test)
    y_pred_all_logreg.extend(y_test_pred)

y_pred_lg_flipped = [1 - pred for pred in y_pred_all_logreg]

report_logreg_flipped = classification_report(y_true_all_logreg, y_pred_lg_flipped, target_names=le.classes_, output_dict=True)
print("Leave-One-Group-Out CV Logistic Regression Model (Flipped) Classification Report")
print(classification_report(y_true_all_logreg, y_pred_lg_flipped, target_names=le.classes_))

cm_logreg_flipped = confusion_matrix(y_true_all_logreg, y_pred_lg_flipped)
df_cm_logreg_flipped = pd.DataFrame(cm_logreg_flipped, index=le.classes_, columns=le.classes_)
print("Confusion Matrix (Flipped):")
print(df_cm_logreg_flipped)


Best parameters found:  {'C': 0.1, 'solver': 'saga'}
Leave-One-Group-Out CV Logistic Regression Model (Flipped) Classification Report
              precision    recall  f1-score   support

    Non-Thai       0.71      0.78      0.75       470
        Thai       0.76      0.69      0.72       470

    accuracy                           0.73       940
   macro avg       0.74      0.73      0.73       940
weighted avg       0.74      0.73      0.73       940

Confusion Matrix (Flipped):
          Non-Thai  Thai
Non-Thai       367   103
Thai           148   322
