In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# 3 Train Model 1: Stacking Method

In [2]:
# Load the dataset used by every cell below. The file name suggests it already
# carries the cluster assignments from earlier sections — TODO confirm.
df = pd.read_csv('Section1&2(Clusters).csv')
df.head()  # preview the first five rows (head() does not report the length)

In [3]:
# Column roles are positional: everything except the last three columns is a
# feature, the second-to-last column is the target, and the last column is the
# value the subset is selected on (presumably the cluster id — TODO confirm).
cluster_mask = df.iloc[:, -1] == 2

X = df.iloc[:, :-3]
y = df.iloc[:, -2]

# Restrict features and target to the rows whose last column equals 2.
X_2, y_2 = X[cluster_mask], y[cluster_mask]

In [4]:
# Hyperparameter search space for the logistic-regression base learner:
# regularization strength C crossed with the penalty type.
param_grid_lr = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
}

# 5-fold grid search on the selected subset, scored on accuracy.
# NOTE(review): the captured confusion matrices further down show only 3
# positive rows out of 1265 in this subset, so accuracy can look near-perfect
# for a model that never predicts the positive class — consider a
# recall/F1-style scorer.
lr_estimator = LogisticRegression(solver='liblinear', random_state=42)
grid_lr = GridSearchCV(
    estimator=lr_estimator,
    param_grid=param_grid_lr,
    scoring='accuracy',
    cv=5,
)
grid_lr.fit(X_2, y_2)

print("Best parameters for Logistic Regression:", grid_lr.best_params_)
print("Best cross-validated score:", grid_lr.best_score_)




Best parameters for Logistic Regression: {'C': 0.001, 'penalty': 'l1'}
Best cross-validated score: 0.9976284584980238


In [5]:
# Hyperparameter search space for the support-vector base learner.
param_grid_svm = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1, 'scale'],  # kernel coefficient
    'kernel': ['rbf', 'poly', 'sigmoid'],     # kernel type
}

# 5-fold grid search on the selected subset, scored on accuracy.
# SVC comes from sklearn.svm and must be imported with the notebook's other
# imports; otherwise this cell raises NameError on a fresh kernel.
svm_estimator = SVC(random_state=42)
grid_svm = GridSearchCV(svm_estimator, param_grid_svm, cv=5, scoring='accuracy')
grid_svm.fit(X_2, y_2)

print("Best parameters for SVM:", grid_svm.best_params_)
print("Best cross-validated score:", grid_svm.best_score_)




Best parameters for SVM: {'C': 0.1, 'gamma': 0.001, 'kernel': 'rbf'}
Best cross-validated score: 0.9976284584980238


In [6]:
# Base learners for the stack. The Logistic and SVM settings are hard-coded to
# the best parameters printed by the two grid searches; the random forest keeps
# its defaults apart from the seed.
models = {
    'Logistic': LogisticRegression(solver='liblinear', C=0.001, penalty='l1', random_state=42),
    'SVM': SVC(C=0.1, gamma=0.001, kernel='rbf', random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42),
}

# Train every base learner on the selected subset, in dictionary order.
for base_learner in models.values():
    base_learner.fit(X_2, y_2)

# fit() returns the estimator itself, so these names refer to the fitted
# objects stored in `models`.
model_2_1 = models['Logistic']
model_2_2 = models['SVM']
model_2_3 = models['RandomForest']

# NOTE(review): predictions are made on the same rows the models were fitted
# on, so the confusion matrices below describe training fit, not
# generalization.
LR_pred_2 = model_2_1.predict(X_2)
SVM_pred_2 = model_2_2.predict(X_2)
RF_pred_2 = model_2_3.predict(X_2)

# In-sample confusion matrix per base learner.
print("Logistic Regression confusion matrix:\n", confusion_matrix(y_2, LR_pred_2))
print("SVM confusion matrix:\n", confusion_matrix(y_2, SVM_pred_2))
print("RandomForest confusion matrix:\n", confusion_matrix(y_2, RF_pred_2))

Logistic Regression confusion matrix:
 [[1254    8]
 [   3    0]]
SVM confusion matrix:
 [[1262    0]
 [   3    0]]
RandomForest confusion matrix:
 [[1262    0]
 [   0    3]]


In [7]:
# Meta-features: one column per base learner holding its predicted labels.
# Built in a single constructor call; column order (RF, SVM, LR) is kept.
X_Stack_2 = pd.DataFrame({
    'RF': RF_pred_2,
    'SVM': SVM_pred_2,
    'LR': LR_pred_2,
})

# class_weight='balanced' reweights the classes inversely to their frequency.
meta_learner = LogisticRegression(class_weight='balanced', random_state=42)
meta_learner.fit(X_Stack_2, y_2)

# NOTE(review): the meta learner is scored on the rows it was trained on, and
# its inputs are the base learners' predictions on their own training rows, so
# the figures below are in-sample and likely optimistic.
meta_learner_pred = meta_learner.predict(X_Stack_2)

# classification_report lives in sklearn.metrics and must be imported with the
# notebook's other imports; otherwise this line raises NameError.
meta_learner_classification_report = classification_report(y_2, meta_learner_pred)
print("Meta learner Classification report:\n",meta_learner_classification_report)

meta_learner_cm = confusion_matrix(y_2, meta_learner_pred)
print("Meta learner Confusion Matrix:\n",meta_learner_cm)

accuracy_score_meta2 = accuracy_score(y_2, meta_learner_pred)
print("Meta learner Accuracy:\n", accuracy_score_meta2)

Meta learner Classification report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1262
           1       1.00      1.00      1.00         3

    accuracy                           1.00      1265
   macro avg       1.00      1.00      1.00      1265
weighted avg       1.00      1.00      1.00      1265

Meta learner Confusion Matrix:
 [[1262    0]
 [   0    3]]
Meta learner Accuracy:
 1.0


# 4 Train Model 2: k-fold Cross Validation

In [22]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Features and target for the whole dataset, selected by column position.
# NOTE(review): this rebinds the X / y names used by the stacking section.
X = df.iloc[:, 0:9]  # Features
y = df.iloc[:, 10]    # Target

# Fix the label set once so every fold's confusion matrix has the same shape
# and ordering, even if a fold were missing a class in both y_test and y_pred.
# Without this, summing the per-fold matrices could fail or misalign.
class_labels = np.unique(y)

# Random Forest classifier
clf_group = RandomForestClassifier(n_estimators=100, random_state=42)  # Default settings

# Stratified 10-fold split; scores and matrices are collected per fold.
cv = StratifiedKFold(n_splits=10)
scores = []
conf_matrix_list = []

# Perform cross-validation
for train_index, test_index in cv.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # The same estimator object is refit on each fold's training split, so
    # after the loop clf_group holds the fit from the final fold.
    clf_group.fit(X_train, y_train)

    # Predict on the held-out split
    y_pred = clf_group.predict(X_test)

    # Accuracy score for this fold
    score = accuracy_score(y_test, y_pred)
    scores.append(score)

    # Confusion matrix for this fold, indexed by the shared label set
    conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
    conf_matrix_list.append(conf_matrix)

# Output the results
print("Cross-validation scores:", scores)
print("Average cross-validation score: {:.2f}".format(np.mean(scores)))

# Element-wise sum of the per-fold matrices gives the overall out-of-fold
# confusion matrix.
combined_conf_matrix = np.sum(conf_matrix_list, axis=0)
print("Combined Confusion Matrix:\n", combined_conf_matrix)


Cross-validation scores: [0.9672977624784854, 0.9621342512908778, 0.9655765920826161, 0.9672977624784854, 0.9707401032702238, 0.963855421686747, 0.9672977624784854, 0.9637931034482758, 0.9724137931034482, 0.9724137931034482]
Average cross-validation score: 0.97
Combined Confusion Matrix:
 [[5576   33]
 [ 157   41]]
