In [1]:
# Import necessary libraries
import gdown
from google.colab import drive
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Mount Google Drive
drive.mount('/content/drive')

# Download the file from Google Drive
data = "/content/drive/MyDrive/SICSS colab/Copy of CES_With_Rankings.csv"

# Load the datasets
data = pd.read_csv(data)
print(len(data))

Mounted at /content/drive
6893


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer

from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, classification_report, hamming_loss, precision_score, recall_score, f1_score

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,caseid,lookupzip,inputstate,birthyr,gender4,educ,race,hispanic,industry,...,CC22_300d_5,CC22_300d_6,CC22_300b_1,CC22_300b_2,CC22_300b_3,CC22_300b_4,CC22_300b_5,CC22_300b_6,CC22_300b_7,CC22_300b_8
0,0,1983126005,48152,26,1992,1,6,1,2.0,17.0,...,1.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0
1,14,1983127719,4530,23,1974,1,6,1,2.0,13.0,...,2.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0
2,20,1983126033,91801,6,1985,1,6,1,1.0,13.0,...,2.0,2.0,1.0,1.0,2.0,1.0,1.0,2.0,2.0,2.0
3,34,1982290075,90272,6,1956,1,1,1,2.0,10.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0
4,35,1983202213,99336,53,1989,2,2,1,2.0,10.0,...,2.0,2.0,1.0,1.0,2.0,1.0,2.0,1.0,2.0,2.0


In [4]:
# Selecting the features and target variables
target_columns = ['CC22_300b_1', 'CC22_300b_2', 'CC22_300b_3', 'CC22_300b_4', 'CC22_300b_5', 'CC22_300b_6', 'CC22_300b_7', 'CC22_300b_8']
data[target_columns] = data[target_columns].replace({2: 0, 1: 1})
features = data.drop(columns=target_columns + ['Unnamed: 0', 'caseid'])

# Impute missing values
imputer = SimpleImputer(strategy='most_frequent')
features_imputed = imputer.fit_transform(features)
targets = data[target_columns]


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_imputed, targets, test_size=0.2, random_state=42)

y_test = y_test.reset_index(drop=True)

###Random Forest Classifier (for each target, and overall)

In [5]:
# Initialize the RandomForestClassifier
model_random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model for each target variable
for target in target_columns:
    print(f'Training model for {target}')
    model_random_forest.fit(X_train, y_train[target])
    predictions = model_random_forest.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test[target], predictions)
    print(f'Accuracy for {target}: {accuracy:.2f}')
    print(classification_report(y_test[target], predictions))

# Making predictions on the test set for demonstration purposes
predictions = model_random_forest.predict(X_test)

print(predictions)

Training model for CC22_300b_1
Accuracy for CC22_300b_1: 0.63
              precision    recall  f1-score   support

         0.0       0.67      0.71      0.69       793
         1.0       0.57      0.52      0.54       586

    accuracy                           0.63      1379
   macro avg       0.62      0.61      0.61      1379
weighted avg       0.62      0.63      0.62      1379

Training model for CC22_300b_2
Accuracy for CC22_300b_2: 0.62
              precision    recall  f1-score   support

         0.0       0.65      0.75      0.69       791
         1.0       0.57      0.46      0.51       588

    accuracy                           0.62      1379
   macro avg       0.61      0.60      0.60      1379
weighted avg       0.62      0.62      0.61      1379

Training model for CC22_300b_3
Accuracy for CC22_300b_3: 0.61
              precision    recall  f1-score   support

         0.0       0.63      0.70      0.66       759
         1.0       0.57      0.49      0.53       6

In [6]:
##MultiOutput Classifier (using random forest model)
model_multioutputClassifier = MultiOutputClassifier(model_random_forest, n_jobs=-1)

# Train the model on the training data
model_multioutputClassifier.fit(X_train, y_train)

# Make predictions on the test data
predictions = model_multioutputClassifier.predict(X_test)

# Ensure predictions are in the correct shape
predictions = pd.DataFrame(predictions, columns=target_columns)

# Calculate accuracy for each target and overall
individual_accuracies = [accuracy_score(y_test.iloc[:, i], predictions.iloc[:, i]) for i in range(y_test.shape[1])]
print(individual_accuracies)

[0.6272661348803481, 0.6229151559100797, 0.6069615663524293, 0.6279912980420594, 0.6620739666424945, 0.6359680928208847, 0.7802755620014503, 0.8825235678027556]


In [7]:
overall_accuracy = (predictions == y_test).all(axis=1).mean()
print(overall_accuracy)

hamming_loss_score = hamming_loss(y_test, predictions)
print(hamming_loss_score)

0.049311094996374184
0.31925308194343727


###XGBoost

In [8]:
import xgboost as xgb
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, hamming_loss, classification_report

# Initialize the XGBoost classifier and wrap it in a MultiOutputClassifier
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42)
multi_xgb_model = MultiOutputClassifier(xgb_model, n_jobs=-1)

# Train the model on the training data
multi_xgb_model.fit(X_train, y_train)

# Make predictions on the test data
xgb_predictions = multi_xgb_model.predict(X_test)

# Ensure predictions are in the correct shape and reset indices
xgb_predictions = pd.DataFrame(xgb_predictions, columns=target_columns).reset_index(drop=True)

# Calculate overall accuracy (exact match ratio)
xgb_overall_accuracy = (xgb_predictions == y_test).all(axis=1).mean()

# Calculate Hamming loss
xgb_hamming_loss_score = hamming_loss(y_test, xgb_predictions)

# Calculate individual accuracies for each target variable
xgb_individual_accuracies = [accuracy_score(y_test.iloc[:, i], xgb_predictions.iloc[:, i]) for i in range(y_test.shape[1])]

# Generate classification reports for each target variable
xgb_classification_reports = [classification_report(y_test.iloc[:, i], xgb_predictions.iloc[:, i]) for i in range(y_test.shape[1])]

# Print results
xgb_results = {
    "Overall Accuracy": xgb_overall_accuracy,
    "Hamming Loss": xgb_hamming_loss_score,
    "Individual Accuracies": xgb_individual_accuracies,
    "Classification Reports": xgb_classification_reports
}

print(xgb_results)


{'Overall Accuracy': 0.04133430021754895, 'Hamming Loss': 0.3376540971718637, 'Individual Accuracies': [0.6200145032632343, 0.6156635242929659, 0.5866569978245105, 0.6178390137781, 0.6432197244379986, 0.5924583031182016, 0.7541696881798404, 0.8687454677302393], 'Classification Reports': ['              precision    recall  f1-score   support\n\n         0.0       0.66      0.69      0.68       793\n         1.0       0.56      0.53      0.54       586\n\n    accuracy                           0.62      1379\n   macro avg       0.61      0.61      0.61      1379\nweighted avg       0.62      0.62      0.62      1379\n', '              precision    recall  f1-score   support\n\n         0.0       0.65      0.71      0.68       791\n         1.0       0.56      0.49      0.52       588\n\n    accuracy                           0.62      1379\n   macro avg       0.60      0.60      0.60      1379\nweighted avg       0.61      0.62      0.61      1379\n', '              precision    recall 

###Neural Networks

In [9]:
# import tensorflow as tf
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout, BatchNormalization

# # Define the neural network model
# def create_model(input_dim, output_dim):
#     model = Sequential()
#     model.add(Dense(512, activation='relu', input_dim=input_dim))
#     model.add(BatchNormalization())
#     model.add(Dropout(0.5))
#     model.add(Dense(256, activation='relu'))
#     model.add(BatchNormalization())
#     model.add(Dropout(0.5))
#     model.add(Dense(128, activation='relu'))
#     model.add(BatchNormalization())
#     model.add(Dropout(0.5))
#     model.add(Dense(64, activation='relu'))
#     model.add(BatchNormalization())
#     model.add(Dropout(0.5))
#     model.add(Dense(output_dim, activation='sigmoid'))  # sigmoid for binary classification
#     return model

# input_dim = X_train.shape[1]
# output_dim = y_train.shape[1]
# model = create_model(input_dim, output_dim)

In [10]:
# # Compile the model
# model.compile(optimizer='adam',
#               loss='binary_crossentropy',  # binary crossentropy for binary multi-label classification
#               metrics=['accuracy'])

# # Summary of the model
# model.summary()

In [11]:
# # Train the model
# history = model.fit(X_train, y_train, epochs=2000, batch_size=64, validation_split=0.2, verbose=2)


In [12]:
# # Make predictions on the test data
# predictions = model.predict(X_test)
# predictions = (predictions > 0.5).astype(int)

# # Calculate overall accuracy (exact match ratio)
# overall_accuracy = (predictions == y_test.values).all(axis=1).mean()

# # Calculate Hamming loss
# from sklearn.metrics import hamming_loss
# hamming_loss_score = hamming_loss(y_test, predictions)

# # Calculate individual accuracies for each target variable
# from sklearn.metrics import accuracy_score, classification_report
# individual_accuracies = [accuracy_score(y_test.iloc[:, i], predictions[:, i]) for i in range(y_test.shape[1])]

# # Generate classification reports for each target variable
# classification_reports = [classification_report(y_test.iloc[:, i], predictions[:, i]) for i in range(y_test.shape[1])]

# # Print results
# results = {
#     "Overall Accuracy": overall_accuracy,
#     "Hamming Loss": hamming_loss_score,
#     "Individual Accuracies": individual_accuracies,
#     "Classification Reports": classification_reports
# }

# print(results)


###Transformer Model

In [13]:
# # Reshape data to fit the Transformer model
# X_train_reshaped = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
# X_test_reshaped = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))


In [14]:
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# from sklearn.impute import SimpleImputer
# import tensorflow as tf
# from tensorflow.keras.layers import Dense, LayerNormalization, Dropout, MultiHeadAttention, Input, GlobalAveragePooling1D
# from tensorflow.keras.models import Model
# from tensorflow.keras.optimizers import Adam
# from tensorflow.keras.callbacks import EarlyStopping
# from sklearn.metrics import accuracy_score, classification_report, hamming_loss


# # Define the Transformer Encoder block
# class TransformerBlock(tf.keras.layers.Layer):
#     def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
#         super(TransformerBlock, self).__init__()
#         self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
#         self.ffn = tf.keras.Sequential(
#             [Dense(ff_dim, activation="relu"), Dense(embed_dim),]
#         )
#         self.layernorm1 = LayerNormalization(epsilon=1e-6)
#         self.layernorm2 = LayerNormalization(epsilon=1e-6)
#         self.dropout1 = Dropout(rate)
#         self.dropout2 = Dropout(rate)

#     def call(self, inputs, training):
#         attn_output = self.att(inputs, inputs)
#         attn_output = self.dropout1(attn_output, training=training)
#         out1 = self.layernorm1(inputs + attn_output)
#         ffn_output = self.ffn(out1)
#         ffn_output = self.dropout2(ffn_output, training=training)
#         return self.layernorm2(out1 + ffn_output)

# # Define the Transformer model
# def create_transformer_model(input_shape, output_dim, embed_dim=64, num_heads=2, ff_dim=128, num_layers=2, rate=0.1):
#     inputs = Input(shape=input_shape)
#     x = Dense(embed_dim)(inputs)
#     for _ in range(num_layers):
#         x = TransformerBlock(embed_dim, num_heads, ff_dim, rate)(x)
#     x = GlobalAveragePooling1D()(x)
#     x = Dropout(rate)(x)
#     x = Dense(64, activation="relu")(x)
#     x = Dropout(rate)(x)
#     outputs = Dense(output_dim, activation="sigmoid")(x)
#     model = Model(inputs=inputs, outputs=outputs)
#     return model

# input_shape = (X_train.shape[1], 1)  # Assuming each feature as a "token" with a single value
# output_dim = y_train.shape[1]
# transformer_model = create_transformer_model(input_shape, output_dim)

# # Define a higher learning rate
# learning_rate = 0.01  # Increase this value as needed

# # Compile the model with the higher learning rate
# optimizer = Adam(learning_rate=learning_rate)
# transformer_model.compile(optimizer=optimizer,
#                           loss='binary_crossentropy',
#                           metrics=['accuracy'])

# # Summary of the model
# transformer_model.summary()

# # # Compile the model
# # transformer_model.compile(optimizer='adam',
# #                           loss='binary_crossentropy',
# #                           metrics=['accuracy'])

# # # Summary of the model
# # transformer_model.summary()

# # Define early stopping
# early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# # Train the model
# history = transformer_model.fit(X_train_reshaped, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=2, callbacks=[early_stopping])

# # Make predictions on the test data
# predictions = transformer_model.predict(X_test_reshaped)
# predictions = (predictions > 0.5).astype(int)

# # Calculate overall accuracy (exact match ratio)
# overall_accuracy = (predictions == y_test.values).all(axis=1).mean()

# # Calculate Hamming loss
# hamming_loss_score = hamming_loss(y_test, predictions)

# # Calculate individual accuracies for each target variable
# individual_accuracies = [accuracy_score(y_test.iloc[:, i], predictions[:, i]) for i in range(y_test.shape[1])]

# # Generate classification reports for each target variable
# classification_reports = [classification_report(y_test.iloc[:, i], predictions[:, i]) for i in range(y_test.shape[1])]

# # Print results
# results = {
#     "Overall Accuracy": overall_accuracy,
#     "Hamming Loss": hamming_loss_score,
#     "Individual Accuracies": individual_accuracies,
#     "Classification Reports": classification_reports
# }

# print(results)

##Trying with a diff dataset!

In [15]:
### SPLIT INDICATOR

# features = ["lookupzip", "inputstate", "birthyr", "gender4", "educ", "race", "hispanic", "industry", "employ", "houseincome"]
#features = ["inputstate", "birthyr", "gender4", "educ", "race", "hispanic", "industry", "employ", "houseincome", "numchildren"]

features=[]
# data = pd.read_csv(path+"CES_With_Rankings.csv")
data = pd.read_csv("/content/drive/MyDrive/SICSS colab/complete_complete.csv")

na_columns = [col for col in data.columns if 'NA' in col]
data = data[~data[na_columns].eq(1).any(axis=1)]

# Reset the index
data.reset_index(drop=True, inplace=True)


for item in data:
  if "CC22_300b" not in item and item != "Unnamed: 0":
    features.append(item)

print(features)

targets = []

for item in data:
  if "CC22_300b" in item:
    targets.append(item)

print(targets)
X = data[features].values
print(X)
y = data[targets]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


['data.birthyr', 'inputstate_1', 'inputstate_2', 'inputstate_4', 'inputstate_5', 'inputstate_6', 'inputstate_8', 'inputstate_9', 'inputstate_10', 'inputstate_11', 'inputstate_12', 'inputstate_13', 'inputstate_15', 'inputstate_16', 'inputstate_17', 'inputstate_18', 'inputstate_19', 'inputstate_20', 'inputstate_21', 'inputstate_22', 'inputstate_23', 'inputstate_24', 'inputstate_25', 'inputstate_26', 'inputstate_27', 'inputstate_28', 'inputstate_29', 'inputstate_30', 'inputstate_31', 'inputstate_32', 'inputstate_33', 'inputstate_34', 'inputstate_35', 'inputstate_36', 'inputstate_37', 'inputstate_38', 'inputstate_39', 'inputstate_40', 'inputstate_41', 'inputstate_42', 'inputstate_44', 'inputstate_45', 'inputstate_46', 'inputstate_47', 'inputstate_48', 'inputstate_49', 'inputstate_50', 'inputstate_51', 'inputstate_53', 'inputstate_54', 'inputstate_55', 'inputstate_56', 'inputstate_NA', 'gender4_1', 'gender4_2', 'gender4_3', 'gender4_4', 'gender4_NA', 'educ_1', 'educ_2', 'educ_3', 'educ_4', 

In [16]:
# Initialize the RandomForestClassifier
model_random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model for each target variable
for target in targets:

    print(f'Training model for {target}')
    model_random_forest.fit(X_train, y_train[target])
    predictions = model_random_forest.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test[target], predictions)
    print(f'Accuracy for {target}: {accuracy:.2f}')
    print(classification_report(y_test[target], predictions))

# Making predictions on the test set for demonstration purposes
predictions = model_random_forest.predict(X_test)

print(predictions)

Training model for CC22_300b_1_1
Accuracy for CC22_300b_1_1: 0.83
              precision    recall  f1-score   support

           0       0.84      0.98      0.91      1159
           1       0.29      0.05      0.08       219

    accuracy                           0.83      1378
   macro avg       0.57      0.51      0.49      1378
weighted avg       0.76      0.83      0.78      1378

Training model for CC22_300b_1_2
Accuracy for CC22_300b_1_2: 0.79
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      1042
           1       0.59      0.46      0.52       336

    accuracy                           0.79      1378
   macro avg       0.71      0.68      0.69      1378
weighted avg       0.78      0.79      0.78      1378

Training model for CC22_300b_1_NA
Accuracy for CC22_300b_1_NA: 1.00
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1378

    accuracy                        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy for CC22_300b_7_2: 0.92
              precision    recall  f1-score   support

           0       0.96      0.91      0.94       910
           1       0.84      0.93      0.89       468

    accuracy                           0.92      1378
   macro avg       0.90      0.92      0.91      1378
weighted avg       0.92      0.92      0.92      1378

Training model for CC22_300b_7_NA
Accuracy for CC22_300b_7_NA: 1.00
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1378

    accuracy                           1.00      1378
   macro avg       1.00      1.00      1.00      1378
weighted avg       1.00      1.00      1.00      1378

Training model for CC22_300b_8_1
Accuracy for CC22_300b_8_1: 0.95
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1308
           1       0.00      0.00      0.00        70

    accuracy                           0.95      1378
   macro avg   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy for CC22_300b_8_2: 0.93
              precision    recall  f1-score   support

           0       0.96      0.92      0.94       893
           1       0.87      0.93      0.90       485

    accuracy                           0.93      1378
   macro avg       0.92      0.93      0.92      1378
weighted avg       0.93      0.93      0.93      1378

Training model for CC22_300b_8_NA
Accuracy for CC22_300b_8_NA: 1.00
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1378

    accuracy                           1.00      1378
   macro avg       1.00      1.00      1.00      1378
weighted avg       1.00      1.00      1.00      1378

[0 0 0 ... 0 0 0]


In [17]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize the RandomForestClassifier
model_random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

# Assume targets is a list of target variable names
# Assume X_train, X_test, y_train, y_test are predefined DataFrames or arrays

for target in targets:
    print(f'Training model for {target}')
    model_random_forest.fit(X_train, y_train[target])
    predictions = model_random_forest.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test[target], predictions)
    print(f'Accuracy for {target}: {accuracy:.2f}')
    #print(classification_report(y_test[target], predictions))

    # Plot feature importance
    feature_importances = model_random_forest.feature_importances_
    sorted_indices = feature_importances.argsort()[::-1]  # Sort in descending order
    top_n = 20
    top_indices = sorted_indices[:top_n]
    top_importances = feature_importances[top_indices]

    if isinstance(X_train, pd.DataFrame):
        top_features = X_train.columns[top_indices]
    else:  # numpy array
        top_features = np.array([f'Feature {i}' for i in range(X_train.shape[1])])[top_indices]

    # plt.figure(figsize=(10, 6))
    # plt.bar(range(len(top_importances)), top_importances, tick_label=top_features)
    # plt.xticks(rotation=90)
    # plt.title(f'Top {top_n} Feature Importances for {target}')
    # plt.xlabel('Features')
    # plt.ylabel('Importance')
    # plt.tight_layout()
    # plt.show()

# Making predictions on the test set for demonstration purposes
predictions = model_random_forest.predict(X_test)

#print(predictions)


Training model for CC22_300b_1_1
Accuracy for CC22_300b_1_1: 0.83
Training model for CC22_300b_1_2
Accuracy for CC22_300b_1_2: 0.79
Training model for CC22_300b_1_NA
Accuracy for CC22_300b_1_NA: 1.00
Training model for CC22_300b_2_1
Accuracy for CC22_300b_2_1: 0.85
Training model for CC22_300b_2_2
Accuracy for CC22_300b_2_2: 0.82
Training model for CC22_300b_2_NA
Accuracy for CC22_300b_2_NA: 1.00
Training model for CC22_300b_3_1
Accuracy for CC22_300b_3_1: 0.84
Training model for CC22_300b_3_2
Accuracy for CC22_300b_3_2: 0.80
Training model for CC22_300b_3_NA
Accuracy for CC22_300b_3_NA: 1.00
Training model for CC22_300b_4_1
Accuracy for CC22_300b_4_1: 0.86
Training model for CC22_300b_4_2
Accuracy for CC22_300b_4_2: 0.82
Training model for CC22_300b_4_NA
Accuracy for CC22_300b_4_NA: 1.00
Training model for CC22_300b_5_1
Accuracy for CC22_300b_5_1: 0.84
Training model for CC22_300b_5_2
Accuracy for CC22_300b_5_2: 0.80
Training model for CC22_300b_5_NA
Accuracy for CC22_300b_5_NA: 1.00


In [19]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# Initialize the XGBoost Classifier
model_xgboost = xgb.XGBClassifier(n_estimators=100, random_state=42, use_label_encoder=False, eval_metric='logloss')

# Assume targets is a list of target variable names
# Assume X_train, X_test, y_train, y_test are predefined DataFrames or arrays

for target in targets:
    print(f'Training model for {target}')
    model_xgboost.fit(X_train, y_train[target])
    predictions = model_xgboost.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test[target], predictions)
    print(f'Accuracy for {target}: {accuracy:.2f}')
    # print(classification_report(y_test[target], predictions))

    # Plot feature importance
    feature_importances = model_xgboost.feature_importances_
    sorted_indices = feature_importances.argsort()[::-1]  # Sort in descending order
    top_n = 20
    top_indices = sorted_indices[:top_n]
    top_importances = feature_importances[top_indices]

    if isinstance(X_train, pd.DataFrame):
        top_features = X_train.columns[top_indices]
    else:  # numpy array
        top_features = np.array([f'Feature {i}' for i in range(X_train.shape[1])])[top_indices]

    # plt.figure(figsize=(10, 6))
    # plt.bar(range(len(top_importances)), top_importances, tick_label=top_features)
    # plt.xticks(rotation=90)
    # plt.title(f'Top {top_n} Feature Importances for {target}')
    # plt.xlabel('Features')
    # plt.ylabel('Importance')
    # plt.tight_layout()
    # plt.show()

# Making predictions on the test set for demonstration purposes
predictions = model_xgboost.predict(X_test)

# print(predictions)


Training model for CC22_300b_1_1
Accuracy for CC22_300b_1_1: 0.82
Training model for CC22_300b_1_2
Accuracy for CC22_300b_1_2: 0.82
Training model for CC22_300b_1_NA
Accuracy for CC22_300b_1_NA: 1.00
Training model for CC22_300b_2_1
Accuracy for CC22_300b_2_1: 0.83
Training model for CC22_300b_2_2
Accuracy for CC22_300b_2_2: 0.82
Training model for CC22_300b_2_NA
Accuracy for CC22_300b_2_NA: 1.00
Training model for CC22_300b_3_1
Accuracy for CC22_300b_3_1: 0.81
Training model for CC22_300b_3_2
Accuracy for CC22_300b_3_2: 0.81
Training model for CC22_300b_3_NA
Accuracy for CC22_300b_3_NA: 1.00
Training model for CC22_300b_4_1
Accuracy for CC22_300b_4_1: 0.82
Training model for CC22_300b_4_2
Accuracy for CC22_300b_4_2: 0.81
Training model for CC22_300b_4_NA
Accuracy for CC22_300b_4_NA: 1.00
Training model for CC22_300b_5_1
Accuracy for CC22_300b_5_1: 0.81
Training model for CC22_300b_5_2
Accuracy for CC22_300b_5_2: 0.81
Training model for CC22_300b_5_NA
Accuracy for CC22_300b_5_NA: 1.00
