In [10]:
 import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the cleansed data
data = pd.read_csv('data/cleaned_data.csv')

# Splitting the data into features and labels
X = data.drop('Decision', axis=1)
y = data['Decision']

# Identifying categorical columns for one-hot encoding
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

# Splitting the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing: One-hot encode the categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

# Fitting the preprocessor and transforming the training and test feature sets
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Initializing and training the Logistic Regression model
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_processed, y_train)

# Predicting and evaluating the model
y_pred_logreg = logreg.predict(X_test_processed)
logreg_accuracy = accuracy_score(y_test, y_pred_logreg)
logreg_report = classification_report(y_test, y_pred_logreg)

print(f"Logistic Regression Accuracy: {logreg_accuracy}")
print(f"Logistic Regression Classification Report:\n{logreg_report}")

Logistic Regression Accuracy: 0.6464439900022723
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.85      0.73      2372
           1       0.67      0.56      0.61      1390
           2       0.00      0.00      0.00        34
           7       0.46      0.09      0.15       561
           9       0.00      0.00      0.00         5
          12       0.00      0.00      0.00        39

    accuracy                           0.65      4401
   macro avg       0.30      0.25      0.25      4401
weighted avg       0.62      0.65      0.61      4401



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the cleansed data
data = pd.read_csv('data/cleaned_data.csv')

# Splitting the data into features and labels
X = data.drop('Decision', axis=1)
y = data['Decision']

# Identifying categorical columns for one-hot encoding
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

# Splitting the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing: One-hot encode the categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

# Fitting the preprocessor and transforming the training and test feature sets
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Initializing and training the Decision Tree model
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train_processed, y_train)

# Predicting and evaluating the model
y_pred_dtree = dtree.predict(X_test_processed)
dtree_accuracy = accuracy_score(y_test, y_pred_dtree)
dtree_report = classification_report(y_test, y_pred_dtree)

print(f"Decision Tree Accuracy: {dtree_accuracy}")
print(f"Decision Tree Classification Report:\n{dtree_report}")

Decision Tree Accuracy: 0.9972733469665985
Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2372
           1       1.00      1.00      1.00      1390
           2       0.94      0.97      0.96        34
           7       1.00      1.00      1.00       561
           9       0.83      1.00      0.91         5
          12       1.00      0.95      0.97        39

    accuracy                           1.00      4401
   macro avg       0.96      0.99      0.97      4401
weighted avg       1.00      1.00      1.00      4401



In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# Path to the cleansed data
cleansed_data_path = 'data/cleaned_data.csv'

# Load the cleansed data
data = pd.read_csv(cleansed_data_path)

# Encoding the 'Decision' column to ensure it has continuous classes
label_encoder = LabelEncoder()
data['Decision'] = label_encoder.fit_transform(data['Decision'])

# Splitting the data into features and labels
X = data.drop('Decision', axis=1)
y = data['Decision']

# Identifying categorical columns for one-hot encoding
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

# Splitting the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing: One-hot encode the categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

# Fitting the preprocessor and transforming the training and test feature sets
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Initializing and training the XGBoost model
xgboost_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgboost_model.fit(X_train_processed, y_train)

# Predicting and evaluating the model
y_pred_xgboost = xgboost_model.predict(X_test_processed)
xgboost_accuracy = accuracy_score(y_test, y_pred_xgboost)
xgboost_report = classification_report(y_test, y_pred_xgboost)

print(f"XGBoost Accuracy: {xgboost_accuracy}")
print(f"XGBoost Classification Report:\n{xgboost_report}")

XGBoost Accuracy: 0.9947739150193138
XGBoost Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2372
           1       0.99      0.99      0.99      1390
           2       0.89      0.91      0.90        34
           3       1.00      1.00      1.00       561
           4       0.75      0.60      0.67         5
           5       0.95      0.95      0.95        39

    accuracy                           0.99      4401
   macro avg       0.93      0.91      0.92      4401
weighted avg       0.99      0.99      0.99      4401



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
import numpy as np

# Load the cleansed data
data_path = 'data/cleaned_data.csv'
data = pd.read_csv(data_path)

# Encoding the 'Decision' column to ensure it has continuous classes
label_encoder = LabelEncoder()
data['Decision'] = label_encoder.fit_transform(data['Decision'])

# Splitting the data into features and labels
X = data.drop('Decision', axis=1)
y = data['Decision']

# Identifying categorical columns for one-hot encoding
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

# Splitting the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing: One-hot encode the categorical features and scale the numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', MinMaxScaler(), X.select_dtypes(include=['float64', 'int64']).columns.tolist())
    ],
    remainder='passthrough'
)

# Fitting the preprocessor and transforming the training and test feature sets
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Reshaping input data for LSTM [samples, time steps, features]
X_train_processed = X_train_processed.reshape((X_train_processed.shape[0], 1, X_train_processed.shape[1]))
X_test_processed = X_test_processed.reshape((X_test_processed.shape[0], 1, X_test_processed.shape[1]))

# Converting the labels to categorical for use with softmax activation
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)

# Define the LSTM model
model = Sequential()
model.add(LSTM(50, input_shape=(X_train_processed.shape[1], X_train_processed.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(y_train_categorical.shape[1], activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
history = model.fit(X_train_processed, y_train_categorical, epochs=100, batch_size=32, validation_data=(X_test_processed, y_test_categorical), verbose=2)

# Evaluate the model
scores = model.evaluate(X_test_processed, y_test_categorical, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))