In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score, train_test_split

from pipelines.DataPipeline import DataPipeline
from transformers.Preprocessing import Preprocessing
from transformers.Rebalancer import Rebalancer
from utils.Common import Config


In [2]:
# Relative path of the raw CSV input.
RAW_DATA_PATH = "../data/raw/KSI.csv"

# Read the raw CSV into a DataFrame; later cells consume it as `df`.
df = pd.read_csv(filepath_or_buffer=RAW_DATA_PATH)

In [3]:
# Build the preprocessor from the raw frame plus the column groups and label
# name declared in Config, then take the frame it produces.
# Per the original note, this step fills missing values, adds new columns and
# extracts the useful columns (Preprocessing is defined outside this notebook).
pc = Preprocessing(
    df,
    Config.binary_columns,
    Config.cat_attribs,
    Config.num_attribs,
    Config.label,
)
new_df = pc.getFrame()

In [4]:
# Resample the preprocessed frame via Rebalancer, passing the label column name.
# Note: the result overwrites `new_df`, so re-running this cell on its own
# feeds the already-resampled frame back in.
# NOTE(review): this runs before the train/test split further down. If
# Rebalancer oversamples (duplicates or synthesises rows), copies of the same
# record can land on both sides of the split and inflate test scores --
# confirm, and consider resampling the training partition only.
rebalancer = Rebalancer()
new_df = rebalancer.process(new_df, Config.label)

In [5]:
# Separate the features from the label.
# Feature columns are the categorical, numeric and binary column lists from
# Config, concatenated in that order.
X = new_df[
    Config.cat_attribs
    + Config.num_attribs
    + Config.binary_columns
]

# Label column.
Y = new_df[Config.label]

In [6]:
# Pass the feature frame through DataPipeline, configured with the numeric and
# categorical column lists. Per the original note, this converts the features
# to numerical data.
# NOTE(review): process() sees every row before the train/test split. If it
# fits scalers or encoders on its input, test-set statistics reach the
# training data -- confirm against DataPipeline's implementation.
data_pipeline = DataPipeline(Config.num_attribs, Config.cat_attribs)
result = data_pipeline.process(X)

In [7]:
# Hold out 20% of the rows as a test set. Stratifying on Y keeps the label
# proportions the same in both partitions, and the fixed random_state makes
# the split repeatable across kernel restarts.
# (train_test_split is imported in the notebook's top import cell.)
X_train, X_test, y_train, y_test = train_test_split(
    result,
    Y,
    test_size=0.2,
    random_state=42,
    stratify=Y,
)

# Report the shape of each partition.
for split_name, split_data in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"{split_name} shape:", split_data.shape)

X_train shape: (22793, 141)
y_train shape: (22793,)
X_test shape: (5699, 141)
y_test shape: (5699,)


In [8]:
# Baseline classifier: logistic regression fitted with the SAG solver.
# SAG shuffles the data while fitting, so random_state is pinned; without it
# the scores below change from run to run.
# (LogisticRegression, KFold and cross_val_score are imported in the
# notebook's top import cell.)
lr_model = LogisticRegression(solver="sag", random_state=42)

# 5-fold cross-validation on the training partition only, using shuffled folds
# with a fixed seed so the fold assignment is repeatable.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# The metric is spelled out for the reader; accuracy is also what a
# classifier's default scorer reports, so the metric itself is unchanged.
scores = cross_val_score(lr_model, X_train, y_train, cv=cv, scoring="accuracy")

# Per-fold scores, followed by their mean and standard deviation.
print(f"Cross-Validation Scores: {scores}")
print(f"Mean Score: {scores.mean()}")
print(f"Standard Deviation: {scores.std()}")

Cross-Validation Scores: [0.69116034 0.69159903 0.6999342  0.67902589 0.7064502 ]
Mean Score: 0.6936339318309279
Standard Deviation: 0.009247749718676859
