In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV

In [2]:
df_raw = pd.read_csv("baf.csv")

In [3]:
df_split = df_raw.iloc[:, 0].str.split(';', expand=True)


In [4]:
columns = ["age", "job", "marital", "education", "default", "housing", "loan", 
           "contact", "month", "day_of_week", "duration", "campaign", "pdays", 
           "previous", "poutcome", "emp.var.rate", "cons.price.idx", 
           "cons.conf.idx", "euribor3m", "nr.employed", "y"]
df_split.columns = columns


In [5]:
df_split.columns = df_split.columns.str.replace('"', '')
df_split = df_split.applymap(lambda x: x.replace('"', '') if isinstance(x, str) else x)


In [6]:
print("Column names in DataFrame:", df_split.columns)

Column names in DataFrame: Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')


In [7]:
target_column = 'day_of_week'

In [8]:
if target_column in df_split.columns:
    y = df_split[target_column]
    X = df_split.drop(target_column, axis=1)
else:
    print(f"Column '{target_column}' not found in the DataFrame. Available columns are: {df_split.columns}")


In [9]:
X = pd.get_dummies(X, drop_first=True)

In [10]:
print("First few rows of feature set (X):")
print(X.head())
print("First few rows of target (y):")
print(y.head())

First few rows of feature set (X):
   age_18  age_19  age_20  age_21  age_22  age_23  age_24  age_25  age_26  \
0   False   False   False   False   False   False   False   False   False   
1   False   False   False   False   False   False   False   False   False   
2   False   False   False   False   False   False   False   False   False   
3   False   False   False   False   False   False   False   False   False   
4   False   False   False   False   False   False   False   False   False   

   age_27  ...  nr.employed_5008.7  nr.employed_5017.5  nr.employed_5023.5  \
0   False  ...               False               False               False   
1   False  ...               False               False               False   
2   False  ...               False               False               False   
3   False  ...               False               False               False   
4   False  ...               False               False               False   

   nr.employed_5076.2  nr.employe

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [12]:
scaler= StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

In [13]:
log_model = LogisticRegression()
log_model.fit(X_train, y_train)
log_pred = log_model.predict(X_test)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)
tree_pred = tree_model.predict(X_test)

In [15]:
forest_model = RandomForestClassifier()
forest_model.fit(X_train, y_train)
forest_pred = forest_model.predict(X_test)

In [16]:
print("Logistic Regression:\n", classification_report(y_test, log_pred))
print("Decision Tree:\n", classification_report(y_test, tree_pred))
print("Random Forest:\n", classification_report(y_test, forest_pred))


Logistic Regression:
               precision    recall  f1-score   support

         fri       0.68      0.67      0.67      2313
         mon       0.69      0.73      0.71      2602
         thu       0.76      0.76      0.76      2603
         tue       0.70      0.70      0.70      2436
         wed       0.64      0.60      0.62      2403

    accuracy                           0.69     12357
   macro avg       0.69      0.69      0.69     12357
weighted avg       0.69      0.69      0.69     12357

Decision Tree:
               precision    recall  f1-score   support

         fri       0.73      0.72      0.73      2313
         mon       0.74      0.75      0.74      2602
         thu       0.83      0.82      0.83      2603
         tue       0.71      0.73      0.72      2436
         wed       0.70      0.69      0.70      2403

    accuracy                           0.75     12357
   macro avg       0.74      0.74      0.74     12357
weighted avg       0.75      0.75      

In [17]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [4, 6, 8],
    'criterion': ['gini', 'entropy']
}
grid_search = GridSearchCV(estimator=forest_model, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

90 fits failed out of a total of 270.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
62 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\parvj\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\parvj\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "c:\Users\parvj\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\parvj\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParamete

Best Parameters: {'criterion': 'entropy', 'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 300}


In [18]:
optimized_forest_model = RandomForestClassifier(**best_params)
optimized_forest_model.fit(X_train, y_train)
optimized_forest_pred = optimized_forest_model.predict(X_test)

In [19]:
print("Optimized Random Forest:\n", classification_report(y_test, optimized_forest_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, optimized_forest_pred))
print("Accuracy Score:\n", accuracy_score(y_test, optimized_forest_pred))

Optimized Random Forest:
               precision    recall  f1-score   support

         fri       0.86      0.56      0.68      2313
         mon       0.67      0.77      0.72      2602
         thu       0.59      0.86      0.70      2603
         tue       0.76      0.64      0.69      2436
         wed       0.74      0.61      0.67      2403

    accuracy                           0.69     12357
   macro avg       0.72      0.69      0.69     12357
weighted avg       0.72      0.69      0.69     12357

Confusion Matrix:
 [[1288  320  360   86  259]
 [   1 2015  312  143  131]
 [ 122   39 2239   78  125]
 [   4  367  504 1548   13]
 [  88  262  405  183 1465]]
Accuracy Score:
 0.6923201424293922
