In [26]:
import pandas as pd
import optuna


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, accuracy_score, confusion_matrix


In [17]:
X_train = pd.read_csv("data/x_train.csv", header=None)
X_test = pd.read_csv("data/x_text.csv", header=None)
y_train = pd.read_csv("data/y_train.csv", header=None)
y_test = pd.read_csv("data/y_test.csv", header=None)



print("X train ",X_train.shape)
print("X test",X_test.shape)
print("Y_Train", y_train.shape)
print("Y_test", y_test.shape)

X train  (3417, 1000)
X test (375, 1000)
Y_Train (3417, 1)
Y_test (375, 1)


In [18]:
print((y_train.iloc[:,0].value_counts() / y_train.shape[0]) * 100)


0
 1    55.5458
-1    44.4542
Name: count, dtype: float64


In [19]:

def objective(trial):
    C = trial.suggest_loguniform('C', 1e-5, 1e2)
    max_iter = trial.suggest_int('max_iter', 50, 500)
    solver = trial.suggest_categorical('solver', ['liblinear', 'lbfgs', 'saga'])
    penalty = trial.suggest_categorical('penalty', ['l2'])

    # Create and train the Logistic Regression model
    model = LogisticRegression(C=C, max_iter=max_iter, solver=solver, penalty=penalty, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    precision = precision_score(y_test, y_pred)

    return precision

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)

# Print the best hyperparameters found
print(f"Best hyperparameters: {study.best_params}")
print(f"Best precision: {study.best_value}")


# Ran for: 3m 7.8s

[I 2024-11-30 22:33:28,231] A new study created in memory with name: no-name-c5e4b6d5-5ed1-44d7-a2ff-0abd9a91aa68
  C = trial.suggest_loguniform('C', 1e-5, 1e2)
  y = column_or_1d(y, warn=True)
[I 2024-11-30 22:33:36,993] Trial 0 finished with value: 0.9027027027027027 and parameters: {'C': 0.0817002368471565, 'max_iter': 113, 'solver': 'saga', 'penalty': 'l2'}. Best is trial 0 with value: 0.9027027027027027.
  C = trial.suggest_loguniform('C', 1e-5, 1e2)
  y = column_or_1d(y, warn=True)
[I 2024-11-30 22:33:37,358] Trial 1 finished with value: 0.8986666666666666 and parameters: {'C': 1.8834308359153027e-05, 'max_iter': 470, 'solver': 'lbfgs', 'penalty': 'l2'}. Best is trial 0 with value: 0.9027027027027027.
  C = trial.suggest_loguniform('C', 1e-5, 1e2)
  y = column_or_1d(y, warn=True)
[I 2024-11-30 22:33:37,786] Trial 2 finished with value: 0.8983957219251337 and parameters: {'C': 5.397745686302095e-05, 'max_iter': 176, 'solver': 'lbfgs', 'penalty': 'l2'}. Best is trial 0 with value: 

Best hyperparameters: {'C': 0.006739078892927555, 'max_iter': 483, 'solver': 'liblinear', 'penalty': 'l2'}
Best accuracy: 0.9051490514905149


Best hyperparameters: {'C': 0.006739078892927555, 'max_iter': 483, 'solver': 'liblinear', 'penalty': 'l2'}
Best accuracy: 0.9051490514905149

In [20]:
# logistic regression with l2 regularization.
param = {'C': 0.006739078892927555, 'max_iter': 483, 'solver': 'liblinear', 'penalty': 'l2'}
model = LogisticRegression(**param)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

  y = column_or_1d(y, warn=True)


In [27]:

def classification_metrics(y_test, y_pred):
    print("="*50)
    print("The accuracy score", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1 score:", f1_score(y_test, y_pred))
    print("="*50)
    print(classification_report(y_test, y_pred))
    print("\n")
    print("="*50)
    print("confusion Matrix")
    print(confusion_matrix(y_test, y_pred))
    print("="*50)


classification_metrics(y_test, y_pred)

The accuracy score 0.8986666666666666
Precision: 0.9051490514905149
Recall: 0.9910979228486647
F1 score: 0.9461756373937678
              precision    recall  f1-score   support

          -1       0.50      0.08      0.14        38
           1       0.91      0.99      0.95       337

    accuracy                           0.90       375
   macro avg       0.70      0.54      0.54       375
weighted avg       0.86      0.90      0.86       375



confusion Matrix
[[  3  35]
 [  3 334]]


In [9]:
y_test.value_counts() / y_test.shape[0]

 1    0.898667
-1    0.101333
Name: count, dtype: float64

In [25]:
d = pd.DataFrame(y_pred)
d.value_counts()

 1    369
-1      6
Name: count, dtype: int64