# Logistic Regression

### Data Preprocessing

In [25]:
# Imports
import os
import sys
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay


In [26]:
# Make the project's `scripts` package importable.
# The notebook's working directory is one level below the folder that
# contains `scripts`, so the parent directory is put first on sys.path.
scripts_directory = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir)
)
sys.path.insert(0, scripts_directory)
from scripts import data_cleaning

In [39]:
# Load in the training set
# (path is relative to the notebook's working directory; this frame feeds the
# cleaning step and the cross-validated grid search below)
data = pd.read_csv("../data/train.csv")

In [40]:
# Apply the data cleaning function
# `clean_data` lives in the project's scripts/data_cleaning module; its result
# is bound to a new name rather than overwriting `data`.
# NOTE(review): later cells rely on the cleaned frame containing the
# `rating_rejected` and `comment_text` columns — confirm against clean_data.
data_clean = data_cleaning.clean_data(data)

In [41]:
# Split the cleaned frame into the label column and everything else
y = data_clean['rating_rejected']
X = data_clean.drop(columns="rating_rejected")

In [42]:
# Drop comment text feature
# Build X from `data_clean` instead of from X itself so that this cell is
# idempotent: the previous `X = X.drop('comment_text', axis=1)` raised a
# KeyError when the cell was run a second time, because the column was
# already gone. The resulting frame is the same: every cleaned column except
# the target and the raw comment text.
X = data_clean.drop(columns=['rating_rejected', 'comment_text'])

In [43]:
# Cross-validation splitter: four shuffled folds with a fixed seed so the
# fold assignment is reproducible between runs
cv = KFold(
    n_splits=4,
    shuffle=True,
    random_state=42,
)

In [44]:
# Initialize a scaler
# Default-configured StandardScaler; it becomes the 'scaler' step of the
# pipeline built below, ahead of the logistic regression.
scaler = StandardScaler()

In [None]:
# Initialize a logistic regression object
# - penalty='elasticnet' requires the 'saga' solver; the l1_ratio it needs is
#   supplied by the grid search (`log_reg__l1_ratio`).
# - max_iter=1000 matches the fitted estimator recorded in the grid-search
#   output. Without it the default of 100 iterations is used, which does not
#   reproduce that run and gives saga far less room to converge.
log_reg = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    max_iter=1000,
    random_state=42,
)

In [89]:
# Chain scaling and the classifier into a single estimator.
# The step names ('scaler', 'log_reg') are the prefixes used when addressing
# hyperparameters, e.g. 'log_reg__C'.
steps = [('scaler', scaler), ('log_reg', log_reg)]

regression_pip = Pipeline(steps=steps)

In [47]:
# Hyperparameter grid for the 'log_reg' pipeline step:
#   c        -> candidate values for C
#   l1_ratio -> elastic-net mixing values, from 0 through 1.0
c = [0.1, 1, 10, 100]
l1_ratio = [0, 0.1, 0.5, 0.9, 1.0]

# Keys follow the '<step name>__<parameter>' convention of the pipeline
params = dict(log_reg__C=c, log_reg__l1_ratio=l1_ratio)

In [48]:
# Metrics tracked during the grid search; each result label maps to the
# scorer string of the same name
scoring = {metric: metric for metric in ('accuracy', 'f1')}

### Model Training and Optimization

In [90]:
# Grid search over `params` using the shared KFold splitter.
# Both metrics in `scoring` are recorded; the winning configuration is chosen
# and refit by accuracy. n_jobs=-1 uses all available cores.
grid = GridSearchCV(
    estimator=regression_pip,
    param_grid=params,
    scoring=scoring,
    refit='accuracy',
    cv=cv,
    n_jobs=-1,
)

In [91]:
# Run the grid search
# Evaluates every C / l1_ratio combination on the KFold splits, then refits
# the best-accuracy pipeline on all of X, y (refit='accuracy').
grid.fit(X, y)

0,1,2
,estimator,Pipeline(step...ver='saga'))])
,param_grid,"{'log_reg__C': [0.1, 1, ...], 'log_reg__l1_ratio': [0, 0.1, ...]}"
,scoring,"{'accuracy': 'accuracy', 'f1': 'f1'}"
,n_jobs,-1
,refit,'accuracy'
,cv,KFold(n_split... shuffle=True)
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'elasticnet'
,dual,False
,tol,0.0001
,C,1
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'saga'
,max_iter,1000


### Evaluate Performance 

In [92]:
# Analyze grid search results

grid_results = grid.cv_results_

grid_df = pd.DataFrame(grid_results)

# mean_fit_time / mean_score_time are per-split averages, so multiply by the
# number of CV splits to approximate the total time spent on each candidate.
# Use the count recorded on the fitted search instead of a hard-coded 4, so
# the figure stays right if the CV configuration changes.
grid_df['total_time'] = (grid_df['mean_fit_time'] + grid_df['mean_score_time']) * grid.n_splits_

# Keep only the columns of interest
columns = ['total_time', 'param_log_reg__C', 'param_log_reg__l1_ratio', 'mean_test_accuracy', 'mean_test_f1', 'rank_test_accuracy']
grid_df = grid_df[columns]

# Give them readable headers
column_names = {
    'total_time': 'Time',
    'param_log_reg__C': 'C',
    'param_log_reg__l1_ratio': 'L1 Ratio',
    'mean_test_accuracy': 'Accuracy',
    'mean_test_f1': 'F1 Score',
    'rank_test_accuracy': 'Rank'}
grid_df = grid_df.rename(columns=column_names)

# Best (rank 1) candidates first
grid_df = grid_df.sort_values(by='Rank')

print("Model Results Table")
print(grid_df)


Model Results Table
         Time      C  L1 Ratio  Accuracy  F1 Score  Rank
9   32.004297    1.0       1.0  0.935168  0.077161     1
17  33.063114  100.0       0.5  0.935168  0.077161     1
16  32.698971  100.0       0.1  0.935168  0.077161     1
15  24.936480  100.0       0.0  0.935168  0.077161     1
14  30.650036   10.0       1.0  0.935168  0.077161     1
13  30.659848   10.0       0.9  0.935168  0.077161     1
12  30.624847   10.0       0.5  0.935168  0.077161     1
11  31.439581   10.0       0.1  0.935168  0.077161     1
10  24.595890   10.0       0.0  0.935168  0.077161     1
19  25.422564  100.0       1.0  0.935168  0.077161     1
8   34.995411    1.0       0.9  0.935168  0.077161     1
18  30.104901  100.0       0.9  0.935168  0.077161     1
1   32.736836    0.1       0.1  0.935167  0.076897    13
0   24.889808    0.1       0.0  0.935167  0.076897    13
7   33.749648    1.0       0.5  0.935167  0.077144    15
6   35.724446    1.0       0.1  0.935167  0.077127    16
5   27.0362

In [None]:
# Evaluate information from the best model

best_model = grid.best_estimator_
best_params = grid.best_params_

# best_params_ is keyed by the names used in the parameter grid
# ('log_reg__C', 'log_reg__l1_ratio'). The 'param_' prefix only exists on the
# cv_results_ columns, so looking it up here raised a KeyError.
best_C = best_params['log_reg__C']
best_l1_ratio = best_params['log_reg__l1_ratio']

# With refit='accuracy', best_score_ is the mean cross-validated accuracy of
# the selected candidate and best_index_ is its row in cv_results_.
best_score = grid.best_score_
best_model_index = grid.best_index_

print("Optimal Model:")
print(f"  C: {best_C}")
print(f"  L1 Ratio: {best_l1_ratio}")
print(f"  Mean CV Accuracy: {best_score:.6f}")
print(f"  Index in cv_results_: {best_model_index}")


In [None]:
# Calculate performance metrics for the best model

# accuracy_score compares true labels with predicted labels; passing only the
# estimator raised a TypeError, so generate predictions first.
# NOTE: best_model was refit on all of X, so this is in-sample (training)
# accuracy. The cross-validated estimate is grid.best_score_.
y_pred = best_model.predict(X)
acc = accuracy_score(y, y_pred)