In [None]:
# Logistic Regression
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load dataset
dataset = pd.read_csv('/content/ReducedTweets.csv')


def _fill_with_group_mode(group):
    """Fill the NaNs of one per-class group with that group's most frequent value.

    `Series.mode()` ignores NaN, so a group with no non-null values yields an
    empty result; such a group is returned unchanged instead of raising
    IndexError on `.iloc[0]`.
    """
    modes = group.mode()
    if modes.empty:
        return group
    return group.fillna(modes.iloc[0])


# Impute (not drop) missing values: every column that contains at least one
# NaN is filled with the mode of that column within the row's 'Class' group.
# NOTE(review): 'Class' is presumably the target label — confirm that using it
# for imputation before the train/test split is acceptable for this study.
cols_with_nan = dataset.columns[dataset.isna().any()].tolist()
for col in cols_with_nan:
    dataset[col] = dataset.groupby('Class')[col].transform(_fill_with_group_mode)

# Positional column layout used for modelling.
# NOTE(review): assumes columns 1..15 are the features and column 16 is the
# label — confirm against the CSV header.
FEATURE_COLUMNS = slice(1, 16)
LABEL_COLUMN = 16

X = dataset.iloc[:, FEATURE_COLUMNS].values
y = dataset.iloc[:, LABEL_COLUMN].values

# Take the names from the same slice as X so that feature_names[i] always
# describes column i of X (the previous 1:43 slice did not match X's width).
feature_names = dataset.columns[FEATURE_COLUMNS]

# Stratified 80/10/10 partition: first hold out 20% of the rows, then cut that
# hold-out in half to obtain the test and validation sets.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)
X_test, X_val, y_test, y_val = train_test_split(
    X_temp,
    y_temp,
    stratify=y_temp,
    random_state=42,
    test_size=0.5,
)

# Define the hyperparameters.
# Each penalty is paired with a solver that supports it: the default 'lbfgs'
# solver cannot fit an L1 penalty, so a plain {'penalty': ['l1', 'l2']} grid
# silently turns every L1 candidate into a failed fit that is never compared.
# GridSearchCV accepts a list of grids and explores each one in turn; the
# chosen 'solver' is returned in best_params_ together with 'C' and 'penalty'.
lr_c_values = [10**i for i in range(-3, 4)]
lr_parameters = [
    {'C': lr_c_values, 'penalty': ['l1'], 'solver': ['liblinear']},
    {'C': lr_c_values, 'penalty': ['l2'], 'solver': ['lbfgs']},
]

# Nested cross-validation strategy: the outer splitter produces the evaluation
# folds, the inner splitter is handed to the hyperparameter search. Both are
# configured identically (10 shuffled stratified folds, fixed seed).
cv_settings = {'n_splits': 10, 'shuffle': True, 'random_state': 42}
outer_cv = StratifiedKFold(**cv_settings)
inner_cv = StratifiedKFold(**cv_settings)

# Per-outer-fold accumulators, one independent list per metric
# (tuple unpacking creates four distinct list objects, not aliases).
lr_accuracy_scores, lr_precision_scores, lr_recall_scores, lr_f1_scores = [], [], [], []

# One entry per outer fold; each entry is that fold's list of per-feature scores
feature_importance_scores = list()

# Iteration budget for the logistic-regression solver. The default budget made
# lbfgs stop before converging on this data (see the ConvergenceWarning output).
# NOTE(review): standardising the features is the more robust remedy; it needs
# a scaler/pipeline that this notebook does not import yet.
LR_MAX_ITER = 1000

# Perform nested cross-validation
for train_index, test_index in outer_cv.split(X_train, y_train):
    X_train_fold, X_test_fold = X_train[train_index], X_train[test_index]
    y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]

    # Inner loop: tune the hyperparameters on the outer-training part only.
    lr_classifier = LogisticRegression(random_state=42, max_iter=LR_MAX_ITER)
    lr_grid_search = GridSearchCV(lr_classifier, lr_parameters, cv=inner_cv)
    lr_grid_search.fit(X_train_fold, y_train_fold)
    lr_best_params = lr_grid_search.best_params_

    # Full-feature predictions of the tuned model. GridSearchCV refits the best
    # candidate on the whole outer-training fold by default, so predict() here
    # uses that refitted estimator. These predictions drive the fold's metrics.
    lr_predictions = lr_grid_search.predict(X_test_fold)

    # Ablation Study: refit the tuned configuration with one feature removed at
    # a time and record the resulting accuracy on the outer-test fold.
    lr_best_model = LogisticRegression(
        random_state=42, max_iter=LR_MAX_ITER, **lr_best_params
    )
    ablation_scores = []
    for feature_index in range(X_train_fold.shape[1]):
        ablated_X_train = np.delete(X_train_fold, feature_index, axis=1)
        ablated_X_test = np.delete(X_test_fold, feature_index, axis=1)
        lr_best_model.fit(ablated_X_train, y_train_fold)
        ablated_predictions = lr_best_model.predict(ablated_X_test)
        ablation_scores.append(accuracy_score(y_test_fold, ablated_predictions))

    # Store the per-feature ablation accuracies for this fold
    # (entry i = accuracy obtained without feature i).
    feature_importance_scores.append(ablation_scores)

    # Evaluate performance metrics on the full-feature predictions. Previously
    # precision/recall/F1 were taken from whatever the last ablation iteration
    # left behind, and accuracy was the mean of the ablated accuracies.
    lr_accuracy_scores.append(accuracy_score(y_test_fold, lr_predictions))
    lr_precision_scores.append(precision_score(y_test_fold, lr_predictions))
    lr_recall_scores.append(recall_score(y_test_fold, lr_predictions))
    lr_f1_scores.append(f1_score(y_test_fold, lr_predictions))

# Aggregate the per-fold scores: mean and (population) standard deviation
# for each metric, kept under the same module-level names as before.
lr_average_accuracy, lr_average_precision, lr_average_recall, lr_average_f1 = (
    np.mean(fold_scores)
    for fold_scores in (
        lr_accuracy_scores,
        lr_precision_scores,
        lr_recall_scores,
        lr_f1_scores,
    )
)
lr_std_accuracy, lr_std_precision, lr_std_recall, lr_std_f1 = (
    np.std(fold_scores)
    for fold_scores in (
        lr_accuracy_scores,
        lr_precision_scores,
        lr_recall_scores,
        lr_f1_scores,
    )
)

# Report every metric as an "Average" line followed by a "Standard Deviation"
# line, rounded to four decimals.
print("\nLogistic Regression:")
for metric_label, metric_mean, metric_std in (
    ("Accuracy", lr_average_accuracy, lr_std_accuracy),
    ("Precision", lr_average_precision, lr_std_precision),
    ("Recall", lr_average_recall, lr_std_recall),
    ("F1-score", lr_average_f1, lr_std_f1),
):
    print(f"Average {metric_label}:", round(metric_mean, 4))
    print(f"Standard Deviation {metric_label}:", round(metric_std, 4))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for al


Logistic Regression:
Average Accuracy: 0.849
Standard Deviation Accuracy: 0.0071
Average Precision: 0.862
Standard Deviation Precision: 0.0124
Average Recall: 0.8345
Standard Deviation Recall: 0.0166
Average F1-score: 0.8479
Standard Deviation F1-score: 0.0087


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# feature_importance_scores holds, for every outer fold, the accuracy obtained
# with each single feature removed. Average those accuracies over the folds.
average_importance_scores = np.mean(feature_importance_scores, axis=0)

# A feature matters more the further accuracy falls without it, so the ranking
# is ascending in ablated accuracy (the previous descending order listed the
# features whose removal hurt the least). A stable sort keeps ties in column
# order so the ranking is reproducible.
sorted_indices = np.argsort(average_importance_scores, kind='stable')
best_feature_indices = sorted_indices

num_features_to_display = 10
# Slicing clamps automatically when fewer than num_features_to_display exist.
X_best_features = X[:, best_feature_indices[:num_features_to_display]]

print("\nTop Feature Importance:")
print("(mean accuracy with the feature removed; lower = more important)")
for feature_index in sorted_indices[:num_features_to_display]:
    importance_score = round(average_importance_scores[feature_index], 5)
    print(f"{feature_names[feature_index]}: {importance_score}")


Top Feature Importance:
total_characters: 0.85333
count_punctuations: 0.85333
count_special_characters: 0.85317
count_unique_words: 0.853
flesch_reading_ease: 0.85233
num_uppercase_chars: 0.85225
gunning_fog: 0.85158
linsear_write: 0.8515
num_lowercase_chars: 0.85033
ari: 0.848
