In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from nltk.corpus import stopwords
from textblob import TextBlob
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from nltk.stem import PorterStemmer
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from scipy.sparse import hstack
import matplotlib.pyplot as plt
from sklearn.model_selection import validation_curve
from sklearn.model_selection import learning_curve
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from scipy.sparse import hstack
import pandas as pd

# UNIGRAM & BIGRAM with/without dummies

In [None]:
# Load the CSV file into a DataFrame
data = pd.read_csv('data_cleaned.csv')

def process_column_both_grams(column_name, data):
    print(f"Processing column {column_name} without dummies...")
    data_cleaned = data.dropna(subset=[column_name])
    y = data_cleaned['Article_Gender']
    results = {}
    
    for ngram, label in [((1, 1), "ug"), ((1, 2), "bg")]:
        print(f"Processing for ngram {ngram}...")
        tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=ngram)
        X = tfidf_vectorizer.fit_transform(data_cleaned[column_name])
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Oversampling to address class imbalance
        data_female = data_cleaned[data_cleaned['Article_Gender'] == 'Female']
        data_male = data_cleaned[data_cleaned['Article_Gender'] == 'Male']
        data_female_oversampled = resample(data_female, replace=True, n_samples=len(data_male), random_state=42)
        data_oversampled = pd.concat([data_male, data_female_oversampled])
        X_oversampled = tfidf_vectorizer.transform(data_oversampled[column_name])
        y_oversampled = data_oversampled['Article_Gender']
        X_train_os, X_test_os, y_train_os, y_test_os = train_test_split(X_oversampled, y_oversampled, test_size=0.2, random_state=42)

        # Hyperparameter tuning for Logistic Regression
        logreg_params = {
            'C': [0.001, 0.01, 0.1, 1, 10, 100, 500, 1000],
            'penalty': ['l1', 'l2']
        }
        grid_search_logreg = GridSearchCV(LogisticRegression(max_iter=1000, class_weight='balanced', solver='saga'),
                                          logreg_params, 
                                          cv=5, 
                                          scoring='accuracy', 
                                          n_jobs=-1)
        grid_search_logreg.fit(X_train_os, y_train_os)
        best_params_logreg = grid_search_logreg.best_params_
        best_score_logreg = grid_search_logreg.best_score_

        # Hyperparameter tuning optimized for predicting 'Female'
        custom_scorer = make_scorer(precision_score, pos_label='Female', zero_division=0)
        param_grid_custom = {
            'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 500, 1000],
            'penalty': ['l1', 'l2'],
        }
        grid_search_custom = GridSearchCV(LogisticRegression(max_iter=1000, class_weight='balanced', solver='saga'), param_grid_custom, scoring=custom_scorer, cv=5)
        grid_search_custom.fit(X, y)
        best_params_custom = grid_search_custom.best_params_
        best_score_custom = grid_search_custom.best_score_

        results[label] = {
            f'best_params_logreg_{label}': best_params_logreg,
            f'best_score_logreg_{label}': best_score_logreg,
            f'best_params_custom_{label}': best_params_custom,
            f'best_score_custom_{label}': best_score_custom
        }

    return results

def process_column_both_grams_with_dummies(column_name, data):
    print(f"Processing column {column_name} with dummies...")
    data_cleaned = data.dropna(subset=[column_name])
    y = data_cleaned['Article_Gender']
    journal_dummies = pd.get_dummies(data_cleaned['Journal Name'], prefix='Journal')
    results = {}

    for ngram, label in [((1, 1), "ug"), ((1, 2), "bg")]:
        print(f"Processing for ngram {ngram}...")
        tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=ngram)
        X = tfidf_vectorizer.fit_transform(data_cleaned[column_name])
        X_combined = hstack([X, journal_dummies.values])
        X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

        # Oversampling to address class imbalance
        data_female = data_cleaned[data_cleaned['Article_Gender'] == 'Female']
        data_male = data_cleaned[data_cleaned['Article_Gender'] == 'Male']
        data_female_oversampled = resample(data_female, replace=True, n_samples=len(data_male), random_state=42)
        data_oversampled = pd.concat([data_male, data_female_oversampled])
        X_oversampled = tfidf_vectorizer.transform(data_oversampled[column_name])
        X_combined_oversampled = hstack([X_oversampled, journal_dummies.loc[data_oversampled.index].values])
        y_oversampled = data_oversampled['Article_Gender']
        X_train_os, X_test_os, y_train_os, y_test_os = train_test_split(X_combined_oversampled, y_oversampled, test_size=0.2, random_state=42)

        # Hyperparameter tuning for Logistic Regression
        logreg_params = {
            'C': [0.001, 0.01, 0.1, 1, 10, 100, 500, 1000],
            'penalty': ['l1', 'l2']
        }
        grid_search_logreg = GridSearchCV(LogisticRegression(max_iter=1000, class_weight='balanced', solver='saga'),
                                          logreg_params, 
                                          cv=5, 
                                          scoring='accuracy', 
                                          n_jobs=-1)
        grid_search_logreg.fit(X_train_os, y_train_os)
        best_params_logreg = grid_search_logreg.best_params_
        best_score_logreg = grid_search_logreg.best_score_

        # Hyperparameter tuning optimized for predicting 'Female'
        custom_scorer = make_scorer(precision_score, pos_label='Female', zero_division=0)
        param_grid_custom = {
            'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 500, 1000],
            'penalty': ['l1', 'l2'],
        }
        grid_search_custom = GridSearchCV(LogisticRegression(max_iter=1000, class_weight='balanced', solver='saga'), param_grid_custom, scoring=custom_scorer, cv=5)
        grid_search_custom.fit(X_combined, y)
        best_params_custom = grid_search_custom.best_params_
        best_score_custom = grid_search_custom.best_score_

        results[label] = {
            f'best_params_logreg_{label}': best_params_logreg,
            f'best_score_logreg_{label}': best_score_logreg,
            f'best_params_custom_{label}': best_params_custom,
            f'best_score_custom_{label}': best_score_custom
        }

    return results

columns_to_process = ['Intro_Cleaned', 'Intro_1', 'Intro_2', 'Intro_3']

results_all = {}

for col in columns_to_process:
    results_all[col] = process_column_both_grams(col, data)
    results_all[f"{col}_with_dummies"] = process_column_both_grams_with_dummies(col, data)

# Display the results in a structured manner
pp = pprint.PrettyPrinter(indent=4)
for col in columns_to_process:
    print(f"\nResults without Dummies for '{col}':")
    pp.pprint(results_all[col])
    print(f"\nResults with Dummies for '{col}':")
    pp.pprint(results_all[f"{col}_with_dummies"])


Processing column Intro_Cleaned without dummies...
Processing for ngram (1, 1)...


In [47]:
import pickle

# After obtaining the results
with open('results_all.pkl', 'wb') as f:
    pickle.dump(results_all, f)

# ... Later, to retrieve the results
with open('results_all.pkl', 'rb') as f:
    results_all_retrieved = pickle.load(f)

# Now you can use `results_all_retrieved` as if it were the original `results_all` dictionary

In [None]:
# Baseline Accuracy Calculation
baseline_accuracy_data_cleaned = max(data_cleaned['Article_Gender'].value_counts(normalize=True))
baseline_accuracy_intro_1 = max(data['Intro_1'].value_counts(normalize=True))
baseline_accuracy_intro_2 = max(data['Intro_2'].value_counts(normalize=True))
baseline_accuracy_intro_3 = max(data['Intro_3'].value_counts(normalize=True))

# List of accuracies for each model type
labels = [
    "Unigram", 
    "Bigram", 
    "Unigram with Dummies", 
    "Bigram with Dummies",
    "Baseline"
]

def plot_accuracies(results, results_dummies, baseline_accuracy, title, ylabel):
    plt.figure(figsize=(15, 7))
    plt.bar(labels, [
        results['ug']['best_score_logreg_ug'], 
        results['bg']['best_score_logreg_bg'],
        results_dummies['ug']['best_score_logreg_ug'], 
        results_dummies['bg']['best_score_logreg_bg'],
        baseline_accuracy
    ], color=['blue', 'green', 'purple', 'cyan', 'red'], alpha=0.7)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.ylim([0.5, 1.0])
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# 1. Best Hyperparameter Tuned Accuracy for data_cleaned
plot_accuracies(results_intro_cleaned, results_intro_cleaned_dummies, baseline_accuracy_data_cleaned, 
                "Comparison of Best Hyperparameter Tuned Model Accuracies for data_cleaned", 
                "Best Tuned Accuracy")

# Best Hyperparameter Tuned Accuracy for Intro_1
plot_accuracies(results_intro_1, results_intro_1_dummies, baseline_accuracy_intro_1, 
                "Comparison of Best Hyperparameter Tuned Model Accuracies for Intro_1", 
                "Best Tuned Accuracy")

# Best Hyperparameter Tuned Accuracy for Intro_2
plot_accuracies(results_intro_2, results_intro_2_dummies, baseline_accuracy_intro_2, 
                "Comparison of Best Hyperparameter Tuned Model Accuracies for Intro_2", 
                "Best Tuned Accuracy")

# Best Hyperparameter Tuned Accuracy for Intro_3
plot_accuracies(results_intro_3, results_intro_3_dummies, baseline_accuracy_intro_3, 
                "Comparison of Best Hyperparameter Tuned Model Accuracies for Intro_3", 
                "Best Tuned Accuracy")

In [48]:
def plot_learning_curve(estimator, title, X, y, cv=None):
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5))

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.figure()
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
    plt.legend(loc="best")
    plt.grid(True)
    plt.show()

# Assuming logreg_balanced is already defined
logreg_balanced = LogisticRegression(max_iter=2000, class_weight='balanced', solver='saga')

# List to hold datasets and corresponding labels
datasets = [
    (X_oversampled_ug, y_oversampled_ug, "data_cleaned Unigram"),
    (X_oversampled_bg, y_oversampled_bg, "data_cleaned Bigram"),
    (X_oversampled_dum_ug, y_oversampled_dum_ug, "data_cleaned Unigram with Dummies"),
    (X_oversampled_dum_bg, y_oversampled_dum_bg, "data_cleaned Bigram with Dummies"),
    (X_oversampled_ug_intro_1, y_oversampled_ug_intro_1, "Intro_1 Unigram"),
    (X_oversampled_bg_intro_1, y_oversampled_bg_intro_1, "Intro_1 Bigram"),
    (X_oversampled_dum_ug_intro_1, y_oversampled_dum_ug_intro_1, "Intro_1 Unigram with Dummies"),
    (X_oversampled_dum_bg_intro_1, y_oversampled_dum_bg_intro_1, "Intro_1 Bigram with Dummies"),
    (X_oversampled_ug_intro_2, y_oversampled_ug_intro_2, "Intro_2 Unigram"),
    (X_oversampled_bg_intro_2, y_oversampled_bg_intro_2, "Intro_2 Bigram"),
    (X_oversampled_dum_ug_intro_2, y_oversampled_dum_ug_intro_2, "Intro_2 Unigram with Dummies"),
    (X_oversampled_dum_bg_intro_2, y_oversampled_dum_bg_intro_2, "Intro_2 Bigram with Dummies"),
    (X_oversampled_ug_intro_3, y_oversampled_ug_intro_3, "Intro_3 Unigram"),
    (X_oversampled_bg_intro_3, y_oversampled_bg_intro_3, "Intro_3 Bigram"),
    (X_oversampled_dum_ug_intro_3, y_oversampled_dum_ug_intro_3, "Intro_3 Unigram with Dummies"),
    (X_oversampled_dum_bg_intro_3, y_oversampled_dum_bg_intro_3, "Intro_3 Bigram with Dummies")
]

# Iterate through each dataset and plot
for X_data, y_data, label in datasets:
    # Plot learning curve
    title = f"Learning Curve (Logistic Regression, {label})"
    plot_learning_curve(logreg_balanced, title, X_data, y_data, cv=5)
    plt.show()

    # Validation curve for the C parameter in logistic regression
    param_range = [0.001, 0.01, 0.1, 1, 10, 100, 500, 1000]
    train_scores, test_scores = validation_curve(
        LogisticRegression(max_iter=2000, class_weight='balanced', solver='saga'),
        X_data, y_data, param_name="C", param_range=param_range, cv=5, scoring="accuracy", n_jobs=-1)
    
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    
    plt.title(f"Validation Curve with Logistic Regression ({label})")
    plt.xlabel("C")
    plt.ylabel("Score")
    plt.ylim(0.5, 1.1)
    plt.semilogx(param_range, train_scores_mean, label="Training score", color="r")
    plt.fill_between(param_range, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.2, color="r")
    plt.semilogx(param_range, test_scores_mean, label="Cross-validation score", color="g")
    plt.fill_between(param_range, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.2, color="g")
    plt.legend(loc="best")
    plt.show()

NameError: name 'X_oversampled_ug' is not defined

# Old version

In [None]:
from sklearn.metrics import make_scorer, precision_score

def process_column_both_grams(column_name, data):
    print(f"Processing column {column_name} without dummies...")
    # Step 1: Data Preprocessing
    print("Step 1: Data Preprocessing...")
    data_cleaned = data.dropna(subset=[column_name])
    y = data_cleaned['Article_Gender']

    results = {}  # Dictionary to store results

    for ngram, label in [((1, 1), "ug"), ((1, 2), "bg")]:
        print(f"Processing for ngram {ngram}...")
        # Step 2: Vectorization
        print("Step 2: Vectorization...")
        tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=ngram)
        X = tfidf_vectorizer.fit_transform(data_cleaned[column_name])

        # Step 3: Train-Test Split
        print("Step 3: Train-Test Split...")
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Step 4: Model Building & Regularization
        print("Step 4: Model Building & Regularization")
        logreg = LogisticRegression(max_iter=1000)
        logreg.fit(X_train, y_train)

        # Step 5: Basic Model Validation
        print("Step 5: Basic Model Validation")
        y_pred = logreg.predict(X_test)
        basic_accuracy = accuracy_score(y_test, y_pred)

        # Step 6: Cross-Validation
        print("Step 6: Cross-Validation")
        cv_scores = cross_val_score(logreg, X, y, cv=5, scoring='accuracy')
        avg_cv_accuracy = cv_scores.mean()

        # Step 7: Addressing Class Imbalance & Feature Engineering
        print("Step 7: Addressing Class Imbalance & Feature Engineering")
        data_female = data_cleaned[data_cleaned['Article_Gender'] == 'Female']
        data_male = data_cleaned[data_cleaned['Article_Gender'] == 'Male']
        data_female_oversampled = resample(data_female, replace=True, n_samples=len(data_male), random_state=42)
        data_oversampled = pd.concat([data_male, data_female_oversampled])
        X_oversampled = tfidf_vectorizer.transform(data_oversampled[column_name])
        y_oversampled = data_oversampled['Article_Gender']
        X_train_os, X_test_os, y_train_os, y_test_os = train_test_split(X_oversampled, y_oversampled, test_size=0.2, random_state=42)

        # Step 8: Complex Models & Cost-sensitive Learning
        print("Step 8: Complex Models & Cost-sensitive Learning")
        logreg_balanced = LogisticRegression(max_iter=1000, class_weight='balanced')
        logreg_balanced.fit(X_train_os, y_train_os)
        y_pred_os = logreg_balanced.predict(X_test_os)
        accuracy_os = accuracy_score(y_test_os, y_pred_os)

        # Step 9: Performance Evaluation
        print("Step 9: Performance Evaluation")
        evaluation_metrics_logreg = classification_report(y_test_os, y_pred_os, target_names=["Female", "Male"])

        # Step 10: Hyperparameter tuning for Logistic Regression
        print("Step 10: Hyperparameter tuning for Logistic Regression")
        logreg_params = {
            'C': [0.001, 0.01, 0.1, 1, 10, 100, 500, 1000],
            'penalty': ['l1', 'l2']
        }
        grid_search_logreg = GridSearchCV(LogisticRegression(max_iter=2000, class_weight='balanced', solver='saga'),
                                          logreg_params, 
                                          cv=5, 
                                          scoring='accuracy', 
                                          n_jobs=-1)
        grid_search_logreg.fit(X_train_os, y_train_os)
        best_params_logreg = grid_search_logreg.best_params_
        best_score_logreg = grid_search_logreg.best_score_

        # Step 11: Hyperparameter tuning optimized for predicting 'Female'
        print("Step 11: Hyperparameter tuning optimized for predicting 'Female'")
        custom_scorer = make_scorer(precision_score, pos_label='Female', zero_division=0)
        model = LogisticRegression()
        param_grid_custom = {
            'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 500, 1000],
            'penalty': ['l1', 'l2'],
            'max_iter': [50, 400, 800, 1200]
        }
        grid_search_custom = GridSearchCV(model, param_grid_custom, scoring=custom_scorer, cv=5)
        grid_search_custom.fit(X, y)
        best_params_custom = grid_search_custom.best_params_
        best_score_custom = grid_search_custom.best_score_

        results[label] = {
            f'basic_accuracy_{label}': basic_accuracy,
            f'avg_cv_accuracy_{label}': avg_cv_accuracy,
            f'accuracy_os_{label}': accuracy_os,
            f'evaluation_metrics_logreg_{label}': evaluation_metrics_logreg,
            f'best_params_logreg_{label}': best_params_logreg,
            f'best_score_logreg_{label}': best_score_logreg,
            f'best_params_custom_{label}': best_params_custom,
            f'best_score_custom_{label}': best_score_custom
        }

    print(f"Finished processing column {column_name} without dummies.")
    return results

def process_column_both_grams_with_dummies(column_name, data):
    print(f"Processing column {column_name} with dummies...")
    # Step 1: Data Preprocessing
    print("Step 1: Data Preprocessing...")
    data_cleaned = data.dropna(subset=[column_name])
    y = data_cleaned['Article_Gender']
    journal_dummies = pd.get_dummies(data_cleaned['Journal_name'], prefix='Journal')
    results = {}

    for ngram, label in [((1, 1), "ug"), ((1, 2), "bg")]:
        print(f"Processing for ngram {ngram}...")
        # Step 2: Vectorization
        print("Step 2: Vectorization...")
        tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=ngram)
        X = tfidf_vectorizer.fit_transform(data_cleaned[column_name])
        X_combined = hstack([X, journal_dummies.values])

        # Step 3: Train-Test Split
        print("Step 3: Train-Test Split...")
        X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

        # Step 4: Model Building & Regularization
        print("Step 4: Model Building & Regularization...")
        logreg = LogisticRegression(max_iter=1000)
        logreg.fit(X_train, y_train)

        # Step 5: Basic Model Validation
        print("Step 5: Basic Model Validation...")
        y_pred = logreg.predict(X_test)
        basic_accuracy = accuracy_score(y_test, y_pred)

        # Step 6: Cross-Validation
        print("Step 6: Cross-Validation...")
        cv_scores = cross_val_score(logreg, X_combined, y, cv=5, scoring='accuracy')
        avg_cv_accuracy = cv_scores.mean()

        # Step 7: Addressing Class Imbalance & Feature Engineering
        print("Step 7: Addressing Class Imbalance & Feature Engineering...")
        data_female = data_cleaned[data_cleaned['Article_Gender'] == 'Female']
        data_male = data_cleaned[data_cleaned['Article_Gender'] == 'Male']
        data_female_oversampled = resample(data_female, replace=True, n_samples=len(data_male), random_state=42)
        data_oversampled = pd.concat([data_male, data_female_oversampled])
        X_oversampled = tfidf_vectorizer.transform(data_oversampled[column_name])
        X_combined_oversampled = hstack([X_oversampled, journal_dummies.loc[data_oversampled.index].values])
        y_oversampled = data_oversampled['Article_Gender']
        X_train_os, X_test_os, y_train_os, y_test_os = train_test_split(X_combined_oversampled, y_oversampled, test_size=0.2, random_state=42)

        # Step 8: Complex Models & Cost-sensitive Learning
        print("Step 8: Complex Models & Cost-sensitive Learning...")
        logreg_balanced = LogisticRegression(max_iter=1000, class_weight='balanced')
        logreg_balanced.fit(X_train_os, y_train_os)
        y_pred_os = logreg_balanced.predict(X_test_os)
        accuracy_os = accuracy_score(y_test_os, y_pred_os)

        # Step 9: Performance Evaluation
        print("Step 9: Performance Evaluation...")
        evaluation_metrics_logreg = classification_report(y_test_os, y_pred_os, target_names=["Female", "Male"])

        # Step 10: Hyperparameter tuning for Logistic Regression
        print("Step 10: Hyperparameter tuning for Logistic Regression...")
        logreg_params = {
            'C': [0.001, 0.01, 0.1, 1, 10, 100, 500, 1000],
            'penalty': ['l1', 'l2']
        }
        grid_search_logreg = GridSearchCV(LogisticRegression(max_iter=1000, class_weight='balanced', solver='saga'),
                                          logreg_params, 
                                          cv=5, 
                                          scoring='accuracy', 
                                          n_jobs=-1)
        grid_search_logreg.fit(X_train_os, y_train_os)
        best_params_logreg = grid_search_logreg.best_params_
        best_score_logreg = grid_search_logreg.best_score_

        # Step 11: Hyperparameter tuning optimized for predicting 'Female'
        print("Step 11: Hyperparameter tuning optimized for predicting 'Female'...")
        custom_scorer = make_scorer(precision_score, pos_label='Female', zero_division=0)
        model = LogisticRegression()
        param_grid_custom = {
            'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 500, 1000],
            'penalty': ['l1', 'l2'],
            'max_iter': [50, 400, 800, 1200]
        }
        grid_search_custom = GridSearchCV(model, param_grid_custom, scoring=custom_scorer, cv=5)
        grid_search_custom.fit(X_combined, y)
        best_params_custom = grid_search_custom.best_params_
        best_score_custom = grid_search_custom.best_score_

        results[label] = {
            f'basic_accuracy_{label}': basic_accuracy,
            f'avg_cv_accuracy_{label}': avg_cv_accuracy,
            f'accuracy_os_{label}': accuracy_os,
            f'evaluation_metrics_logreg_{label}': evaluation_metrics_logreg,
            f'best_params_logreg_{label}': best_params_logreg,
            f'best_score_logreg_{label}': best_score_logreg,
            f'best_params_custom_{label}': best_params_custom,
            f'best_score_custom_{label}': best_score_custom
        }

    print(f"Finished processing column {column_name} without dummies.")
    return results

# Assuming you've loaded your data into a DataFrame named 'data':

# Call the functions for each of the desired columns without dummies
results_intro_cleaned = process_column_both_grams('Intro_Cleaned', data)
results_intro_1 = process_column_both_grams('Intro_1', data)
results_intro_2 = process_column_both_grams('Intro_2', data)
results_intro_3 = process_column_both_grams('Intro_3', data)

print("All processing completed without dummies.")

results_intro_cleaned, results_intro_1, results_intro_2, results_intro_3

# Call the functions for each of the desired columns with dummies
results_intro_cleaned_dummies = process_column_both_grams_with_dummies('Intro_Cleaned', data)
results_intro_1_dummies = process_column_both_grams_with_dummies('Intro_1', data)
results_intro_2_dummies = process_column_both_grams_with_dummies('Intro_2', data)
results_intro_3_dummies = process_column_both_grams_with_dummies('Intro_3', data)

print("All processing completed with dummies.")

results_intro_cleaned_dummies, results_intro_1_dummies, results_intro_2_dummies, results_intro_3_dummies

# RESULTS

In [17]:
def print_results(column_name, results_ug, results_bg, results_ug_dummies, results_bg_dummies):
    # Header for the column
    print(f"\nResults for {column_name}:")
    print("=" * 60)
    
    # Iterate through each variant (ug, bg, ug_dummies, bg_dummies)
    for variant, metrics in {"ug": results_ug, "bg": results_bg, "ug_dummies": results_ug_dummies, "bg_dummies": results_bg_dummies}.items():
        if "_dummies" in variant:
            model_type = "with Journal Dummies"
        else:
            model_type = "without Journal Dummies"
        
        ngram_type = "Unigram" if "ug" in variant else "Bigram"
        
        # Basic Information
        print(f"\nModel Performance Summary ({ngram_type}, {model_type}):")
        print("-" * 60)
        print(f"Basic Logistic Regression Accuracy: {metrics[f'basic_accuracy_{variant}'] * 100:.2f}%")
        print(f"Cross-Validation Accuracy: {metrics[f'avg_cv_accuracy_{variant}'] * 100:.2f}%")
        print(f"Oversampled Logistic Regression Accuracy: {metrics[f'accuracy_os_{variant}'] * 100:.2f}%")
        print(f"Best Hyperparameter Tuned Model Accuracy: {metrics[f'best_score_logreg_{variant}'] * 100:.2f}%")

    print("\n")
    
# Call the print_results function for each set of results
print_results('Intro_Cleaned', results_intro_cleaned['ug'], results_intro_cleaned['bg'], results_intro_cleaned_dummies['ug'], results_intro_cleaned_dummies['bg'])
print_results('intro_1', results_intro_1['ug'], results_intro_1['bg'], results_intro_1_dummies['ug'], results_intro_1_dummies['bg'])
print_results('Intro_2', results_intro_2['ug'], results_intro_2['bg'], results_intro_2_dummies['ug'], results_intro_2_dummies['bg'])
print_results('Intro_3', results_intro_3['ug'], results_intro_3['bg'], results_intro_3_dummies['ug'], results_intro_3_dummies['bg'])

NameError: name 'results_intro_cleaned_dummies' is not defined

# PLOTS

In [None]:
# Baseline Accuracy Calculation
baseline_accuracy = max(data_cleaned['Article_Gender'].value_counts(normalize=True))

# List of accuracies for each model type
labels = [
    "Unigram", 
    "Bigram", 
    "Unigram with Dummies", 
    "Bigram with Dummies",
    "Baseline"
]

def plot_accuracies(accuracies, title, ylabel):
    plt.figure(figsize=(15, 7))
    plt.bar(labels, accuracies, color=['blue', 'green', 'purple', 'cyan', 'red'], alpha=0.7)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.ylim([0.5, 1.0])
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# 1. Basic Accuracy
plot_accuracies([
    results['basic_accuracy_ug'], 
    results_bg['basic_accuracy_bg'],
    results_dum_ug['basic_accuracy_ug'], 
    results_dum_bg['basic_accuracy_bg'],
    baseline_accuracy
], "Comparison of Basic Model Accuracies", "Basic Accuracy")

# 2. Cross-Validation Accuracy
plot_accuracies([
    results['avg_cv_accuracy_ug'], 
    results_bg['avg_cv_accuracy_bg'],
    results_dum_ug['avg_cv_accuracy_ug'], 
    results_dum_bg['avg_cv_accuracy_bg'],
    baseline_accuracy
], "Comparison of CV Model Accuracies", "Cross-Validation Accuracy")

# 3. Oversampled Accuracy
plot_accuracies([
    results['accuracy_os_ug'], 
    results_bg['accuracy_os_bg'],
    results_dum_ug['accuracy_os_ug'], 
    results_dum_bg['accuracy_os_bg'],
    baseline_accuracy
], "Comparison of Oversampled Model Accuracies", "Oversampled Accuracy")

# 4. Best Hyperparameter Tuned Accuracy
plot_accuracies([
    results['best_score_logreg'], 
    results_bg['best_score_logreg_bg'],
    results_dum_ug['best_score_logreg'], 
    results_dum_bg['best_score_logreg_bg'],
    baseline_accuracy
], "Comparison of Best Hyperparameter Tuned Model Accuracies", "Best Tuned Accuracy")

In [38]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, make_scorer, precision_score
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack
import pandas as pd

def process_column_both_grams(column_name, data):
    print(f"Processing column {column_name} without dummies...")
    data_cleaned = data.dropna(subset=[column_name])
    y = data_cleaned['Article_Gender']

    results = {}

    for ngram, label in [((1, 1), "ug"), ((1, 2), "bg")]:
        print(f"Processing for ngram {ngram}...")
        tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=ngram)
        X = tfidf_vectorizer.fit_transform(data_cleaned[column_name])
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Oversampling to address class imbalance
        data_female = data_cleaned[data_cleaned['Article_Gender'] == 'Female']
        data_male = data_cleaned[data_cleaned['Article_Gender'] == 'Male']
        data_female_oversampled = resample(data_female, replace=True, n_samples=len(data_male), random_state=42)
        data_oversampled = pd.concat([data_male, data_female_oversampled])
        X_oversampled = tfidf_vectorizer.transform(data_oversampled[column_name])
        y_oversampled = data_oversampled['Article_Gender']
        X_train_os, X_test_os, y_train_os, y_test_os = train_test_split(X_oversampled, y_oversampled, test_size=0.2, random_state=42)

        # Step 10: Hyperparameter tuning for Logistic Regression
        print("Step 10: Hyperparameter tuning for Logistic Regression")
        logreg_params = {
            'C': [0.001, 0.01, 0.1, 1, 10, 100, 500, 1000],
            'penalty': ['l1', 'l2']
        }
        grid_search_logreg = GridSearchCV(LogisticRegression(max_iter=2000, class_weight='balanced', solver='saga'),
                                          logreg_params, 
                                          cv=5, 
                                          scoring='accuracy', 
                                          n_jobs=-1)
        grid_search_logreg.fit(X_train_os, y_train_os)
        best_params_logreg = grid_search_logreg.best_params_
        best_score_logreg = grid_search_logreg.best_score_

        # Step 11: Hyperparameter tuning optimized for predicting 'Female'
        print("Step 11: Hyperparameter tuning optimized for predicting 'Female'")
        custom_scorer = make_scorer(precision_score, pos_label='Female', zero_division=0)
        param_grid_custom = {
            'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 500, 1000],
            'penalty': ['l1', 'l2'],
            'max_iter': [50, 400, 800, 1200]
        }
        grid_search_custom = GridSearchCV(LogisticRegression(), param_grid_custom, scoring=custom_scorer, cv=5)
        grid_search_custom.fit(X, y)
        best_params_custom = grid_search_custom.best_params_
        best_score_custom = grid_search_custom.best_score_

        results[label] = {
            f'best_params_logreg_{label}': best_params_logreg,
            f'best_score_logreg_{label}': best_score_logreg,
            f'best_params_custom_{label}': best_params_custom,
            f'best_score_custom_{label}': best_score_custom
        }

    print(f"Finished processing column {column_name} without dummies.")
    return results

def process_column_both_grams_with_dummies(column_name, data):
    print(f"Processing column {column_name} with dummies...")
    data_cleaned = data.dropna(subset=[column_name])
    y = data_cleaned['Article_Gender']
    journal_dummies = pd.get_dummies(data_cleaned['Journal_name'], prefix='Journal')
    results = {}

    for ngram, label in [((1, 1), "ug"), ((1, 2), "bg")]:
        print(f"Processing for ngram {ngram}...")
        tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=ngram)
        X = tfidf_vectorizer.fit_transform(data_cleaned[column_name])
        X_combined = hstack([X, journal_dummies.values])
        X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

        # Oversampling to address class imbalance
        data_female = data_cleaned[data_cleaned['Article_Gender'] == 'Female']
        data_male = data_cleaned[data_cleaned['Article_Gender'] == 'Male']
        data_female_oversampled = resample(data_female, replace=True, n_samples=len(data_male), random_state=42)
        data_oversampled = pd.concat([data_male, data_female_oversampled])
        X_oversampled = tfidf_vectorizer.transform(data_oversampled[column_name])
        X_combined_oversampled = hstack([X_oversampled, journal_dummies.loc[data_oversampled.index].values])
        y_oversampled = data_oversampled['Article_Gender']
        X_train_os, X_test_os, y_train_os, y_test_os = train_test_split(X_combined_oversampled, y_oversampled, test_size=0.2, random_state=42)

        # Step 10: Hyperparameter tuning for Logistic Regression
        print("Step 10: Hyperparameter tuning for Logistic Regression...")
        logreg_params = {
            'C': [0.001, 0.01, 0.1, 1, 10, 100, 500, 1000],
            'penalty': ['l1', 'l2']
        }
        grid_search_logreg = GridSearchCV(LogisticRegression(max_iter=2000, class_weight='balanced', solver='saga'),
                                          logreg_params, 
                                          cv=5, 
                                          scoring='accuracy', 
                                          n_jobs=-1)
        grid_search_logreg.fit(X_train_os, y_train_os)
        best_params_logreg = grid_search_logreg.best_params_
        best_score_logreg = grid_search_logreg.best_score_

        # Step 11: Hyperparameter tuning optimized for predicting 'Female'
        print("Step 11: Hyperparameter tuning optimized for predicting 'Female'...")
        custom_scorer = make_scorer(precision_score, pos_label='Female', zero_division=0)
        param_grid_custom = {
            'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 500, 1000],
            'penalty': ['l1', 'l2'],
            'max_iter': [50, 400, 800, 1200]
        }
        grid_search_custom = GridSearchCV(LogisticRegression(), param_grid_custom, scoring=custom_scorer, cv=5)
        grid_search_custom.fit(X_combined, y)
        best_params_custom = grid_search_custom.best_params_
        best_score_custom = grid_search_custom.best_score_

        results[label] = {
            f'best_params_logreg_{label}': best_params_logreg,
            f'best_score_logreg_{label}': best_score_logreg,
            f'best_params_custom_{label}': best_params_custom,
            f'best_score_custom_{label}': best_score_custom
        }

    print(f"Finished processing column {column_name} with dummies.")
    return results


# Learning curve & Validation curve - ikke lykkes

In [None]:
import numpy as np
from sklearn.model_selection import learning_curve, validation_curve
import matplotlib.pyplot as plt

# List to hold datasets and corresponding labels
datasets = [
    (X_oversampled_ug, y_oversampled_ug, "Unigram"),
    (X_oversampled_bg, y_oversampled_bg, "Bigram"),
    (X_oversampled_dum_ug, y_oversampled_dum_ug, "Unigram with Dummies"),
    (X_oversampled_dum_bg, y_oversampled_dum_bg, "Bigram with Dummies")
]

# Iterate through each dataset and plot
for X_data, y_data, label in datasets:
    # Plot learning curve
    title = f"Learning Curve (Logistic Regression, {label})"
    plot_learning_curve(logreg_balanced, title, X_data, y_data, cv=5)
    plt.show()

    # Validation curve for the C parameter in logistic regression
    param_range = [0.001, 0.01, 0.1, 1, 10, 100, 500, 1000]
    train_scores, test_scores = validation_curve(
        LogisticRegression(max_iter=2000, class_weight='balanced', solver='saga'),
        X_data, y_data, param_name="C", param_range=param_range, cv=5, scoring="accuracy", n_jobs=-1)
    
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    
    plt.title(f"Validation Curve with Logistic Regression ({label})")
    plt.xlabel("C")
    plt.ylabel("Score")
    plt.ylim(0.5, 1.1)
    plt.semilogx(param_range, train_scores_mean, label="Training score", color="r")
    plt.fill_between(param_range, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.2, color="r")
    plt.semilogx(param_range, test_scores_mean, label="Cross-validation score", color="g")
    plt.fill_between(param_range, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.2, color="g")
    plt.legend(loc="best")
    plt.show()