In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from nltk.corpus import stopwords
from textblob import TextBlob
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from nltk.stem import PorterStemmer
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from scipy.sparse import hstack
import matplotlib.pyplot as plt
from sklearn.model_selection import validation_curve
from sklearn.model_selection import learning_curve
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from scipy.sparse import hstack
import pandas as pd

In [2]:
# Read the cleaned dataset from disk into a pandas DataFrame named `df`
df = pd.read_csv(filepath_or_buffer='data_cleaned.csv')

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample

def _rf_feature_matrix(rows, column_name, vectorizer, journal_dummies=None):
    """Build the sparse feature matrix for ``rows``.

    Transforms ``rows[column_name]`` with the already-fitted ``vectorizer`` and,
    when ``journal_dummies`` is given, appends the dummy columns belonging to the
    same rows (looked up by index label, so repeated rows are handled).
    """
    X = vectorizer.transform(rows[column_name])
    if journal_dummies is not None:
        X = hstack([X, journal_dummies.loc[rows.index].to_numpy()], format='csr')
    return X


def process_column_rf(column_name, data, with_dummies=False):
    """Train and evaluate Random Forest classifiers of 'Article_Gender' on one text column.

    For each n-gram setting (unigrams, and unigrams + bigrams) the function:
    fits a TF-IDF vectorizer on the training rows, fits a Random Forest on the
    class-balanced training rows, scores it on the held-out test rows, ranks
    feature importances, and runs a grid search over Random Forest
    hyperparameters.

    The train/test split is done on rows *before* vectorizer fitting and
    oversampling, so no test row (or a resampled copy of one) is seen during
    training.

    Parameters
    ----------
    column_name : str
        Name of the text column in ``data`` to vectorize. Rows where it is
        missing are dropped.
    data : pandas.DataFrame
        Must contain ``column_name`` and 'Article_Gender'; must also contain
        'Journal' when ``with_dummies`` is True. Only rows labelled 'Female' or
        'Male' are used.
    with_dummies : bool, default False
        If True, one-hot encoded 'Journal' columns are appended to the TF-IDF
        features used for training, prediction and feature importance.

    Returns
    -------
    dict
        Keyed by "ug" / "bg" (suffixed with "_dummies" when ``with_dummies``).
        Each value is a dict with keys 'accuracy', 'evaluation_metrics_rf',
        'confusion_rf', 'important_features', 'best_params_rf' and
        'best_score_rf'.
    """
    class_labels = ['Female', 'Male']

    # Step 1: Data Preprocessing
    # Keep only rows with text and one of the two modelled labels; a fresh
    # unique index lets the dummy columns be looked up per row later on.
    data_cleaned = data.dropna(subset=[column_name])
    data_cleaned = data_cleaned[data_cleaned['Article_Gender'].isin(class_labels)]
    data_cleaned = data_cleaned.reset_index(drop=True)

    # Create journal dummies if required (cast to float so they stack cleanly
    # with the float TF-IDF matrix)
    journal_dummies = None
    if with_dummies:
        journal_dummies = pd.get_dummies(data_cleaned['Journal'], prefix='Journal').astype(float)

    # Step 2: Train-Test Split on rows. This does not depend on the n-gram
    # setting, so it is done once; stratifying keeps the class ratio in both parts.
    train_rows, test_rows = train_test_split(
        data_cleaned,
        test_size=0.2,
        random_state=42,
        stratify=data_cleaned['Article_Gender'],
    )
    y_test = test_rows['Article_Gender']

    # Step 3: Addressing Class Imbalance -- on the training rows only, so that
    # resampled copies can never end up in the test set.
    train_female = train_rows[train_rows['Article_Gender'] == 'Female']
    train_male = train_rows[train_rows['Article_Gender'] == 'Male']
    train_female_oversampled = resample(train_female, replace=True, n_samples=len(train_male), random_state=42)
    train_oversampled = pd.concat([train_male, train_female_oversampled])
    y_train_os = train_oversampled['Article_Gender']

    results = {}  # Dictionary to store results

    for ngram, label in [((1, 1), "ug"), ((1, 2), "bg")]:
        # Step 4: Vectorization -- vocabulary and IDF weights come from the
        # (non-resampled) training rows only.
        tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=ngram)
        tfidf_vectorizer.fit(train_rows[column_name])

        X_train_os = _rf_feature_matrix(train_oversampled, column_name, tfidf_vectorizer, journal_dummies)
        X_test = _rf_feature_matrix(test_rows, column_name, tfidf_vectorizer, journal_dummies)

        # Step 5: Building Random Forest Model with OOB Score
        rf = RandomForestClassifier(class_weight='balanced', n_estimators=100, oob_score=True, random_state=42)
        rf.fit(X_train_os, y_train_os)

        # Step 6: Predictions on the untouched test rows
        y_pred = rf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # Step 7: Performance Evaluation (explicit labels pin the row/column order)
        evaluation_metrics_rf = classification_report(
            y_test, y_pred, labels=class_labels, target_names=class_labels
        )
        confusion_rf = confusion_matrix(y_test, y_pred, labels=class_labels)

        # Step 8: Feature Importance -- names must line up with the stacked
        # matrix columns: TF-IDF terms first, then the journal dummies.
        feature_names = list(tfidf_vectorizer.get_feature_names_out())
        if with_dummies:
            feature_names.extend(journal_dummies.columns)
        important_features = sorted(zip(rf.feature_importances_, feature_names), reverse=True)[:20]

        # Step 9: Hyperparameter Tuning
        # NOTE(review): the CV folds are drawn from the oversampled training
        # rows, so copies of a row can fall in different folds and
        # best_score_rf is likely optimistic; the held-out accuracy above is
        # the leakage-free figure.
        rf_params = {
            'n_estimators': [50, 100, 150],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
        grid_search_rf = GridSearchCV(RandomForestClassifier(class_weight='balanced', oob_score=True, random_state=42),
                                      rf_params,
                                      cv=5,
                                      scoring='accuracy',
                                      n_jobs=-1)
        grid_search_rf.fit(X_train_os, y_train_os)
        best_params_rf = grid_search_rf.best_params_
        best_score_rf = grid_search_rf.best_score_

        # Outputs
        results[label + ("_dummies" if with_dummies else "")] = {
            'accuracy': accuracy,
            'evaluation_metrics_rf': evaluation_metrics_rf,
            'confusion_rf': confusion_rf,
            'important_features': important_features,
            'best_params_rf': best_params_rf,
            'best_score_rf': best_score_rf
        }

    return results

# The CSV is loaded into `df`; no variable named `data` is defined in this
# notebook, so passing `data` raises NameError on a fresh kernel. Use `df`.

# Call the functions for each of the desired columns without dummies
results_intro_cleaned = process_column_rf('Intro_Cleaned', df)
results_intro_1 = process_column_rf('Intro_1', df)
results_intro_2 = process_column_rf('Intro_2', df)
results_intro_3 = process_column_rf('Intro_3', df)

# Call the functions for each of the desired columns with dummies
results_intro_cleaned_dummies = process_column_rf('Intro_Cleaned', df, with_dummies=True)
results_intro_1_dummies = process_column_rf('Intro_1', df, with_dummies=True)
results_intro_2_dummies = process_column_rf('Intro_2', df, with_dummies=True)
results_intro_3_dummies = process_column_rf('Intro_3', df, with_dummies=True)

In [None]:
def print_rf_results(column_name, results):
    """Print a plain-text summary of Random Forest results for one column.

    ``results`` maps a variant key (such as "ug", "bg", "ug_dummies" or
    "bg_dummies") to a metrics dict holding 'accuracy', 'best_score_rf',
    'best_params_rf', 'important_features', 'evaluation_metrics_rf' and
    'confusion_rf'. One section is printed per variant, in dict order.
    """
    # Banner identifying the column being reported
    print(f"\nResults for {column_name}:")
    print("=" * 60)

    for variant in results:
        metrics = results[variant]

        # Decode the variant key into the two human-readable labels
        model_type = "with Journal Dummies" if "_dummies" in variant else "without Journal Dummies"
        ngram_type = "Bigram" if "ug" not in variant else "Unigram"

        # Headline numbers, then the detailed report and confusion matrix
        print(f"\nModel Performance Summary (Random Forest, {ngram_type}, {model_type}):")
        print("-" * 60)
        print(f"Random Forest Accuracy: {metrics['accuracy'] * 100:.2f}%")
        print(f"Best Score with Optimal Hyperparameters: {metrics['best_score_rf'] * 100:.2f}%")
        print(f"Best Hyperparameters: {metrics['best_params_rf']}")
        print(f"Top 5 Important Features: {metrics['important_features'][:5]}")
        print(f"\nClassification Report:\n{metrics['evaluation_metrics_rf']}")
        print(f"Confusion Matrix:\n{metrics['confusion_rf']}")

    # Trailing blank lines separate consecutive column reports
    print("\n")
    
# Print every result set in a fixed order: plain runs first, then the runs with journal dummies
for report_title, report_results in [
    ('Intro_Cleaned', results_intro_cleaned),
    ('Intro_1', results_intro_1),
    ('Intro_2', results_intro_2),
    ('Intro_3', results_intro_3),
    ('Intro_Cleaned with Dummies', results_intro_cleaned_dummies),
    ('Intro_1 with Dummies', results_intro_1_dummies),
    ('Intro_2 with Dummies', results_intro_2_dummies),
    ('Intro_3 with Dummies', results_intro_3_dummies),
]:
    print_rf_results(report_title, report_results)