In [None]:
import logging
import sys
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from typing import Tuple, Any, Dict

# Root logger configuration: let INFO-and-above records through.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Install a stdout handler only when the root logger has no handlers at all;
# if any handler is already attached, the existing ones are left as they are.
if not logger.handlers:
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    ch = logging.StreamHandler(sys.stdout)
    ch.setFormatter(formatter)
    ch.setLevel(logging.INFO)
    logger.addHandler(ch)

def load_and_clean_dataset(file_path: str) -> pd.DataFrame:
    """Load the gender-classification CSV and return a cleaned DataFrame.

    The first line of the file is treated as a header and replaced by the
    column names ``text`` and ``gender``. Rows missing either value are
    dropped, ``gender`` is stripped and upper-cased, only ``M``/``F`` rows are
    kept, and a numeric ``label`` column is added (``M`` -> 0, ``F`` -> 1).

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        DataFrame with columns ``text``, ``gender`` and ``label``.

    Raises:
        Exception: Any error raised while reading or cleaning is logged with
            its traceback and then re-raised unchanged.
    """
    try:
        df = pd.read_csv(file_path, header=0, names=['text', 'gender'])
        # Take explicit copies before assigning columns: frames produced by
        # dropna / boolean filtering may be flagged as derived from another
        # frame, and assigning into them can emit SettingWithCopyWarning.
        df = df.dropna(subset=['text', 'gender']).copy()
        df['gender'] = df['gender'].astype(str).str.strip().str.upper()
        df = df[df['gender'].isin(['M', 'F'])].copy()
        df['label'] = df['gender'].map({'M': 0, 'F': 1})
        logging.info(f"Dataset loaded and cleaned. Total instances: {df.shape[0]}")
        return df
    except Exception:
        # logging.exception records the active traceback at ERROR level.
        logging.exception("Error loading or cleaning the dataset")
        # Bare raise re-raises the active exception as-is.
        raise

def split_dataset(df: pd.DataFrame, test_size: float = 0.2, random_state: int = 42) -> Tuple:
    """Split ``df`` into train and test parts, stratified on its ``label`` column.

    Args:
        df: Frame to split; it must contain a ``label`` column.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        The result of ``train_test_split`` for the single input ``df``.
    """
    split_options = {
        "test_size": test_size,
        "stratify": df['label'],
        "random_state": random_state,
    }
    return train_test_split(df, **split_options)

def build_vectorizers(max_features: int = 5000) -> Dict[str, Any]:
    """Create the bag-of-words and TF-IDF vectorizers with shared settings.

    Both vectorizers receive the same ``max_features`` value and the same
    token pattern, which matches runs of ASCII letters between word
    boundaries.

    Args:
        max_features: Passed to both vectorizers as ``max_features``.

    Returns:
        Dict with key ``"bow"`` (a ``CountVectorizer``) and key ``"tfidf"``
        (a ``TfidfVectorizer``), both unfitted.
    """
    shared_kwargs = {
        "max_features": max_features,
        "token_pattern": r"(?u)\b[a-zA-Z]+\b",
    }
    return {
        "bow": CountVectorizer(**shared_kwargs),
        "tfidf": TfidfVectorizer(**shared_kwargs),
    }

def vectorize_data(vectorizer: Any, texts_train: pd.Series, texts_test: pd.Series) -> Tuple:
    """Fit ``vectorizer`` on the training texts and transform both text sets.

    Args:
        vectorizer: Object exposing ``fit_transform`` and ``transform``.
        texts_train: Texts passed to ``fit_transform``.
        texts_test: Texts passed to ``transform`` after fitting.

    Returns:
        ``(train_features, test_features)`` as returned by the vectorizer.
    """
    # Tuple elements are evaluated left to right, so fitting on the training
    # texts happens before the test texts are transformed.
    return vectorizer.fit_transform(texts_train), vectorizer.transform(texts_test)

def evaluate_model(model, X_test, y_test, model_name: str) -> None:
    """Log the accuracy and per-class report of ``model`` on the test data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix passed to ``model.predict``.
        y_test: True labels, encoded as 0 for 'M' and 1 for 'F'.
        model_name: Prefix used in the emitted log messages.
    """
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    # Pin the label order explicitly: it ties 'M'/'F' to 0/1 regardless of
    # which classes actually occur, and avoids the ValueError that
    # classification_report raises when target_names has two entries but only
    # one class is present in y_test and y_pred.
    report = classification_report(
        y_test, y_pred, labels=[0, 1], target_names=['M', 'F']
    )
    logging.info(f"{model_name} Accuracy: {acc:.4f}")
    logging.info(f"{model_name} Classification Report:\n{report}")
    sys.stdout.flush()  # Ensure output is flushed

def _log_vectorizer_summary(label: str, vectorizer: Any, X_train: Any) -> None:
    """Log vocabulary size, the first 20 feature names and the first train vector."""
    vocab = vectorizer.get_feature_names_out()
    logging.info(f"{label} vocabulary size: {len(vocab)} tokens")
    logging.info(f"Sample {label} features (first 20 tokens): {vocab[:20]}")
    logging.info(f"Sample {label} vector (first train instance): {X_train[0].toarray()}")


def _train_and_evaluate(models: Dict[str, Any], X_train, y_train, X_test, y_test) -> None:
    """Fit each model on the training data and log its test-set evaluation."""
    for name, model in models.items():
        logging.info(f"Training {name}...")
        model.fit(X_train, y_train)
        evaluate_model(model, X_test, y_test, name)


def main(file_path: str = '/data/raw/gender-classification.csv') -> None:
    """Run the full pipeline: load, split, vectorize, train and evaluate.

    Steps, in order: load and clean the dataset, split it into train/test
    parts, build BoW and TF-IDF features (logging a summary of each), then
    train and evaluate the TF-IDF models followed by the BoW models.

    Args:
        file_path: CSV file to load. Defaults to the path that was previously
            hard-coded, so calling ``main()`` behaves as before.
    """
    df = load_and_clean_dataset(file_path)

    df_train, df_test = split_dataset(df)
    y_train = df_train['label']
    y_test = df_test['label']

    vectorizers = build_vectorizers(max_features=5000)

    # BoW Vectorization
    X_train_bow, X_test_bow = vectorize_data(vectorizers["bow"], df_train['text'], df_test['text'])
    _log_vectorizer_summary("BoW", vectorizers["bow"], X_train_bow)

    # TF-IDF Vectorization
    X_train_tfidf, X_test_tfidf = vectorize_data(vectorizers["tfidf"], df_train['text'], df_test['text'])
    _log_vectorizer_summary("TF-IDF", vectorizers["tfidf"], X_train_tfidf)

    # Models on TF-IDF features
    models_tfidf = {
        "Logistic Regression (TF-IDF)": LogisticRegression(max_iter=1000),
        "Random Forest (TF-IDF)": RandomForestClassifier(n_estimators=100, random_state=42),
        "Linear SVM (TF-IDF)": LinearSVC(max_iter=10000),
        "Gradient Boosting (TF-IDF)": GradientBoostingClassifier(random_state=42),
    }
    _train_and_evaluate(models_tfidf, X_train_tfidf, y_train, X_test_tfidf, y_test)

    # Models on BoW features
    models_bow = {
        "Logistic Regression (BoW)": LogisticRegression(max_iter=1000),
        "Random Forest (BoW)": RandomForestClassifier(n_estimators=100, random_state=42),
        "Multinomial Naive Bayes (BoW)": MultinomialNB(),
    }
    _train_and_evaluate(models_bow, X_train_bow, y_train, X_test_bow, y_test)

# Run the pipeline only when this file is executed as a script; importing it
# as a module defines the functions without calling main().
if __name__ == "__main__":
    main()

INFO:root:Dataset loaded and cleaned. Total instances: 3226
INFO:root:BoW vocabulary size: 5000 tokens
INFO:root:Sample BoW features (first 20 tokens): ['a' 'ability' 'able' 'about' 'above' 'abroad' 'absolute' 'absolutely'
 'abt' 'abuse' 'academic' 'academy' 'accept' 'acceptance' 'accepted'
 'accepting' 'access' 'accessories' 'accident' 'accommodate']
INFO:root:Sample BoW vector (first train instance): [[8 0 0 ... 0 0 0]]
INFO:root:TF-IDF vocabulary size: 5000 tokens
INFO:root:Sample TF-IDF features (first 20 tokens): ['a' 'ability' 'able' 'about' 'above' 'abroad' 'absolute' 'absolutely'
 'abt' 'abuse' 'academic' 'academy' 'accept' 'acceptance' 'accepted'
 'accepting' 'access' 'accessories' 'accident' 'accommodate']
INFO:root:Sample TF-IDF vector (first train instance): [[0.15224339 0.         0.         ... 0.         0.         0.        ]]
INFO:root:Training Logistic Regression (TF-IDF)...
INFO:root:Logistic Regression (TF-IDF) Accuracy: 0.7121
INFO:root:Logistic Regression (TF-IDF)