In [6]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import RFE

# Load the dataset from './data.csv'. The free-text column is 'name' and the
# target column is 'n_label'.
data = pd.read_csv('./data.csv')

# Whitespace-delimited token count of each 'name' entry, stored on the frame.
data['num_words'] = data['name'].map(lambda text: len(text.split()))

# Target and raw text feature
y = data['n_label']  # assumed to exist and to be binary -- TODO confirm against the CSV
X = data['name']

# Logistic regression estimator shared by the feature-selection and
# evaluation code (fixed seed for reproducibility)
log_reg = LogisticRegression(random_state=42)

# Turn the text into a TF-IDF matrix.
# NOTE(review): the vectorizer is fitted on every row here, i.e. before any
# train/test split is made on X_tfidf.
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X)

# Feature selection using RFE
def feature_selection_rfe(X, y, n_features_to_select=10):
    """Return the RFE support mask for ``X`` with respect to ``y``.

    Recursive feature elimination is run with the module-level ``log_reg``
    as the ranking estimator, dropping one feature per iteration (``step=1``)
    until ``n_features_to_select`` remain.

    Args:
        X: Feature matrix passed to ``RFE.fit``.
        y: Target values passed to ``RFE.fit``.
        n_features_to_select: Number of features to keep.

    Returns:
        The fitted selector's ``support_`` attribute (the mask of kept
        features).
    """
    rfe = RFE(estimator=log_reg, n_features_to_select=n_features_to_select, step=1)
    rfe.fit(X, y)
    return rfe.support_

# Function to perform manual cross-validation with feature selection and print results
def manual_cross_validation_with_fs(X, y, test_size, n_splits=5, n_features_to_select=10):
    """Score RFE + logistic regression over repeated random hold-out splits.

    Each of the ``n_splits`` rounds draws a fresh ``train_test_split`` seeded
    with ``42 + i`` (so results are reproducible), selects features with
    ``feature_selection_rfe`` on the training part only, refits the
    module-level ``log_reg`` on the selected training columns and scores its
    predictions on the held-out part.

    The rounds are independent random splits, not disjoint k-fold partitions,
    so the same row may appear in several test sets.

    Args:
        X: Feature matrix supporting ``X[:, mask]`` column indexing.
        y: Target labels aligned with the rows of ``X``.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        n_splits: Number of random splits to evaluate.
        n_features_to_select: Number of features RFE keeps in each split.

    Returns:
        The 8-tuple ``(accuracy_scores, mean_accuracy, precision_scores,
        mean_precision, recall_scores, mean_recall, f1_scores, mean_f1)``.
        Each ``*_scores`` item is a list with one value per split and each
        ``mean_*`` item is ``np.mean`` of that list.
    """
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    for i in range(n_splits):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42 + i
        )

        # Select features on the training part only, then apply the same
        # column mask to both parts.
        selected_features = feature_selection_rfe(X_train, y_train, n_features_to_select)
        X_train_selected = X_train[:, selected_features]
        X_test_selected = X_test[:, selected_features]

        log_reg.fit(X_train_selected, y_train)
        y_pred = log_reg.predict(X_test_selected)

        # zero_division=0 yields the same 0.0 the default ("warn") setting
        # produces when a split has no predicted or no true positives, but
        # without emitting an UndefinedMetricWarning for every such split.
        accuracy_scores.append(accuracy_score(y_test, y_pred))
        precision_scores.append(precision_score(y_test, y_pred, zero_division=0))
        recall_scores.append(recall_score(y_test, y_pred, zero_division=0))
        f1_scores.append(f1_score(y_test, y_pred, zero_division=0))

    return (
        accuracy_scores, np.mean(accuracy_scores),
        precision_scores, np.mean(precision_scores),
        recall_scores, np.mean(recall_scores),
        f1_scores, np.mean(f1_scores),
    )

# Evaluate the feature-selection workflow at several test-set ratios and
# print the per-split and mean value of every metric.
ratios = [0.2, 0.3, 0.4]  # Corresponding to 80-20, 70-30, and 60-40 splits
n_features_to_select = 2  # Number of features to select
for ratio in ratios:
    # round() instead of int(): a product such as 0.29 * 100 evaluates to
    # 28.999999999999996, which int() would truncate to 28. Deriving the
    # training share from the rounded test share keeps the label summing to 100.
    test_pct = round(ratio * 100)
    split_label = f"{100 - test_pct}-{test_pct}"

    (
        accuracy_scores, mean_accuracy,
        precision_scores, mean_precision,
        recall_scores, mean_recall,
        f1_scores, mean_f1,
    ) = manual_cross_validation_with_fs(
        X_tfidf, y, test_size=ratio, n_features_to_select=n_features_to_select
    )

    report = (
        ("Accuracy", accuracy_scores, mean_accuracy),
        ("Precision", precision_scores, mean_precision),
        ("Recall", recall_scores, mean_recall),
        ("F1", f1_scores, mean_f1),
    )
    for metric_name, scores, mean_score in report:
        print(f"Cross-Validation {metric_name} Scores for {split_label} Split with Feature Selection: {scores}")
        print(f"Mean {metric_name} Score: {mean_score}")
    # Blank line between the reports of consecutive ratios
    print()


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Cross-Validation Accuracy Scores for 80-20 Split with Feature Selection: [0.0, 0.6666666666666666, 0.3333333333333333, 0.0, 0.3333333333333333]
Mean Accuracy Score: 0.26666666666666666
Cross-Validation Precision Scores for 80-20 Split with Feature Selection: [0.0, 0.0, 0.3333333333333333, 0.0, 0.0]
Mean Precision Score: 0.06666666666666667
Cross-Validation Recall Scores for 80-20 Split with Feature Selection: [0.0, 0.0, 1.0, 0.0, 0.0]
Mean Recall Score: 0.2
Cross-Validation F1 Scores for 80-20 Split with Feature Selection: [0.0, 0.0, 0.5, 0.0, 0.0]
Mean F1 Score: 0.1



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Cross-Validation Accuracy Scores for 70-30 Split with Feature Selection: [0.25, 0.25, 0.25, 0.25, 0.5]
Mean Accuracy Score: 0.3
Cross-Validation Precision Scores for 70-30 Split with Feature Selection: [0.0, 0.25, 0.25, 0.25, 0.0]
Mean Precision Score: 0.15
Cross-Validation Recall Scores for 70-30 Split with Feature Selection: [0.0, 1.0, 1.0, 1.0, 0.0]
Mean Recall Score: 0.6
Cross-Validation F1 Scores for 70-30 Split with Feature Selection: [0.0, 0.4, 0.4, 0.4, 0.0]
Mean F1 Score: 0.24000000000000005



  _warn_prf(average, modifier, msg_start, len(result))


Cross-Validation Accuracy Scores for 60-40 Split with Feature Selection: [0.4, 0.6, 0.4, 0.4, 0.2]
Mean Accuracy Score: 0.39999999999999997
Cross-Validation Precision Scores for 60-40 Split with Feature Selection: [0.0, 0.0, 0.4, 0.4, 0.25]
Mean Precision Score: 0.21000000000000002
Cross-Validation Recall Scores for 60-40 Split with Feature Selection: [0.0, 0.0, 1.0, 1.0, 0.5]
Mean Recall Score: 0.5
Cross-Validation F1 Scores for 60-40 Split with Feature Selection: [0.0, 0.0, 0.5714285714285715, 0.5714285714285715, 0.3333333333333333]
Mean F1 Score: 0.29523809523809524

