In [1]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Load the dataset.
# Rows with a missing 'name' are dropped up front: a NaN has no .split() and
# TfidfVectorizer rejects NaN documents, so either step would otherwise crash.
# Hypothetical target: the number of whitespace-separated words in each name.
data = (
    pd.read_csv('./STTs.csv')
    .dropna(subset=['name'])
    .reset_index(drop=True)
    .assign(num_words=lambda frame: frame['name'].str.split().str.len())
)

# Features and target variable
X = data['name']
y = data['num_words']

# Convert text data to numerical data using TF-IDF.
# NOTE(review): the vectorizer is fitted on every row before any train/test
# split is made, so its vocabulary and IDF weights also reflect rows that later
# land in a test set. For a strictly leakage-free evaluation, fit it on the
# training rows of each split instead.
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X)

# Initialize the linear regression model
lr = LinearRegression()

# Feature selection using RFE
def feature_selection_rfe(X, y, n_features_to_select=10):
    """Return the RFE support mask computed from ``X`` and ``y``.

    Recursive feature elimination is run around the module-level ``lr``
    estimator with ``step=1`` (one feature removed per iteration) until
    ``n_features_to_select`` features remain.

    Parameters
    ----------
    X
        Feature matrix, handed unchanged to ``RFE.fit``.
    y
        Target values, handed unchanged to ``RFE.fit``.
    n_features_to_select : int, default 10
        Number of features RFE should keep.

    Returns
    -------
    The fitted selector's ``support_`` attribute, i.e. the mask of the
    columns of ``X`` that were kept.
    """
    eliminator = RFE(lr, n_features_to_select=n_features_to_select, step=1)
    return eliminator.fit(X, y).support_

# Function to perform manual cross-validation with feature selection and print results
def manual_cross_validation_with_fs(X, y, test_size, n_splits=5, n_features_to_select=10, random_state=42):
    """Score linear regression with RFE over repeated random train/test splits.

    In each of ``n_splits`` rounds the data is re-split with
    ``train_test_split``, features are selected with ``feature_selection_rfe``
    on the training part only, and an unfitted copy of the module-level ``lr``
    estimator is fitted on the selected training columns and evaluated on the
    same columns of the test part.

    Parameters
    ----------
    X
        2-D feature matrix; must support ``X[:, mask]`` column selection.
    y
        Target values aligned with the rows of ``X``.
    test_size
        Forwarded to ``train_test_split``.
    n_splits : int, default 5
        Number of random splits to evaluate.
    n_features_to_select : int, default 10
        Number of features kept by RFE in every round.
    random_state : int, default 42
        Base seed; round ``i`` (0-based) splits with ``random_state + i``.

    Returns
    -------
    tuple
        ``(mse_scores, mean_mse, r2_scores, mean_r2)`` where the two lists
        hold one value per split and the means are taken over those lists.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1 (the means would be undefined).
    """
    if n_splits < 1:
        raise ValueError(f"n_splits must be at least 1, got {n_splits}")

    scores = []
    r2_scores = []
    for i in range(n_splits):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state + i
        )

        # Select features on the training part only, then apply the same
        # column mask to the test part.
        selected_features = feature_selection_rfe(X_train, y_train, n_features_to_select)
        X_train_selected = X_train[:, selected_features]
        X_test_selected = X_test[:, selected_features]

        # Fit an unfitted copy so the shared module-level `lr` is not left
        # fitted on the last split's reduced feature set as a side effect.
        model = clone(lr)
        model.fit(X_train_selected, y_train)
        y_pred = model.predict(X_test_selected)

        scores.append(mean_squared_error(y_test, y_pred))
        r2_scores.append(r2_score(y_test, y_pred))

    return scores, np.mean(scores), r2_scores, np.mean(r2_scores)

# Perform manual cross-validation for different ratios with feature selection
ratios = [0.2, 0.3, 0.4]  # Corresponding to 80-20, 70-30, and 60-40 splits
n_features_to_select = 10  # Number of features to select
for ratio in ratios:
    scores, mean_mse, r2_scores, mean_r2 = manual_cross_validation_with_fs(
        X_tfidf, y, test_size=ratio, n_features_to_select=n_features_to_select
    )
    # round() rather than int(): a product such as 0.29 * 100 evaluates to
    # 28.999999999999996 in binary floating point, which int() truncates to 28.
    # The train share is derived as the complement so the label always sums to 100.
    test_pct = round(ratio * 100)
    train_pct = 100 - test_pct
    split_label = f"{train_pct}-{test_pct} Split with Feature Selection"
    print(f"Cross-Validation MSE Scores for {split_label}: {scores}")
    print(f"Mean MSE Score: {mean_mse}")
    print(f"Cross-Validation R^2 Scores for {split_label}: {r2_scores}")
    print(f"Mean R^2 Score: {mean_r2}\n")


Cross-Validation MSE Scores for 80-20 Split with Feature Selection: [2.543121561717309, 3.2463843643402703, 4.653185486957414, 1.0902886513171965, 1.855277827986706]
Mean MSE Score: 2.677651578463779
Cross-Validation R^2 Scores for 80-20 Split with Feature Selection: [-0.021333960529039686, -0.9795026611830919, -3.847068215580639, 0.38051781175159294, 0.171750969648792]
Mean R^2 Score: -0.859127211178477

Cross-Validation MSE Scores for 70-30 Split with Feature Selection: [2.481156907259591, 2.3545479703418772, 2.650207449587659, 2.7795636813523674, 1.8748019685754242]
Mean MSE Score: 2.428055595423384
Cross-Validation R^2 Scores for 70-30 Split with Feature Selection: [-0.02245476947510605, -0.3941402455971641, -0.18312832570877635, -0.0022465197184016183, 0.2130029049823311]
Mean R^2 Score: -0.0777933911034234

Cross-Validation MSE Scores for 60-40 Split with Feature Selection: [1.3154311670631667, 2.1615360934906285, 2.0902102792862047, 2.7857615880027073, 1.741409553410783]
Mean MS