In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder

# Load the dataset
data = pd.read_csv('./STTs.csv')

# Hypothetical target: Predicting the number of words in a sentence
data['num_words'] = data['name'].apply(lambda x: len(x.split()))

# Multivariate features: Length of the name, number of words, and a one-hot encoded categorical feature
data['name_length'] = data['name'].apply(len)

# One-Hot Encoding for the categorical feature 'category'
if 'name' in data.columns:
    encoder = OneHotEncoder()
    X_encoded = encoder.fit_transform(data[['name']])
    X_encoded_df = pd.DataFrame(X_encoded.toarray(), columns=encoder.get_feature_names_out(['name']))
    data = pd.concat([data, X_encoded_df], axis=1)

# Features and target variable
feature_columns = ['name_length', 'num_words'] + list(X_encoded_df.columns) if 'category' in data.columns else ['name_length', 'num_words']
X_multi = data[feature_columns]
y = data['num_words']

# Split the dataset
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(X_multi, y, test_size=0.2, random_state=42)

# Train the model
model_multi = LinearRegression()
model_multi.fit(X_train_multi, y_train_multi)

# Predictions
y_pred_multi = model_multi.predict(X_test_multi)

# Evaluation
print("Multivariate Linear Regression")
print("Mean Squared Error:", mean_squared_error(y_test_multi, y_pred_multi))
print("R^2 Score:", r2_score(y_test_multi, y_pred_multi))
print()


Multivariate Linear Regression
Mean Squared Error: 9.466330862652141e-31
R^2 Score: 1.0

