In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
from scipy.sparse import hstack

# Toy regression pipeline: predict how many whitespace-separated words a
# sentence in the 'name' column has, from TF-IDF features of that same text
# (plus an optional one-hot encoded 'category' column when the CSV has one).

# Load the dataset
data = pd.read_csv('./STTs.csv')

# Missing names are read by pandas as float NaN, which has no .split();
# treat them as empty sentences (zero words) instead of crashing.
data['name'] = data['name'].fillna('').astype(str)

# Hypothetical target: number of whitespace-separated words in each sentence
data['num_words'] = data['name'].str.split().str.len()

# Features and target variable
X = data['name']
y = data['num_words']

# Convert text data to numerical data using TF-IDF.
# NOTE(review): the vectorizer is fitted on all rows before the split, so IDF
# weights are influenced by the test rows. Acceptable for this toy example;
# fit on the training split only for a real evaluation.
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X)

# One-hot encode the separate categorical feature 'category' if it exists.
# The text column 'name' must not be encoded here: that would create one
# indicator column per distinct sentence, which only memorises training rows.
if 'category' in data.columns:
    encoder = OneHotEncoder()
    X_encoded = encoder.fit_transform(data[['category']])
    # Ask for CSR explicitly so the stacked matrix supports row indexing.
    X_tfidf = hstack([X_tfidf, X_encoded], format='csr')

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42
)

# Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R^2 Score:", r2_score(y_test, y_pred))


  (0, 81)	0.39672746902696165
  (0, 8)	0.3090343756632427
  (0, 78)	0.4386920380922144
  (0, 108)	0.32498856437321416
  (0, 64)	0.4386920380922144
  (0, 167)	0.4386920380922144
  (0, 163)	0.2532496597194666
  (1, 44)	0.3749314446352231
  (1, 117)	0.41459049960744265
  (1, 158)	0.8291809992148853
  (2, 62)	0.5
  (2, 56)	0.5
  (2, 142)	0.5
  (2, 116)	0.5
  (3, 39)	0.38279299166704817
  (3, 177)	0.38279299166704817
  (3, 166)	0.2575974214408444
  (3, 170)	0.38279299166704817
  (3, 110)	0.38279299166704817
  (3, 66)	0.34617563474768254
  (3, 70)	0.2575974214408444
  (3, 71)	0.4016561576316989
  (4, 130)	0.4836369221971747
  (4, 59)	0.4836369221971747
  (4, 176)	0.2537347235807036
  :	:
  (45, 114)	0.43407319094842456
  (45, 118)	0.40675411633349645
  (46, 93)	0.45564754717149986
  (46, 113)	0.45564754717149986
  (46, 14)	0.4120610415995156
  (46, 120)	0.4120610415995156
  (46, 166)	0.30662430031977966
  (46, 70)	0.30662430031977966
  (46, 71)	0.23905040977134376
  (47, 80)	0.49998400308122