In [None]:
from datasets import load_dataset
# Load the first 10% of the "train" split, using the "metadata" configuration
# of the OpenThoughts-114k dataset.
dataset = load_dataset(
    "open-thoughts/OpenThoughts-114k",
    "metadata",
    split="train[:10%]",
)

# Columns used by the rest of the notebook: the problem statements and the
# DeepSeek reasoning traces that go with them.
problems, Reasoning = dataset["problem"], dataset["deepseek_reasoning"]

In [None]:
from transformers import AutoTokenizer
# Tokenizer checkpoint; its token counts are what the notebook later uses as
# the "reasoning length" of each trace.
TOKENIZER_NAME = "deepseek-ai/DeepSeek-R1"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

In [None]:
from transformers import AutoTokenizer
import torch
def compute_token_lengths(texts, tokenizer, max_length=16384):
    """Count the tokens in each text, capped at ``max_length``.

    Args:
        texts: Iterable of strings to tokenize.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that
            accepts a list of strings plus ``padding``/``truncation``/
            ``max_length`` keyword arguments and returns a mapping whose
            ``"input_ids"`` entry holds one id sequence per text.
        max_length: Truncation limit in tokens. Texts longer than this are
            reported as exactly ``max_length``. Defaults to 16384.

    Returns:
        A 1-D ``torch.long`` tensor with one token count per input text.
    """
    texts = list(texts)
    if not texts:
        # Nothing to tokenize: skip the tokenizer call entirely and return an
        # empty result with the same dtype and rank as the normal path.
        return torch.zeros(0, dtype=torch.long)

    # Tokenize WITHOUT padding. Padding every sequence to the batch maximum
    # just to sum the attention mask materialises a (len(texts) x longest)
    # tensor; the unpadded id lists yield the same counts directly.
    encodings = tokenizer(
        texts,
        padding=False,
        truncation=True,
        max_length=max_length,
    )
    lengths = [len(ids) for ids in encodings["input_ids"]]
    return torch.tensor(lengths, dtype=torch.long)

# Token count of every reasoning trace, measured with the tokenizer loaded above.
reasoning_l = compute_token_lengths(texts=Reasoning, tokenizer=tokenizer)

In [None]:
# Imported in this cell because the embedding model is created here; without
# it a fresh kernel (Restart & Run All) hits a NameError on SentenceTransformer.
from sentence_transformers import SentenceTransformer

# Sanity check: expect one token count per reasoning trace.
print(reasoning_l.shape)

# Use the GPU when one is available, otherwise fall back to the CPU instead of
# failing on machines without CUDA.
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer("sentence-transformers/LaBSE", device=embedding_device)

In [None]:
from sentence_transformers import SentenceTransformer
from pl_tools.tools import compress
# Embed every problem statement with the sentence-embedding model, convert the
# result to a torch tensor, and compress it with pl_tools' `compress` (d=16).
problems_embeddings = torch.tensor(model.encode(problems))
compressed_eb = compress(problems_embeddings, d=16)

In [None]:

import json

def _build_record(problem_text, embedding_vector, token_count):
    """Bundle one problem with its compressed embedding and length features."""
    return {
        "problem": problem_text,
        "embedding": embedding_vector,
        "reasoning_length": token_count,
        "problem_length": len(problem_text),
    }


# Convert to plain Python lists so that the values are JSON-serialisable.
compressed_eb_list = compressed_eb.tolist()
r_l_list = reasoning_l.tolist()

# One record per problem; zip pairs the three sequences positionally.
data = [
    _build_record(*row)
    for row in zip(problems, compressed_eb_list, r_l_list)
]

# Persist the records; ensure_ascii=False keeps non-ASCII text readable.
with open("problems_with_embeddings.json", "w", encoding="utf-8") as out_file:
    json.dump(data, out_file, indent=2, ensure_ascii=False)

In [None]:

import numpy as np
# len() of each problem statement, then reshaped into a single-column 2-D array
# so it can be used as a feature column.
problem_length = list(map(len, problems))
X_length = np.array(problem_length)[:, np.newaxis]

In [None]:

import torch
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import joblib  

# --- Features and target -----------------------------------------------------
# Embedding features: move a torch tensor to host memory as a NumPy array,
# otherwise use the object as-is.
if isinstance(compressed_eb, torch.Tensor):
    X_embed = compressed_eb.cpu().numpy()
else:
    X_embed = compressed_eb

# Problem length as a single feature column.
X_length = np.array(problem_length).reshape(-1, 1)

# Length column first, then the embedding columns.
# presumably [n_samples, 17]: 1 length column + 16 compressed dims (TODO confirm)
X = np.hstack([X_length, X_embed])

# Target: token count of each reasoning trace.
y = np.array(reasoning_l)

# --- Train / validation split ------------------------------------------------
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=32,
)

# --- Model -------------------------------------------------------------------
rf_params = {
    "n_estimators": 100,
    "max_depth": 20,
    "min_samples_leaf": 5,
    "min_samples_split": 10,
    "max_features": "sqrt",
    "random_state": 42,
}
regressor = RandomForestRegressor(**rf_params)
regressor.fit(X_train, y_train)

# --- Validation error --------------------------------------------------------
val_preds = regressor.predict(X_val)
val_mse = np.mean(np.square(val_preds - y_val))
print(f"valid MSE：{val_mse:.2f}")

# --- Persist the fitted model ------------------------------------------------
model_path = "random_forest_regressor.pkl"
joblib.dump(regressor, model_path)
print("the model is saved to random_forest_regressor.pkl")

In [None]:
import matplotlib.pyplot as plt
# Predicted vs. actual reasoning length on the training split.
train_pred = regressor.predict(X_train)

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_train, train_pred, alpha=0.5)
# Label the axes so the figure can be read on its own: x is the ground truth,
# y is the model's prediction.
ax.set(
    title="Training split: predicted vs. actual reasoning length",
    xlabel="Actual reasoning length (tokens)",
    ylabel="Predicted reasoning length (tokens)",
)
ax.grid(True)
plt.show()

In [None]:

import matplotlib.pyplot as plt
# Predicted vs. actual reasoning length on the validation split.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_val, val_preds, alpha=0.5)
# Label the axes so the figure can be read on its own: x is the ground truth,
# y is the model's prediction.
ax.set(
    title="Validation split: predicted vs. actual reasoning length",
    xlabel="Actual reasoning length (tokens)",
    ylabel="Predicted reasoning length (tokens)",
)
ax.grid(True)
plt.show()

In [None]:

import json
import pandas as pd
# Reload the exported records from disk and summarise how the reasoning
# lengths are distributed.
with open("problems_with_embeddings.json", "r", encoding="utf-8") as records_file:
    data = json.load(records_file)

reasoning_lengths = [record["reasoning_length"] for record in data]
df = pd.DataFrame({"reasoning_length": reasoning_lengths})

# Last expression of the cell, so the summary table is displayed.
df.describe()

In [None]:
# Reasoning-length values at the 20th, 40th, 60th and 80th percentiles.
quantile_levels = [0.2, 0.4, 0.6, 0.8]
quantiles = df["reasoning_length"].quantile(quantile_levels)
print(quantiles)