In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

def read_binarry(file_path):
    import struct
    import math

    power = []
    age = []
    coordinate_x = []
    coordinate_y = []
    angle_tetta = []
    angle_phi = []
    energy = []
    time = []

    with open(file_path, 'rb') as binary_file:
        for i in range(100000):
            binary_file.read(4 * 5)

            tetta = struct.unpack('f', binary_file.read(4))[0]
            angle_tetta.append(tetta)

            phi = struct.unpack('f', binary_file.read(4))[0]
            angle_phi.append(phi)

            x0 = struct.unpack('f', binary_file.read(4))[0]
            coordinate_x.append(x0)

            y0 = struct.unpack('f', binary_file.read(4))[0]
            coordinate_y.append(y0)

            binary_file.read(4 * 5)

            power_eas = struct.unpack('f', binary_file.read(4))[0]
            power.append(math.log10(power_eas))

            age_eas = struct.unpack('f', binary_file.read(4))[0]
            age.append(age_eas)

            binary_file.read(4 * 1565)
            energy_release = struct.unpack('f' * 36, binary_file.read(4 * 36))
            energy.append(energy_release)

            binary_file.read(4)
            t = struct.unpack('f' * 144, binary_file.read(4 * 144))
            threshold_time = t[::4]
            time.append(threshold_time)

    # Собираем всё в DataFrame
    df = pd.DataFrame({
        'power': power,
        'age': age,
        'x': coordinate_x,
        'y': coordinate_y,
        'tetta': angle_tetta,
        'phi': angle_phi,
        'energy': energy,
        'threshold_time': time,
    })

    return df

def evaluate_regression(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"R²: {r2:.4f}")

In [2]:
#Заморозка 42 (52)
def set_seed(seed: int = 42) -> None:
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

set_seed(42)
random_state = 42

In [3]:
df = read_binarry("/content/drive/MyDrive/Nuclear IT Hack/spe27p_100k_2022_correct.dat")

new_columns = pd.DataFrame(
    df['energy'].tolist(),
    columns=[f'energy_{i}' for i in range(36)]
)

df = pd.concat([df, new_columns], axis=1)

new_columns = pd.DataFrame(
    df['threshold_time'].tolist(),
    columns=[f'threshold_time_{i}' for i in range(36)]
)

df = pd.concat([df, new_columns], axis=1)

df.drop(columns=["energy", "threshold_time"], inplace=True)

X = df.drop(columns=["power", "age", "x", "y", "tetta", "phi"])

In [4]:
!pip install lightgbm



In [5]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler

In [6]:
x_scaler = StandardScaler()
X = x_scaler.fit_transform(X)

In [7]:
y_power = df["power"]

In [8]:
X_train, X_test, y_power_train, y_power_test = train_test_split(X, y_power, test_size=0.3, random_state=random_state)
train_data = lgb.Dataset(X_train, label=y_power_train)
test_data = lgb.Dataset(X_test, label=y_power_test, reference=train_data)

In [9]:
params = {
    'objective': 'regression',
    'metric': 'mse',
    'boosting_type': 'gbdt',
    'num_leaves': 10,
    'learning_rate': 0.07,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1,
    'n_jobs': -1,
    'random_state': 42
}

model_lgb_power = lgb.train(
    params,
    train_data,
    num_boost_round=100,
    valid_sets=[train_data, test_data],
    valid_names=['train', 'valid']
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.061915 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18360
[LightGBM] [Info] Number of data points in the train set: 70000, number of used features: 72
[LightGBM] [Info] Start training from score 4.642989


In [10]:
y_power_preds = model_lgb_power.predict(X_test, num_iteration=model_lgb_power.best_iteration)
evaluate_regression(y_power_test, y_power_preds)

RMSE: 0.1202
MAE: 0.0932
R²: 0.9619


In [11]:
y = df["power"]

In [12]:
X_em = []
for i in range(100000):
  ser = X[i]
  row = [[ser[i] for i in range(36)], [ser[i] for i in range(36, 72)]]
  # for i in range(36):
  #   row.append([ser[i], ser[36+i]])
  X_em.append(row)
X_em = np.array(X_em)
X_em = X_em.astype(np.float32)

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
import numpy as np

from sklearn.model_selection import train_test_split

In [14]:
X_train, X_val, y_train, y_val = train_test_split(
    X_em, y, test_size=0.2, random_state=42
)

In [15]:
train_dataset = TensorDataset(
    torch.tensor(X_train.astype(np.float32)),
    torch.tensor(y_train.values.astype(np.float32))
)
val_dataset = TensorDataset(
    torch.tensor(X_val.astype(np.float32)),
    torch.tensor(y_val.values.astype(np.float32))
)
batch_size = 256
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

class TransformerRegressor(nn.Module):
    def __init__(self, num_features=2, d_model=36, nhead=9, num_layers=6, dropout=0.1):
        super().__init__()
        self.pos_encoder = nn.Parameter(torch.randn(1, num_features, d_model))
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dropout=dropout, dim_feedforward=4*d_model
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.pool = nn.AdaptiveAvgPool1d(1)
        self.regressor = nn.Sequential(
            nn.Linear(d_model, 32),
            nn.ReLU(),
            nn.Linear(32, 1))

    def forward(self, x):
        #x = x.unsqueeze(-1)  # [batch, num_features, 1]
        #x = self.embedding(x)  # [batch, num_features, d_model]
        x += self.pos_encoder
        x = x.permute(1, 0, 2)  # [num_features, batch, d_model]
        x = self.encoder(x)  # [num_features, batch, d_model]
        x = x.permute(1, 2, 0)  # [batch, d_model, num_features]
        x = self.pool(x).squeeze(-1)  # [batch, d_model]
        return self.regressor(x).squeeze(-1)  # [batch]

In [16]:
model = TransformerRegressor()



In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

In [18]:
print(device)

cuda


In [19]:
num_epochs = 9
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * X_batch.size(0)

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            val_loss += loss.item() * X_batch.size(0)

    train_loss /= len(train_loader.dataset)
    val_loss /= len(val_loader.dataset)
    print(f"Epoch {epoch+1}/{num_epochs}:")
    print(f"  Train MSE: {train_loss:.6f}, Val MSE: {val_loss:.6f}")
    print(f"  Train RMSE: {np.sqrt(train_loss):.6f}, Val RMSE: {np.sqrt(val_loss):.6f}")

Epoch 1/9:
  Train MSE: 1.033871, Val MSE: 0.061246
  Train RMSE: 1.016794, Val RMSE: 0.247480
Epoch 2/9:
  Train MSE: 0.045042, Val MSE: 0.029442
  Train RMSE: 0.212230, Val RMSE: 0.171587
Epoch 3/9:
  Train MSE: 0.033383, Val MSE: 0.039447
  Train RMSE: 0.182709, Val RMSE: 0.198614
Epoch 4/9:
  Train MSE: 0.030227, Val MSE: 0.042351
  Train RMSE: 0.173858, Val RMSE: 0.205793
Epoch 5/9:
  Train MSE: 0.027945, Val MSE: 0.023719
  Train RMSE: 0.167167, Val RMSE: 0.154010
Epoch 6/9:
  Train MSE: 0.026192, Val MSE: 0.022282
  Train RMSE: 0.161838, Val RMSE: 0.149273
Epoch 7/9:
  Train MSE: 0.024745, Val MSE: 0.020429
  Train RMSE: 0.157307, Val RMSE: 0.142929
Epoch 8/9:
  Train MSE: 0.023378, Val MSE: 0.020233
  Train RMSE: 0.152899, Val RMSE: 0.142242
Epoch 9/9:
  Train MSE: 0.023053, Val MSE: 0.021555
  Train RMSE: 0.151832, Val RMSE: 0.146817


In [20]:
def evaluate_regression(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f"MSE: {mse:.4f}")
    print(f"R²: {r2:.4f}")


def predict_data(model, data, device):
    model.eval()

    if isinstance(data, pd.DataFrame):
        data = data.values

    X_new_tensor = torch.tensor(data, dtype=torch.float32).to(device)

    with torch.no_grad():
        predictions = model(X_new_tensor)
        return predictions.cpu().numpy()


pred = predict_data(model, X_val, device)
evaluate_regression(pred, y_val)

MSE: 0.0216
R²: 0.9409


^ наверху статистика по power

In [21]:
y = df["x"]

X_em = []
for i in range(100000):
  ser = X[i]
  row = [[ser[i] for i in range(36)], [ser[i] for i in range(36, 72)]]
  # for i in range(36):
  #   row.append([ser[i], ser[36+i]])
  X_em.append(row)
X_em = np.array(X_em)
X_em = X_em.astype(np.float32)


X_train, X_val, y_train, y_val = train_test_split(
    X_em, y, test_size=0.2, random_state=42
)


train_dataset = TensorDataset(
    torch.tensor(X_train.astype(np.float32)),
    torch.tensor(y_train.values.astype(np.float32))
)
val_dataset = TensorDataset(
    torch.tensor(X_val.astype(np.float32)),
    torch.tensor(y_val.values.astype(np.float32))
)
batch_size = 256
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

model = TransformerRegressor()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

num_epochs = 9
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * X_batch.size(0)

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            val_loss += loss.item() * X_batch.size(0)

    train_loss /= len(train_loader.dataset)
    val_loss /= len(val_loader.dataset)
    print(f"Epoch {epoch+1}/{num_epochs}:")
    print(f"  Train MSE: {train_loss:.6f}, Val MSE: {val_loss:.6f}")
    print(f"  Train RMSE: {np.sqrt(train_loss):.6f}, Val RMSE: {np.sqrt(val_loss):.6f}")




Epoch 1/9:
  Train MSE: 252.652076, Val MSE: 145.087123
  Train RMSE: 15.895033, Val RMSE: 12.045212
Epoch 2/9:
  Train MSE: 149.638761, Val MSE: 128.982910
  Train RMSE: 12.232692, Val RMSE: 11.357064
Epoch 3/9:
  Train MSE: 129.084083, Val MSE: 130.753971
  Train RMSE: 11.361518, Val RMSE: 11.434770
Epoch 4/9:
  Train MSE: 120.208522, Val MSE: 115.709928
  Train RMSE: 10.963965, Val RMSE: 10.756855
Epoch 5/9:
  Train MSE: 112.444774, Val MSE: 108.889759
  Train RMSE: 10.603998, Val RMSE: 10.435026
Epoch 6/9:
  Train MSE: 111.688627, Val MSE: 121.822665
  Train RMSE: 10.568284, Val RMSE: 11.037331
Epoch 7/9:
  Train MSE: 107.704162, Val MSE: 112.231896
  Train RMSE: 10.378062, Val RMSE: 10.593956
Epoch 8/9:
  Train MSE: 105.194487, Val MSE: 99.226722
  Train RMSE: 10.256436, Val RMSE: 9.961261
Epoch 9/9:
  Train MSE: 103.412797, Val MSE: 116.155113
  Train RMSE: 10.169208, Val RMSE: 10.777528


In [22]:
def evaluate_regression(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f"MSE: {mse:.4f}")
    print(f"R²: {r2:.4f}")


def predict_data(model, data, device):
    model.eval()

    if isinstance(data, pd.DataFrame):
        data = data.values

    X_new_tensor = torch.tensor(data, dtype=torch.float32).to(device)

    with torch.no_grad():
        predictions = model(X_new_tensor)
        return predictions.cpu().numpy()


pred = predict_data(model, X_val, device)
evaluate_regression(pred, y_val)

MSE: 116.1551
R²: 0.7341


^ выше статистика по x

In [23]:
y = df["age"]

X_em = []
for i in range(100000):
  ser = X[i]
  row = [[ser[i] for i in range(36)], [ser[i] for i in range(36, 72)]]
  # for i in range(36):
  #   row.append([ser[i], ser[36+i]])
  X_em.append(row)
X_em = np.array(X_em)
X_em = X_em.astype(np.float32)


X_train, X_val, y_train, y_val = train_test_split(
    X_em, y, test_size=0.2, random_state=42
)


train_dataset = TensorDataset(
    torch.tensor(X_train.astype(np.float32)),
    torch.tensor(y_train.values.astype(np.float32))
)
val_dataset = TensorDataset(
    torch.tensor(X_val.astype(np.float32)),
    torch.tensor(y_val.values.astype(np.float32))
)
batch_size = 256
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

model = TransformerRegressor()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

num_epochs = 9
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * X_batch.size(0)

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            val_loss += loss.item() * X_batch.size(0)

    train_loss /= len(train_loader.dataset)
    val_loss /= len(val_loader.dataset)
    print(f"Epoch {epoch+1}/{num_epochs}:")
    print(f"  Train MSE: {train_loss:.6f}, Val MSE: {val_loss:.6f}")
    print(f"  Train RMSE: {np.sqrt(train_loss):.6f}, Val RMSE: {np.sqrt(val_loss):.6f}")




Epoch 1/9:
  Train MSE: 0.023531, Val MSE: 0.004754
  Train RMSE: 0.153397, Val RMSE: 0.068947
Epoch 2/9:
  Train MSE: 0.004190, Val MSE: 0.004387
  Train RMSE: 0.064730, Val RMSE: 0.066231
Epoch 3/9:
  Train MSE: 0.003901, Val MSE: 0.003732
  Train RMSE: 0.062462, Val RMSE: 0.061090
Epoch 4/9:
  Train MSE: 0.003371, Val MSE: 0.004032
  Train RMSE: 0.058058, Val RMSE: 0.063496
Epoch 5/9:
  Train MSE: 0.003223, Val MSE: 0.003246
  Train RMSE: 0.056770, Val RMSE: 0.056972
Epoch 6/9:
  Train MSE: 0.003158, Val MSE: 0.003667
  Train RMSE: 0.056198, Val RMSE: 0.060559
Epoch 7/9:
  Train MSE: 0.003113, Val MSE: 0.003298
  Train RMSE: 0.055798, Val RMSE: 0.057430
Epoch 8/9:
  Train MSE: 0.003096, Val MSE: 0.003452
  Train RMSE: 0.055641, Val RMSE: 0.058751
Epoch 9/9:
  Train MSE: 0.003081, Val MSE: 0.003190
  Train RMSE: 0.055504, Val RMSE: 0.056477


In [24]:
def evaluate_regression(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f"MSE: {mse:.4f}")
    print(f"R²: {r2:.4f}")


def predict_data(model, data, device):
    model.eval()

    if isinstance(data, pd.DataFrame):
        data = data.values

    X_new_tensor = torch.tensor(data, dtype=torch.float32).to(device)

    with torch.no_grad():
        predictions = model(X_new_tensor)
        return predictions.cpu().numpy()


pred = predict_data(model, X_val, device)
evaluate_regression(pred, y_val)

MSE: 0.0032
R²: -1.5698


In [24]:
# ^это не учитывать, просто эксперимент

In [25]:

# y = df["age"]

# # X_em = []
# # for i in range(100000):
# #   ser = X.loc[i]
# #   row = [[ser[i] for i in range(36)], [ser[i] for i in range(36, 72)]]
# #   # for i in range(36):
# #   #   row.append([ser[i], ser[36+i]])
# #   X_em.append(row)
# X_em = np.array(X_em)
# X_em = X_em.astype(np.float32)


# X_train, X_val, y_train, y_val = train_test_split(
#     X_em, y, test_size=0.2, random_state=42
# )


# train_dataset = TensorDataset(
#     torch.tensor(X_train.astype(np.float32)),
#     torch.tensor(y_train.values.astype(np.float32))
# )
# val_dataset = TensorDataset(
#     torch.tensor(X_val.astype(np.float32)),
#     torch.tensor(y_val.values.astype(np.float32))
# )
# batch_size = 256
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=batch_size)

# model = TransformerRegressor()

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = model.to(device)
# optimizer = optim.Adam(model.parameters(), lr=1e-3)
# criterion = nn.MSELoss()

# num_epochs = 9
# for epoch in range(num_epochs):
#     model.train()
#     train_loss = 0
#     for X_batch, y_batch in train_loader:
#         X_batch, y_batch = X_batch.to(device), y_batch.to(device)

#         optimizer.zero_grad()
#         outputs = model(X_batch)
#         loss = criterion(outputs, y_batch)
#         loss.backward()
#         optimizer.step()
#         print(outputs)
#         train_loss += loss.item() * X_batch.size(0)
#         #print(X_batch.size(0), loss.item())
#     model.eval()
#     val_loss = 0
#     with torch.no_grad():
#         for X_batch, y_batch in val_loader:
#             X_batch, y_batch = X_batch.to(device), y_batch.to(device)
#             outputs = model(X_batch)
#             loss = criterion(outputs, y_batch)
#             val_loss += loss.item() * X_batch.size(0)
#     print(len(train_loader.dataset), train_loss)
#     train_loss /= len(train_loader.dataset)
#     val_loss /= len(val_loader.dataset)
#     print(f"Epoch {epoch+1}/{num_epochs}:")
#     print(f"  Train MSE: {train_loss:.6f}, Val MSE: {val_loss:.6f}")
#     print(f"  Train RMSE: {np.sqrt(train_loss):.6f}, Val RMSE: {np.sqrt(val_loss):.6f}")

# def evaluate_regression(y_true, y_pred):
#     mse = mean_squared_error(y_true, y_pred)
#     r2 = r2_score(y_true, y_pred)

#     print(f"MSE: {mse:.4f}")
#     print(f"R²: {r2:.4f}")


# def predict_data(model, data, device):
#     model.eval()

#     if isinstance(data, pd.DataFrame):
#         data = data.values

#     X_new_tensor = torch.tensor(data, dtype=torch.float32).to(device)

#     with torch.no_grad():
#         predictions = model(X_new_tensor)
#         return predictions.cpu().numpy()


# pred = predict_data(model, X_val, device)
# evaluate_regression(pred, y_val)

In [26]:
import math
X = df.drop(columns=["power", "age", "x", "y", "tetta", "phi"])
y = df['age']
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.datasets import fetch_california_housing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
params = {
    'objective': 'regression',       # Тип задачи - регрессия
    'metric': 'rmse',                # Метрика качества (Root Mean Squared Error)
    'boosting_type': 'gbdt',         # Алгоритм градиентного бустинга
    'num_leaves': 31,                # Максимальное количество листьев в дереве
    'learning_rate': 0.05,           # Скорость обучения
    'feature_fraction': 0.9,         # Доля случайно выбираемых признаков на каждой итерации
    'bagging_fraction': 0.8,         # Доля данных для бутстрепа
    'bagging_freq': 6,               # Частота бэггинга
    'verbose': 1,                    # Отключение выводов
    'n_jobs': 1,                    # Использовать все ядра процессора
    'random_state': 42               # Для воспроизводимости
}
model = lgb.train(
    params,
    train_data,
    num_boost_round=100,            # Максимальное количество деревьев
    valid_sets=[train_data, test_data],
    valid_names=['train', 'valid']
)
y_preds = model.predict(X_test, num_iteration=model.best_iteration)
def evaluate_regression(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f"MSE: {mse:.4f}")
    print(f"R²: {r2:.4f}")

metrics = evaluate_regression(y_test, y_preds)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.061739 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18360
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 72
[LightGBM] [Info] Start training from score 1.367758
MSE: 0.0029
R²: 0.3065


^ выше статистики для age

^ выше статистики для x

In [27]:
import math
X = df.drop(columns=["power", "age", "x", "y", "tetta", "phi"])
y = df['y']
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.datasets import fetch_california_housing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
params = {
    'objective': 'regression',       # Тип задачи - регрессия
    'metric': 'rmse',                # Метрика качества (Root Mean Squared Error)
    'boosting_type': 'gbdt',         # Алгоритм градиентного бустинга
    'num_leaves': 31,                # Максимальное количество листьев в дереве
    'learning_rate': 0.05,           # Скорость обучения
    'feature_fraction': 0.9,         # Доля случайно выбираемых признаков на каждой итерации
    'bagging_fraction': 0.8,         # Доля данных для бутстрепа
    'bagging_freq': 6,               # Частота бэггинга
    'verbose': 1,                    # Отключение выводов
    'n_jobs': 1,                    # Использовать все ядра процессора
    'random_state': 42               # Для воспроизводимости
}
model = lgb.train(
    params,
    train_data,
    num_boost_round=100,            # Максимальное количество деревьев
    valid_sets=[train_data, test_data],
    valid_names=['train', 'valid']
)
y_preds = model.predict(X_test, num_iteration=model.best_iteration)
def evaluate_regression(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f"MSE: {mse:.4f}")
    print(f"R²: {r2:.4f}")

metrics = evaluate_regression(y_test, y_preds)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.064339 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18360
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 72
[LightGBM] [Info] Start training from score -2.953094
MSE: 158.4119
R²: 0.8966


^ выше статистики для y

In [28]:
import math
X = df.drop(columns=["power", "age", "x", "y", "tetta", "phi"])
y = (df['tetta']*math.pi/180).agg('cos') #pd.concat([prep1,prep2], axis=1)
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.datasets import fetch_california_housing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
params = {
    'objective': 'regression',       # Тип задачи - регрессия
    'metric': 'rmse',                # Метрика качества (Root Mean Squared Error)
    'boosting_type': 'gbdt',         # Алгоритм градиентного бустинга
    'num_leaves': 31,                # Максимальное количество листьев в дереве
    'learning_rate': 0.05,           # Скорость обучения
    'feature_fraction': 0.9,         # Доля случайно выбираемых признаков на каждой итерации
    'bagging_fraction': 0.8,         # Доля данных для бутстрепа
    'bagging_freq': 6,               # Частота бэггинга
    'verbose': 1,                    # Отключение выводов
    'n_jobs': 1,                    # Использовать все ядра процессора
    'random_state': 42               # Для воспроизводимости
}
model = lgb.train(
    params,
    train_data,
    num_boost_round=100,            # Максимальное количество деревьев
    valid_sets=[train_data, test_data],
    valid_names=['train', 'valid']
)
y_preds = model.predict(X_test, num_iteration=model.best_iteration)
def evaluate_regression(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f"MSE: {mse:.4f}")
    print(f"R²: {r2:.4f}")

metrics = evaluate_regression(y_test, y_preds)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.072219 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18360
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 72
[LightGBM] [Info] Start training from score 0.821163
MSE: 0.0045
R²: 0.5818


^ выше статистика по tetta

In [29]:
import math
X = df.drop(columns=["power", "age", "x", "y", "tetta", "phi"])
y = (df['phi']*math.pi/180).agg('cos') #pd.concat([prep1,prep2], axis=1)
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.datasets import fetch_california_housing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
params = {
    'objective': 'regression',       # Тип задачи - регрессия
    'metric': 'rmse',                # Метрика качества (Root Mean Squared Error)
    'boosting_type': 'gbdt',         # Алгоритм градиентного бустинга
    'num_leaves': 31,                # Максимальное количество листьев в дереве
    'learning_rate': 0.05,           # Скорость обучения
    'feature_fraction': 0.9,         # Доля случайно выбираемых признаков на каждой итерации
    'bagging_fraction': 0.8,         # Доля данных для бутстрепа
    'bagging_freq': 6,               # Частота бэггинга
    'verbose': 1,                    # Отключение выводов
    'n_jobs': 1,                    # Использовать все ядра процессора
    'random_state': 42               # Для воспроизводимости
}
model = lgb.train(
    params,
    train_data,
    num_boost_round=100,            # Максимальное количество деревьев
    valid_sets=[train_data, test_data],
    valid_names=['train', 'valid']
)
y_preds = model.predict(X_test, num_iteration=model.best_iteration)
def evaluate_regression(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f"MSE: {mse:.4f}")
    print(f"R²: {r2:.4f}")

metrics = evaluate_regression(y_test, y_preds)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.063898 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18360
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 72
[LightGBM] [Info] Start training from score -0.001162
MSE: 0.3648
R²: 0.2739


^ выше статистика по phi

In [30]:
# ^ выше статистика для phi