In [1]:
from pprint import pprint
from pathlib import Path
import numpy as np  # для чисел и вычислений
import pandas as pd  # для таблиц (как Excel в Python)
import matplotlib.pyplot as plt  # для графиков
import seaborn as sns  # для красивых графиков

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

from clearing import DatasetCleaner  # soft link


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler


In [3]:
# Project root; every CSV and the model checkpoint below live under its data/ subfolder.
# NOTE(review): absolute, machine-specific path — adjust before running on another machine.
MAIN_FOLDER = "/home/arman/it/AI_work/machine/melting_point"

In [4]:
# Read the input files from the project's data folder
data_dir = f"{MAIN_FOLDER}/data"
train_data = pd.read_csv(f"{data_dir}/train.csv")  # training data
test_data = pd.read_csv(f"{data_dir}/test.csv")  # test data
sample_sub = pd.read_csv(f"{data_dir}/sample_submission.csv")  # submission template

# Quick look at what was loaded: shapes, first rows, leading column names
train_shape, test_shape = train_data.shape, test_data.shape
print(f"Размер обучающих данных: {train_shape}")
print(f"Размер тестовых данных: {test_shape}")
print("Первые 3 строки обучающих данных:")
print(train_data.head(3))
print("Колонки в данных:")
leading_columns = train_data.columns[:10].tolist()  # first 10 columns only
print(leading_columns, "...")

Размер обучающих данных: (2662, 427)
Размер тестовых данных: (666, 426)
Первые 3 строки обучающих данных:
     id                       SMILES      Tm  Group 1  Group 2  Group 3  \
0  2175        FC1=C(F)C(F)(F)C1(F)F  213.15        0        0        0   
1  1222  c1ccc2c(c1)ccc3Nc4ccccc4c23  407.15        0        0        0   
2  2994          CCN1C(C)=Nc2ccccc12  324.15        2        1        0   

   Group 4  Group 5  Group 6  Group 7  ...  Group 415  Group 416  Group 417  \
0        0        0        0        0  ...          0          0          0   
1        0        0        0        0  ...          0          0          0   
2        0        0        0        0  ...          0          0          0   

   Group 418  Group 419  Group 420  Group 421  Group 422  Group 423  Group 424  
0          0          0          0          0          0          0          0  
1          0          0          0          0          0          0          0  
2          0          0          

In [5]:
# Wrap the training frame in the project's DatasetCleaner helper (imported from `clearing`).
train_cleaner = DatasetCleaner(train_data)

In [6]:
# Build the list of columns to drop: SMILES plus every column whose second and
# third values returned by count_missing_and_zeros are equal.
# NOTE(review): judging by the printed tuples these look like
# (missing count, zero count, row count), i.e. the column is entirely zero —
# confirm against DatasetCleaner.
cols_to_remove = ["SMILES"]

for column_name in train_cleaner.current_df.columns.to_list():
    stats = train_cleaner.count_missing_and_zeros(column=column_name)
    zero_count, row_count = stats[1], stats[2]
    if zero_count == row_count:
        cols_to_remove.append(column_name)
        print("DELETE", column_name, stats)
# NOTE(review): the original remark here read "no nulls" — presumably meaning
# no missing values were found; verify.

DELETE Group 12 (np.int64(0), np.int64(2662), 2662)
DELETE Group 28 (np.int64(0), np.int64(2662), 2662)
DELETE Group 46 (np.int64(0), np.int64(2662), 2662)
DELETE Group 67 (np.int64(0), np.int64(2662), 2662)
DELETE Group 73 (np.int64(0), np.int64(2662), 2662)
DELETE Group 74 (np.int64(0), np.int64(2662), 2662)
DELETE Group 75 (np.int64(0), np.int64(2662), 2662)
DELETE Group 84 (np.int64(0), np.int64(2662), 2662)
DELETE Group 85 (np.int64(0), np.int64(2662), 2662)
DELETE Group 88 (np.int64(0), np.int64(2662), 2662)
DELETE Group 90 (np.int64(0), np.int64(2662), 2662)
DELETE Group 101 (np.int64(0), np.int64(2662), 2662)
DELETE Group 102 (np.int64(0), np.int64(2662), 2662)
DELETE Group 104 (np.int64(0), np.int64(2662), 2662)
DELETE Group 150 (np.int64(0), np.int64(2662), 2662)
DELETE Group 152 (np.int64(0), np.int64(2662), 2662)
DELETE Group 155 (np.int64(0), np.int64(2662), 2662)
DELETE Group 158 (np.int64(0), np.int64(2662), 2662)
DELETE Group 160 (np.int64(0), np.int64(2662), 2662)
DELE

In [7]:
# Show the full list of columns collected for removal.
print(cols_to_remove)

['SMILES', 'Group 12', 'Group 28', 'Group 46', 'Group 67', 'Group 73', 'Group 74', 'Group 75', 'Group 84', 'Group 85', 'Group 88', 'Group 90', 'Group 101', 'Group 102', 'Group 104', 'Group 150', 'Group 152', 'Group 155', 'Group 158', 'Group 160', 'Group 167', 'Group 183', 'Group 194', 'Group 198', 'Group 206', 'Group 207', 'Group 208', 'Group 209', 'Group 212', 'Group 213', 'Group 214', 'Group 215', 'Group 216', 'Group 217', 'Group 218', 'Group 245', 'Group 247', 'Group 248', 'Group 250', 'Group 252', 'Group 253', 'Group 264', 'Group 280', 'Group 281', 'Group 282', 'Group 285', 'Group 294', 'Group 303', 'Group 306', 'Group 307', 'Group 308', 'Group 309', 'Group 312', 'Group 313', 'Group 316', 'Group 317', 'Group 340', 'Group 342', 'Group 345', 'Group 347', 'Group 348', 'Group 349', 'Group 350', 'Group 352', 'Group 355', 'Group 356', 'Group 357', 'Group 358', 'Group 360', 'Group 363', 'Group 371', 'Group 376', 'Group 377', 'Group 383', 'Group 384', 'Group 385', 'Group 390', 'Group 397',

In [8]:
# Ask the cleaner for duplicates; the displayed result is an empty table.
train_cleaner.find_duplicates()
# no duplicates

Unnamed: 0,id,SMILES,Tm,Group 1,Group 2,Group 3,Group 4,Group 5,Group 6,Group 7,...,Group 415,Group 416,Group 417,Group 418,Group 419,Group 420,Group 421,Group 422,Group 423,Group 424


In [9]:
# Drop each collected column from the cleaner's working frame, one at a time
for column_name in cols_to_remove:
    train_cleaner.remove_row_or_column(column=column_name)

# Preview the frame after the removals
train_cleaner.current_df.head()

Unnamed: 0,id,Tm,Group 1,Group 2,Group 3,Group 4,Group 5,Group 6,Group 7,Group 8,...,Group 406,Group 407,Group 408,Group 409,Group 410,Group 412,Group 414,Group 415,Group 416,Group 418
0,2175,213.15,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1222,407.15,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,2994,324.15,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1704,351.15,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2526,126.15,2,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
def df_algorithm(df, columns=None):
    """Drop unwanted columns from ``df`` using ``DatasetCleaner``.

    Args:
        df: Frame to clean; it is handed to ``DatasetCleaner``.
        columns: Names of the columns to remove. When omitted, the
            notebook-level ``cols_to_remove`` list is used, so the original
            call form ``df_algorithm(df)`` keeps working unchanged.

    Returns:
        The cleaner's ``current_df`` after all removals.
    """
    # Resolve the default lazily so the function no longer depends on the
    # global list when the caller supplies its own column names.
    if columns is None:
        columns = cols_to_remove
    df_cleaner = DatasetCleaner(df)
    for col in columns:
        df_cleaner.remove_row_or_column(column=col)
    return df_cleaner.current_df

In [11]:
# Report how many columns were collected for removal, then write the cleaner's
# data to prepared_train.csv in the data folder.
print(len(cols_to_remove), cols_to_remove)
train_cleaner.save_to_csv(f"{MAIN_FOLDER}/data/prepared_train.csv")

88 ['SMILES', 'Group 12', 'Group 28', 'Group 46', 'Group 67', 'Group 73', 'Group 74', 'Group 75', 'Group 84', 'Group 85', 'Group 88', 'Group 90', 'Group 101', 'Group 102', 'Group 104', 'Group 150', 'Group 152', 'Group 155', 'Group 158', 'Group 160', 'Group 167', 'Group 183', 'Group 194', 'Group 198', 'Group 206', 'Group 207', 'Group 208', 'Group 209', 'Group 212', 'Group 213', 'Group 214', 'Group 215', 'Group 216', 'Group 217', 'Group 218', 'Group 245', 'Group 247', 'Group 248', 'Group 250', 'Group 252', 'Group 253', 'Group 264', 'Group 280', 'Group 281', 'Group 282', 'Group 285', 'Group 294', 'Group 303', 'Group 306', 'Group 307', 'Group 308', 'Group 309', 'Group 312', 'Group 313', 'Group 316', 'Group 317', 'Group 340', 'Group 342', 'Group 345', 'Group 347', 'Group 348', 'Group 349', 'Group 350', 'Group 352', 'Group 355', 'Group 356', 'Group 357', 'Group 358', 'Group 360', 'Group 363', 'Group 371', 'Group 376', 'Group 377', 'Group 383', 'Group 384', 'Group 385', 'Group 390', 'Group 39

In [12]:
# Rebind train_data to the prepared CSV saved earlier and re-read the test CSV.
train_data = pd.read_csv(f"{MAIN_FOLDER}/data/prepared_train.csv")  # training data
test_data = pd.read_csv(f"{MAIN_FOLDER}/data/test.csv")  # test data


In [13]:
# Prepare features and target.
# The feature list is built in DataFrame column order. Deriving it from a set
# made the column order depend on string hashing, which varies between
# interpreter runs and therefore made the model inputs non-reproducible.
excluded_columns = set(cols_to_remove) | {"id", "Tm"}
features = [col for col in train_data.columns if col not in excluded_columns]
X = train_data[features].values
y = train_data['Tm'].values

# Split into train and validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features: statistics are fitted on the training split only and
# then reused for validation and test
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Prepare test data with the same columns, in the same order
X_test = test_data[features].values
X_test = scaler.transform(X_test)

In [14]:
# List the selected feature columns in sorted (lexicographic) order.
print(sorted(features))

['Group 1', 'Group 10', 'Group 100', 'Group 103', 'Group 105', 'Group 106', 'Group 107', 'Group 108', 'Group 109', 'Group 11', 'Group 110', 'Group 111', 'Group 112', 'Group 113', 'Group 114', 'Group 115', 'Group 116', 'Group 117', 'Group 118', 'Group 119', 'Group 120', 'Group 121', 'Group 122', 'Group 123', 'Group 124', 'Group 125', 'Group 126', 'Group 127', 'Group 128', 'Group 129', 'Group 13', 'Group 130', 'Group 131', 'Group 132', 'Group 133', 'Group 134', 'Group 135', 'Group 136', 'Group 137', 'Group 138', 'Group 139', 'Group 14', 'Group 140', 'Group 141', 'Group 142', 'Group 143', 'Group 144', 'Group 145', 'Group 146', 'Group 147', 'Group 148', 'Group 149', 'Group 15', 'Group 151', 'Group 153', 'Group 154', 'Group 156', 'Group 157', 'Group 159', 'Group 16', 'Group 161', 'Group 162', 'Group 163', 'Group 164', 'Group 165', 'Group 166', 'Group 168', 'Group 169', 'Group 17', 'Group 170', 'Group 171', 'Group 172', 'Group 173', 'Group 174', 'Group 175', 'Group 176', 'Group 177', 'Group 

In [15]:
# Define Dataset
class MeltingPointDataset(Dataset):
    """Dataset holding features as float32 tensors, with optional targets.

    When ``y`` is given, indexing yields ``(features, target)`` pairs;
    without it, indexing yields the features alone.
    """

    def __init__(self, X, y=None):
        self.X = torch.tensor(X, dtype=torch.float32)
        if y is None:
            self.y = None
        else:
            self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        sample = self.X[idx]
        if self.y is None:
            return sample
        return sample, self.y[idx]

# Create datasets and dataloaders
batch_size = 32  # shared by all three loaders

train_dataset = MeltingPointDataset(X_train, y_train)
val_dataset = MeltingPointDataset(X_val, y_val)
test_dataset = MeltingPointDataset(X_test)  # no targets for the test set

# Only the training loader shuffles; validation and test keep row order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [16]:
# Define Neural Network
class MeltingPointNet(nn.Module):
    """Feed-forward regressor with a single output value per sample.

    Three hidden stages (256, 128 and 64 units), each applying
    Linear -> BatchNorm1d -> ReLU -> Dropout(0.2), followed by a final
    Linear layer mapping 64 units to 1 output.
    """

    def __init__(self, input_size):
        super().__init__()
        # Attribute names determine the state_dict keys — keep them stable.
        self.fc1 = nn.Linear(input_size, 256)
        self.bn1 = nn.BatchNorm1d(256)
        self.fc2 = nn.Linear(256, 128)
        self.bn2 = nn.BatchNorm1d(128)
        self.fc3 = nn.Linear(128, 64)
        self.bn3 = nn.BatchNorm1d(64)
        self.fc4 = nn.Linear(64, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        hidden = x
        stages = (
            (self.fc1, self.bn1),
            (self.fc2, self.bn2),
            (self.fc3, self.bn3),
        )
        for linear, norm in stages:
            hidden = self.dropout(self.relu(norm(linear(hidden))))
        return self.fc4(hidden)

# Initialize model, loss, optimizer
# Seed torch's global RNG first so that weight initialisation, batch shuffling
# and dropout masks repeat between runs (same value as the split's random_state).
torch.manual_seed(42)
input_size = X_train.shape[1]  # number of feature columns
model = MeltingPointNet(input_size)
criterion = nn.L1Loss()  # MAE loss
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Halve the learning rate once the monitored value has stopped improving for 5 epochs
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

In [17]:
# Training loop
num_epochs = 30
best_val_mae = float('inf')

for epoch in range(num_epochs):
    # --- training pass ---
    model.train()
    train_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        # squeeze(1) removes only the output-feature axis, (batch, 1) -> (batch,),
        # so predictions always match the target shape; a bare squeeze() would
        # also collapse the batch axis when a batch holds a single sample
        loss = criterion(outputs.squeeze(1), y_batch)
        loss.backward()
        optimizer.step()
        # Weight the batch mean by batch size to get a per-sample average below
        train_loss += loss.item() * len(X_batch)
    train_loss /= len(train_loader.dataset)

    # --- validation pass (no gradients, eval-mode BatchNorm/Dropout) ---
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            outputs = model(X_batch)
            loss = criterion(outputs.squeeze(1), y_batch)
            val_loss += loss.item() * len(X_batch)
    val_loss /= len(val_loader.dataset)

    # The scheduler monitors the validation MAE
    scheduler.step(val_loss)

    # Checkpoint the weights whenever the validation MAE improves
    if val_loss < best_val_mae:
        best_val_mae = val_loss
        torch.save(model.state_dict(), f'{MAIN_FOLDER}/data/best_model.pth')

    print(f'Epoch {epoch+1}/{num_epochs}, Train MAE: {train_loss:.4f}, Val MAE: {val_loss:.4f}')



Epoch 1/30, Train MAE: 276.7826, Val MAE: 278.3929
Epoch 2/30, Train MAE: 274.4177, Val MAE: 275.4831
Epoch 3/30, Train MAE: 270.9923, Val MAE: 270.7485
Epoch 4/30, Train MAE: 265.8868, Val MAE: 263.4540
Epoch 5/30, Train MAE: 258.9938, Val MAE: 255.8448
Epoch 6/30, Train MAE: 250.4903, Val MAE: 245.9916
Epoch 7/30, Train MAE: 240.4913, Val MAE: 233.1602
Epoch 8/30, Train MAE: 228.5797, Val MAE: 222.2787
Epoch 9/30, Train MAE: 215.1889, Val MAE: 206.1343
Epoch 10/30, Train MAE: 200.4415, Val MAE: 189.9939
Epoch 11/30, Train MAE: 184.0193, Val MAE: 174.5872
Epoch 12/30, Train MAE: 166.7946, Val MAE: 157.1128
Epoch 13/30, Train MAE: 148.9651, Val MAE: 141.1446
Epoch 14/30, Train MAE: 130.3388, Val MAE: 125.4504
Epoch 15/30, Train MAE: 113.1321, Val MAE: 102.6895
Epoch 16/30, Train MAE: 96.4539, Val MAE: 85.6343
Epoch 17/30, Train MAE: 80.0286, Val MAE: 75.8751
Epoch 18/30, Train MAE: 66.9681, Val MAE: 60.9732
Epoch 19/30, Train MAE: 58.7969, Val MAE: 55.6933
Epoch 20/30, Train MAE: 50.92

In [18]:
# Load best model: restore the weights the training loop saved at its lowest validation MAE
model.load_state_dict(torch.load(f'{MAIN_FOLDER}/data/best_model.pth'))

<All keys matched successfully>

In [19]:
# Predict on test
model.eval()
predictions = []
with torch.no_grad():
    for X_batch in test_loader:
        outputs = model(X_batch)
        # squeeze(1) keeps the batch axis, so a final batch holding a single row
        # still yields a 1-D array that extend() can iterate over; a bare
        # squeeze() would produce a 0-d array and make extend() raise
        predictions.extend(outputs.squeeze(1).cpu().numpy())

# Create submission: one predicted Tm per test id, in the test file's row order
# (the test loader does not shuffle)
submission = pd.DataFrame({
    'id': test_data['id'],
    'Tm': predictions
})
submission.to_csv(f'{MAIN_FOLDER}/data/nn_submission.csv', index=False)