In [31]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, r2_score

# Load the data
X_train = pd.read_csv('processed_data/knn/processed_train_knn.csv')
y_train = pd.read_csv('data/train.csv')
y_train = y_train["yield_strength"]

X_valid = pd.read_csv('processed_data/knn/processed_validation_knn.csv')
y_valid = pd.read_csv('data/validation.csv')
y_valid = y_valid["yield_strength"]

X_test = pd.read_csv('processed_data/knn/processed_test_knn.csv')
y_test = pd.read_csv('data/test.csv')
y_test = y_test["yield_strength"]

# Separate labeled and unlabeled data
X_labeled = X_train[y_train.notna()]  # Only entries with labels
y_labeled = y_train[y_train.notna()]

# Create a set of unlabeled data
X_unlabeled = X_train[y_train.isna()]  # Only entries without labels

# Function to perform self-training
def self_training(X_labeled, y_labeled, X_unlabeled, model, max_iterations=10):
    # Train model with labeled data
    model.fit(X_labeled, y_labeled)
    
    for i in range(max_iterations):
        # Predict labels for the unlabeled data
        y_unlabeled_pred = model.predict(X_unlabeled)

        # Add reliable predictions to the labeled dataset
        # For the example, we consider all predictions, but you may add a confidence criterion
        X_new = X_unlabeled.copy()
        y_new = pd.Series(y_unlabeled_pred)

        X_labeled = pd.concat([X_labeled, X_new])
        y_labeled = pd.concat([y_labeled, y_new])

        # Remove already used unlabeled data
        X_unlabeled = pd.DataFrame()  # Here we are emptying it since we've used all data

        # Re-train the model
        model.fit(X_labeled, y_labeled)

        # Check if there are more unlabeled data to add
        if len(X_unlabeled) == 0:
            print("No remaining unlabeled data.")
            break
    
    return model, X_labeled, y_labeled

# Initialize the LGBMRegressor model
model_lgb_self = lgb.LGBMRegressor()

model_lgb_self, X_labeled, y_labeled = self_training(X_labeled, y_labeled, X_unlabeled, model_lgb_self)

# Function to calculate the metrics
def evaluate_model(model, X, y):
    y_pred = model.predict(X)
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    r2 = r2_score(y, y_pred)
    return rmse, r2

# Evaluate the final model using validation data (ignoring missing values)
if y_valid.notna().any():
    rmse_valid, r2_valid = evaluate_model(model_lgb_self, X_valid[y_valid.notna()], y_valid[y_valid.notna()])
    print("Validation - RMSE:", rmse_valid)
    print("Validation - R²:", r2_valid)
else:
    print("Validation set does not contain valid labels.")

# Evaluate the final model using test data (ignoring missing values)
if y_test.notna().any():
    rmse_test, r2_test = evaluate_model(model_lgb_self, X_test[y_test.notna()], y_test[y_test.notna()])
    print("Test - RMSE:", rmse_test)
    print("Test - R²:", r2_test)
else:
    print("Test set does not contain valid labels.")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000413 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2822
[LightGBM] [Info] Number of data points in the train set: 551, number of used features: 42
[LightGBM] [Info] Start training from score 506.534301
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000820 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4147
[LightGBM] [Info] Number of data points in the train set: 1156, number of used features: 42
[LightGBM] [Info] Start training from score 501.415304
No remaining unlabeled data.
Validation - RMSE: 41.0633832450362
Validation - R²: 0.8126382545667101
Test - RMSE: 39.78373232671887
Test - R²: 0.826745015159151


In [33]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, r2_score

# Load the data
X_train = pd.read_csv('processed_data/knn/processed_train_knn.csv')
y_train = pd.read_csv('data/train.csv')
y_train = y_train["yield_strength"]

X_valid = pd.read_csv('processed_data/knn/processed_validation_knn.csv')
y_valid = pd.read_csv('data/validation.csv')
y_valid = y_valid["yield_strength"]

X_test = pd.read_csv('processed_data/knn/processed_test_knn.csv')
y_test = pd.read_csv('data/test.csv')
y_test = y_test["yield_strength"]

# Separate labeled and unlabeled data
X_labeled = X_train[y_train.notna()]  # Only entries with labels
y_labeled = y_train[y_train.notna()]

# Create an unlabeled dataset
X_unlabeled = X_train[y_train.isna()]  # Only entries without labels

# Function to train the model and evaluate
def train_and_evaluate(X_train, y_train, X_valid, y_valid):
    model = lgb.LGBMRegressor()
    model.fit(X_train, y_train)
    
    # Evaluate the model
    y_pred = model.predict(X_valid)
    rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    r2 = r2_score(y_valid, y_pred)
    return model, rmse, r2

# 1. Training only with labeled data
model_trained, rmse_trained, r2_trained = train_and_evaluate(X_labeled, y_labeled, X_valid[y_valid.notna()], y_valid[y_valid.notna()])
print("LGBMRegressor model trained only with labeled data:")
print("Validation - RMSE:", rmse_trained)
print("Validation - R²:", r2_trained)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000332 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2822
[LightGBM] [Info] Number of data points in the train set: 551, number of used features: 42
[LightGBM] [Info] Start training from score 506.534301
LGBMRegressor model trained only with labeled data:
Validation - RMSE: 41.61182901852564
Validation - R²: 0.8075999956818737


In [32]:
from xgboost import XGBRegressor

model = XGBRegressor(objective='reg:squarederror')

model, X_labeled, y_labeled = self_training(X_labeled, y_labeled, X_unlabeled, model)

# Function to calculate the metrics
def evaluate_model(model, X, y):
    y_pred = model.predict(X)
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    r2 = r2_score(y, y_pred)
    return rmse, r2

# Evaluate the final model using validation data (ignoring missing values)
if y_valid.notna().any():
    rmse_valid, r2_valid = evaluate_model(model, X_valid[y_valid.notna()], y_valid[y_valid.notna()])
    print("Validation - RMSE:", rmse_valid)
    print("Validation - R²:", r2_valid)
else:
    print("Validation set does not contain valid labels.")

# Evaluate the final model using test data (ignoring missing valu


No remaining unlabeled data.
Validation - RMSE: 42.92613158144216
Validation - R²: 0.7952542141290901


In [34]:
from xgboost import XGBRegressor

def train_and_evaluate(X_train, y_train, X_valid, y_valid):
    model = XGBRegressor(objective='reg:squarederror')
    model.fit(X_train, y_train)
    
    # Evaluate the model
    y_pred = model.predict(X_valid)
    rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    r2 = r2_score(y_valid, y_pred)
    return model, rmse, r2

# 1. Training only with labeled data
model_trained, rmse_trained, r2_trained = train_and_evaluate(X_labeled, y_labeled, X_valid[y_valid.notna()], y_valid[y_valid.notna()])
print("XGBoost model trained only with labeled data:")
print("Validation - RMSE:", rmse_trained)
print("Validation - R²:", r2_trained)


XGBoost model trained only with labeled data:
Validation - RMSE: 45.91285645046946
Validation - R²: 0.7657713081296543
