In [22]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

In [23]:
df_train = pd.read_csv('datasets/train_data.csv')
df_test = pd.read_csv('datasets/test_data.csv')

In [24]:
X_train = df_train.iloc[:, :-1].values  # All columns except the last as features
y_train = df_train.iloc[:, -1].values   # Last column as target
X_predict = df_test.iloc[:, 1:].values   # All columns except the first as features

In [4]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [25]:
# Create LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

In [26]:
# Define model parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,  # Control the complexity of the tree
    'max_depth': -1,   # No limit on depth
    'feature_fraction': 0.8,  # Randomly sample features for better generalization
    'bagging_fraction': 0.8,  # Randomly sample data for better generalization
    'bagging_freq': 5,        # Perform bagging every 5 iterations
    'verbose': -1
}

In [27]:
model = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[test_data],
    #early_stopping_rounds=50,
    #verbose_eval=50
)

In [28]:
# Make predictions
y_pred = model.predict(X_predict, num_iteration=model.best_iteration)

In [21]:
# Evaluate performance
rmse = root_mean_squared_error(y_test, y_pred)
print(f'RMSE: {rmse}')

RMSE: 0.42102126389489314


In [29]:
predictions_df = pd.DataFrame(y_pred)
column_from_df1 = df_test["Unnamed: 0"]
column_from_df2 = predictions_df.iloc[:, -1]
predictions_df = pd.concat([column_from_df1, column_from_df2], axis=1)
predictions_df.columns = ['id', 'target_feature']
predictions_df

Unnamed: 0,id,target_feature
0,2016,8.256529
1,2017,8.448499
2,2018,8.503201
3,2019,8.025662
4,2020,7.420565
...,...,...
64507,193531,5.601156
64508,193532,5.277357
64509,193533,5.337293
64510,193534,5.519867


In [30]:
# Save submission as CSV file
predictions_df.to_csv('D:\\Python\\Predict the wind speed at a wind turbine\\submissions\\oleg_bissing_submission_8.csv', index=False)