In [1]:
import pandas as pd

In [None]:
dataset = pd.read_excel('Dataset.xlsx')
dataset = dataset.drop(0)

print(dataset)

In [None]:
data_no_nan = dataset.dropna(axis=1)

print(data_no_nan)

In [24]:
import torch
from transformers import XLNetTokenizer, XLNetForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, precision_score, recall_score, f1_score
from torch.utils.data import DataLoader, TensorDataset

In [5]:
# Preprocess the data
sequence_length = 3
data = []
target = []

In [6]:
for i in range(len(dataset) - sequence_length):
    sequence = dataset['Wind speed [m/s]'].iloc[i:i + sequence_length].tolist()
    label = dataset['Wind speed [m/s]'].iloc[i + sequence_length]
    data.append(sequence)
    target.append(label)

In [7]:
# Convert data & target 
data = torch.tensor(data)
target = torch.tensor(target)

In [8]:
# Split Data
train_data, test_data, train_labels, test_labels = train_test_split(data, target, test_size=0.3, random_state=42)

In [9]:
# Tokenization
# Convert numeric ke string
train_data_text = [" ".join(map(str, seq)) for seq in train_data.tolist()]
test_data_text = [" ".join(map(str, seq)) for seq in test_data.tolist()]

In [10]:
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased') 

In [11]:
train_encodings = tokenizer(train_data_text, padding=True, truncation=True, return_tensors="pt")
test_encodings = tokenizer(test_data_text, padding=True, truncation=True, return_tensors="pt")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [12]:
# Prepare DataLoader
train_dataset = TensorDataset(train_encodings['input_ids'], torch.tensor(train_labels))
test_dataset = TensorDataset(test_encodings['input_ids'], torch.tensor(test_labels))

  train_dataset = TensorDataset(train_encodings['input_ids'], torch.tensor(train_labels))
  test_dataset = TensorDataset(test_encodings['input_ids'], torch.tensor(test_labels))


In [13]:
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)

In [20]:
#Modeling
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=1)  # For regression, num_labels=1

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
#Training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
epochs = 3

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch_idx, batch in enumerate(train_loader):
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device)

        # Forward pass
        outputs = model(inputs, labels=labels.float().unsqueeze(1))
        loss = outputs.loss

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Track the loss
        total_loss += loss.item()

        # Print loss for each batch
        print(f"Epoch [{epoch + 1}/{epochs}], Batch [{batch_idx + 1}/{len(train_loader)}], Loss: {loss.item():.4f}")

    # Calculate and print average loss per epoch
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch + 1}/{epochs}] completed. Average Loss: {avg_loss:.4f}\n")



Epoch [1/3], Batch [1/4599], Loss: 25.4686
Epoch [1/3], Batch [2/4599], Loss: 32.5484
Epoch [1/3], Batch [3/4599], Loss: 18.5912
Epoch [1/3], Batch [4/4599], Loss: 26.2357
Epoch [1/3], Batch [5/4599], Loss: 10.0833
Epoch [1/3], Batch [6/4599], Loss: 29.9962
Epoch [1/3], Batch [7/4599], Loss: 29.7978
Epoch [1/3], Batch [8/4599], Loss: 10.7189
Epoch [1/3], Batch [9/4599], Loss: 5.9619
Epoch [1/3], Batch [10/4599], Loss: 8.8469
Epoch [1/3], Batch [11/4599], Loss: 18.7330
Epoch [1/3], Batch [12/4599], Loss: 10.8873
Epoch [1/3], Batch [13/4599], Loss: 9.5003
Epoch [1/3], Batch [14/4599], Loss: 8.9322
Epoch [1/3], Batch [15/4599], Loss: 8.1045
Epoch [1/3], Batch [16/4599], Loss: 10.9216
Epoch [1/3], Batch [17/4599], Loss: 19.5442
Epoch [1/3], Batch [18/4599], Loss: 11.8749
Epoch [1/3], Batch [19/4599], Loss: 8.3649
Epoch [1/3], Batch [20/4599], Loss: 23.8446
Epoch [1/3], Batch [21/4599], Loss: 10.8634
Epoch [1/3], Batch [22/4599], Loss: 10.0870
Epoch [1/3], Batch [23/4599], Loss: 11.7298
Epo

In [49]:
# Mode evaluasi untuk prediksi pada data validasi
model.eval()
predictions = []
actuals = []

# Mengumpulkan prediksi dan label aktual
with torch.no_grad():
    for batch in test_loader:
        inputs, labels = batch
        inputs = inputs.to(device)
        
        # Mendapatkan prediksi
        outputs = model(inputs)
        logits = outputs.logits
        
        # Binarisasi hasil prediksi (misalnya, ambil argmax untuk multi-kelas atau threshold untuk biner)
        pred_labels = torch.argmax(logits, dim=1) if logits.shape[1] > 1 else (logits > 0).int()
        predictions.extend(pred_labels.cpu().numpy())
        actuals.extend(labels.cpu().numpy())

In [50]:
def my_sigmod(x: int) -> int:
    if x > 5:
        return 1 
    else:
        return 0

predictions = [int(x[0]) for x in predictions]
actuals = [my_sigmod(x) for x in actuals]

# Menghitung metrik
precision = precision_score(actuals, predictions, average='binary')  # Ubah ke 'micro', 'macro', atau 'weighted' untuk multi-kelas
recall = recall_score(actuals, predictions, average='binary')
f1 = f1_score(actuals, predictions, average='binary')

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Precision: 0.5709
Recall: 1.0000
F1 Score: 0.7268
