<a href="https://colab.research.google.com/github/ofakduman/Lstm-forecasting/blob/main/lstm_predict_by_bacteria.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read by path.
drive.mount('/content/drive')

In [None]:
# Location of the dataset inside the mounted Drive folder.
DATA_PATH = "/content/drive/MyDrive/LSTM_forecasting/final_data.csv"

# Read the whole CSV into a DataFrame used by every later cell.
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows to check that the file loaded as expected.
df.head()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
# Organism indicator columns (presumably 0/1 flags, one per organism type).
# This list is also referenced by prepare_data further down.
organism_columns = [
    'Organism_Bacillus cereus',
    'Organism_Listeria monocytogenes/innocua',
    'Organism_Pseudomonads',
    'Organism_Salmonella spp',
]

# Column-wise totals of the indicator columns.
organism_counts = df.loc[:, organism_columns].sum()

# Draw the totals as a bar chart.
ax = organism_counts.plot.bar()
ax.set_title('Organism Türlerinin Dağılımı')
ax.set_xlabel('Organism Türü')
ax.set_ylabel('Frekans')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Keep only the rows whose 'Organism_Bacillus cereus' column equals 1.
is_bacillus_cereus = df['Organism_Bacillus cereus'].eq(1)
bacillus_cereus_df = df.loc[is_bacillus_cereus]

# Show the first ten matching rows.
bacillus_cereus_df.head(n=10)


In [None]:
from sklearn.model_selection import train_test_split

# Both splits use the same options: 20% held out, fixed seed for repeatability.
split_options = dict(test_size=0.2, random_state=42)

# First split: full data -> training pool + test set.
df_train, df_test = train_test_split(df, **split_options)

# Second split: training pool -> actual training set + validation set.
df_train_partial, df_val = train_test_split(df_train, **split_options)


In [None]:
import numpy as np
def prepare_data(df, time_steps=5, organism_cols=None):
    """Build fixed-length model inputs and variable-length targets from ``df``.

    Each row's 'Logcs' entry is a sequence of ``(time, value)`` points, or the
    string representation of such a sequence. The first ``time_steps`` points
    form the input window; the *values* of all remaining points form the target.
    Rows with ``time_steps`` points or fewer are skipped, as are rows whose
    'Logcs' entry has no length (e.g. a missing value read as NaN).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Logcs' column. Every column other than 'Record ID',
        'Total Values', 'Increases', 'Decreases', 'Logcs' and the organism
        columns is treated as a static feature and repeated at every time step.
    time_steps : int, default 5
        Number of leading points used as model input.
    organism_cols : iterable of str, optional
        Columns to exclude from the feature set in addition to the fixed ones.
        Defaults to the notebook-level ``organism_columns`` list.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_samples, time_steps, 2 + n_features)``; each step holds
        ``[time, value, *features]``.
    y : numpy.ndarray
        1-D object array of length ``n_samples``; element ``i`` is the list of
        target values for sample ``i``. It is always 1-D, even when every
        target happens to have the same length.

    Raises
    ------
    ValueError, SyntaxError
        If a string 'Logcs' entry is not a plain Python literal.
    """
    # Local import keeps the function usable without touching the import cell.
    import ast

    if organism_cols is None:
        # Fall back to the notebook-level list defined in the plotting cell.
        organism_cols = organism_columns

    exclude_columns = {'Record ID', 'Total Values', 'Increases', 'Decreases', 'Logcs', *organism_cols}
    all_features = [col for col in df.columns if col not in exclude_columns]
    feature_len = 2 + len(all_features)  # time + value + static features per step

    X, y = [], []
    for _, row in df.iterrows():
        logcs_data = row['Logcs']

        if isinstance(logcs_data, str):
            # The column comes from a CSV file, so parse it as a literal only;
            # eval() would execute arbitrary code embedded in the data.
            logcs_data = ast.literal_eval(logcs_data)

        # Skip missing entries (e.g. NaN) and series too short to leave a target.
        if not hasattr(logcs_data, '__len__') or len(logcs_data) <= time_steps:
            continue

        features = row[all_features].values

        X_input = []
        for point in logcs_data[:time_steps]:
            # Each step carries its own time and value plus the static features.
            X_input.extend([point[0], point[1]])
            X_input.extend(features)

        X.append(X_input)
        y.append([point[1] for point in logcs_data[time_steps:]])

    X_arr = np.array(X).reshape(len(X), time_steps, feature_len)

    # np.array(y, dtype=object) would silently become a 2-D array whenever all
    # targets share one length; filling a pre-sized 1-D array keeps y[i] a list.
    y_arr = np.empty(len(y), dtype=object)
    for i, targets in enumerate(y):
        y_arr[i] = targets

    return X_arr, y_arr


# Turn each of the three splits into model-ready arrays with the same routine.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    prepare_data(split) for split in (df_train_partial, df_val, df_test)
)


In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, RepeatVector, TimeDistributed
import numpy as np

# Longest target sequence in the training split; the decoder below is unrolled
# for exactly this many steps.
# NOTE(review): only y_train is consulted, so validation/test targets longer than
# this cannot be fully covered by the model output — confirm this is acceptable.
max_output_length = max([len(yi) for yi in y_train])

# Encoder: two stacked LSTM layers over the input window.
encoder_inputs = Input(shape=(5, len(X_train[0][0])))  # (5 time steps, features per step)
encoder_lstm = LSTM(100, return_state=True, return_sequences=True)  # 100 units; returns the full sequence and its final states
encoder_lstm_2 = LSTM(100, return_state=True)  # second LSTM layer; returns its last output and final states
# NOTE(review): encoder_lstm returns [sequence, h, c] and that whole list is passed
# to encoder_lstm_2; presumably Keras treats the extra entries as the initial
# state — TODO confirm on the installed Keras version.
encoder_outputs, state_h, state_c = encoder_lstm_2(encoder_lstm(encoder_inputs))
encoder_states = [state_h, state_c]

# Decoder: repeat the encoder output once per output step, then two stacked LSTMs.
decoder_inputs = RepeatVector(max_output_length)(encoder_outputs)
decoder_lstm = LSTM(100, return_sequences=True, return_state=True)  # 100 units; started from the encoder's final states
decoder_lstm_2 = LSTM(100, return_sequences=True, return_state=False)  # second decoder LSTM layer
# NOTE(review): as in the encoder, decoder_lstm's [sequence, h, c] output is passed
# directly into decoder_lstm_2 — TODO confirm this is intended.
decoder_outputs = decoder_lstm_2(decoder_lstm(decoder_inputs, initial_state=encoder_states))
decoder_dense = TimeDistributed(Dense(1))  # one scalar prediction per output time step
decoder_outputs = decoder_dense(decoder_outputs)


# Assemble the model from the encoder input to the per-step predictions.
model = Model(inputs=encoder_inputs, outputs=decoder_outputs)

# Compile with the Adam optimizer and mean-squared-error loss.
model.compile(optimizer='adam', loss='mean_squared_error')

# Print the layer-by-layer summary.
model.summary()

In [None]:
import tensorflow as tf
from sklearn.metrics import mean_squared_error

# Optimizer and loss for the manual training loop. The loop is written by hand
# because every sample has a different target length, so the loss is computed
# only over the first len(target) output steps of each prediction.
optimizer = tf.keras.optimizers.Adam()
loss_fn = tf.keras.losses.MeanSquaredError()

epochs = 20  # total number of epochs
train_losses = []
val_losses = []

# Convert the validation inputs once, instead of once per sample per epoch.
X_val_tensor = tf.convert_to_tensor(X_val, dtype=tf.float32)

for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    total_train_loss = 0
    total_val_loss = 0

    # Training: one gradient step per sample (batch size 1).
    for i in range(len(X_train)):
        y_train_tensor = tf.convert_to_tensor(y_train[i], dtype=tf.float32)
        with tf.GradientTape() as tape:
            y_pred = model(X_train[i:i+1], training=True)
            # Compare only the steps that exist in this sample's target.
            train_loss = loss_fn(y_train_tensor, y_pred[0, :len(y_train[i]), 0])
        gradients = tape.gradient(train_loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        total_train_loss += train_loss.numpy()

    # Validation: a single batched forward pass per epoch. Calling
    # model.predict() once per sample is slow and prints a progress bar for
    # every call; a direct call in inference mode avoids both.
    if len(X_val) > 0:
        y_pred_val = model(X_val_tensor, training=False)
        for i in range(len(X_val)):
            y_val_tensor = tf.convert_to_tensor(y_val[i], dtype=tf.float32)

            # Align prediction and target on their common length; a validation
            # target may be longer than the model's output horizon.
            min_length = min(y_pred_val.shape[1], len(y_val[i]))
            val_loss = loss_fn(y_val_tensor[:min_length], y_pred_val[i, :min_length, 0])

            total_val_loss += val_loss.numpy()

    # Per-sample average losses for this epoch (NaN when a split is empty).
    avg_train_loss = total_train_loss / len(X_train) if len(X_train) else float("nan")
    avg_val_loss = total_val_loss / len(X_val) if len(X_val) else float("nan")
    train_losses.append(avg_train_loss)
    val_losses.append(avg_val_loss)

    print(f"Training Loss: {avg_train_loss}, Validation Loss: {avg_val_loss}")

# Plot the per-epoch training and validation loss curves.
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 6))
for history, curve_label in ((train_losses, 'Training Loss'), (val_losses, 'Validation Loss')):
    ax.plot(history, label=curve_label)
ax.set_title('Train & Validation Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
plt.show()


In [None]:
# Predict on all test inputs in one call; later cells index the result as
# y_pred[sample, time_step, 0].
y_pred = model.predict(X_test)


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
def calculate_metrics(y_true, y_pred):
    """Average per-sample MSE, MAE and R2 between ragged targets and predictions.

    Parameters
    ----------
    y_true : sequence of sequences
        One target sequence per sample; lengths may differ between samples.
    y_pred : array indexed as ``y_pred[sample, time_step, 0]``
        Model output with a fixed number of time steps per sample.

    Returns
    -------
    tuple
        ``(mean_mse, mean_mae, mean_r2)``. Each sample is scored on the steps
        that exist in both its target and its prediction. R2 is averaged only
        over samples with more than one such step. Any value is NaN when no
        sample contributed to it.
    """
    mse_values, mae_values, r2_values = [], [], []
    horizon = y_pred.shape[1]  # number of time steps predicted per sample

    for i in range(len(y_true)):
        true_values = np.asarray(y_true[i], dtype=float)

        # A target can be longer than the prediction horizon; slicing y_pred
        # past its end would yield fewer values than the target and the
        # sklearn metrics would raise on the length mismatch. Score the overlap.
        n_common = min(len(true_values), horizon)
        if n_common == 0:
            continue
        true_values = true_values[:n_common]
        pred_values = y_pred[i, :n_common, 0]

        mse_values.append(mean_squared_error(true_values, pred_values))
        mae_values.append(mean_absolute_error(true_values, pred_values))

        # R2 is computed only for samples with more than one value.
        if n_common > 1:
            r2_values.append(r2_score(true_values, pred_values))

    def _mean_or_nan(values):
        # Average of a list, or NaN (without a NumPy warning) when it is empty.
        return np.mean(values) if values else np.nan

    return _mean_or_nan(mse_values), _mean_or_nan(mae_values), _mean_or_nan(r2_values)

# Compute the aggregate error metrics on the test set and report them.
mse, mae, r2 = calculate_metrics(y_test, y_pred)

for caption, score in (
    ("Mean Squared Error:", mse),
    ("Mean Absolute Error:", mae),
    ("R2 Score:", r2),
):
    print(caption, score)


In [None]:
import matplotlib.pyplot as plt
import random

# Plot up to 55 randomly chosen test samples. random.sample raises ValueError
# when asked for more items than the population holds, so cap the request at
# the number of available test samples.
max_plots = 55
selected_indices = random.sample(range(len(y_test)), min(max_plots, len(y_test)))

for index in selected_indices:
    fig, ax = plt.subplots(figsize=(8, 4))  # figure size
    ax.plot(y_test[index], marker='o', label='y_test')

    # Trim the prediction to the target's length and take the single output
    # channel so that it is plotted as a plain 1-D series.
    y_pred_trimmed = y_pred[index, :len(y_test[index]), 0]
    ax.plot(y_pred_trimmed, marker='x', label='y_pred')

    ax.set_xlabel('Time Step')
    ax.set_ylabel('Value')
    ax.legend()
    ax.grid(True)
    ax.set_title(f'Sample {index + 1}')
    plt.show()
    plt.close(fig)  # release the figure so many plots do not pile up in memory