# In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error

# Load the datasets
# NOTE(review): inputs are read from '../artfacts/' while the submission at the
# end of this script is written to '../artifacts/' -- confirm which directory
# spelling is intended.
historical_weather = pd.read_csv('../artfacts/historical_weather.csv')
sample_submission = pd.read_csv('../artfacts/sample_submission.csv')
submission_key = pd.read_csv('../artfacts/submission_key.csv')

# Parse dates first so that rows can be put in chronological order per city.
# Both the interpolation below and the sliding-window sequences built later
# rely on consecutive rows being consecutive observations of the same city.
historical_weather['date'] = pd.to_datetime(historical_weather['date'])
historical_weather = (
    historical_weather
    .sort_values(['city_id', 'date'])
    .reset_index(drop=True)
)

# Handle missing values using linear interpolation, one city at a time.
# Interpolating the whole frame at once would blend values across city
# boundaries, and would also be applied to non-numeric columns (deprecated in
# recent pandas). `limit_direction='both'` additionally fills gaps at the very
# start of a city's series, which a forward-only fill would leave as NaN.
numeric_columns = list(
    historical_weather
    .select_dtypes(include='number')
    .columns
    .drop('city_id', errors='ignore')  # never interpolate the grouping key
)
historical_weather[numeric_columns] = (
    historical_weather
    .groupby('city_id')[numeric_columns]
    .transform(lambda series: series.interpolate(method='linear', limit_direction='both'))
)

# Generate date-based features
historical_weather['year'] = historical_weather['date'].dt.year
historical_weather['month'] = historical_weather['date'].dt.month
historical_weather['day'] = historical_weather['date'].dt.day
historical_weather['day_of_week'] = historical_weather['date'].dt.dayofweek

# Normalize the temperature data to [0, 1].
# A single scaler is fitted over all cities; the same `scaler` object is used
# later to map model outputs back to degrees via `inverse_transform`.
scaler = MinMaxScaler(feature_range=(0, 1))
historical_weather[['avg_temp_c']] = scaler.fit_transform(historical_weather[['avg_temp_c']])

# Function to create sequences of data for LSTM input
def create_sequences(data, seq_length):
    """Build sliding-window inputs and next-step targets from a series.

    Window ``i`` is ``data[i:i + seq_length]`` and its target is
    ``data[i + seq_length]``.

    Args:
        data: Sequence or array of observations ordered along axis 0.
        seq_length: Number of consecutive observations per window. Must be
            non-negative.

    Returns:
        A tuple ``(sequences, targets)`` of NumPy arrays. With
        ``n = max(len(data) - seq_length, 0)``, ``sequences`` has shape
        ``(n, seq_length, ...)`` and ``targets`` has shape ``(n, ...)``, where
        ``...`` stands for any trailing dimensions of ``data``. When the input
        is too short to form a window, both arrays are empty but keep those
        shapes, so callers can still read ``sequences.shape[1]``.

    Raises:
        ValueError: If ``seq_length`` is negative.
    """
    if seq_length < 0:
        raise ValueError(f"seq_length must be non-negative, got {seq_length}")

    values = np.asarray(data)
    n_windows = max(len(values) - seq_length, 0)

    # Row i of `window_index` holds the positions i, i+1, ..., i+seq_length-1.
    # Indexing with it gathers every window in one vectorized step and yields
    # a correctly shaped (0, seq_length) result when there are no windows.
    window_index = np.arange(n_windows)[:, None] + np.arange(seq_length)[None, :]
    sequences = values[window_index]

    # Copy so the targets do not alias the caller's input array.
    targets = values[seq_length:seq_length + n_windows].copy()
    return sequences, targets

# Prepare the data for LSTM model
sequence_length = 30
unique_city_ids = historical_weather['city_id'].unique()

# Forecast horizon: the first week of 2019. It is the same for every city, so
# it is computed once here; only its length is needed inside the loop.
first_week_2019_dates = pd.date_range(start='2019-01-01', end='2019-01-07')
forecast_steps = len(first_week_2019_dates)

# Per-city prediction frames are collected here and concatenated once after
# the loop (repeated pd.concat inside the loop copies the data every time).
city_prediction_frames = []

# Iterate through each city
for city_id in unique_city_ids:
    city_data = historical_weather[historical_weather['city_id'] == city_id]['avg_temp_c'].values

    # At least sequence_length + 1 points are required: sequence_length inputs
    # plus one target. With exactly sequence_length points no sequence exists.
    if len(city_data) <= sequence_length:
        print(f"Not enough data points to create sequences for city {city_id}.")
        continue

    # Look up the submission rows before training so that no model is fitted
    # for a city that needs no predictions.
    city_submission_key = submission_key[submission_key['city_id'] == city_id].copy()
    if city_submission_key.empty:
        print(f"No submission rows for city {city_id}; skipping.")
        continue
    if len(city_submission_key) != forecast_steps:
        raise ValueError(
            f"City {city_id} has {len(city_submission_key)} submission rows "
            f"but {forecast_steps} forecast steps are produced."
        )

    X, y = create_sequences(city_data, sequence_length)

    # Chronological split into training and held-out test sets
    train_size = int(len(X) * 0.8)
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]

    # Reshape the input to be 3D (samples, time steps, features)
    X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
    X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

    # Build the LSTM model: two stacked LSTM layers and a single-value output
    model = Sequential()
    model.add(LSTM(units=50, return_sequences=True, input_shape=(sequence_length, 1)))
    model.add(LSTM(units=50))
    model.add(Dense(1))

    model.compile(optimizer='adam', loss='mean_squared_error')

    # Train the model; early stopping watches the loss on the validation split
    early_stop = EarlyStopping(monitor='val_loss', patience=5)
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, callbacks=[early_stop])

    # Report the loss on the held-out test split. The model is compiled with
    # an MSE loss, so this is the test MSE in scaled (0-1) units.
    if len(X_test) > 0:
        test_loss = model.evaluate(X_test, y_test, verbose=0)
        print(f"City {city_id}: held-out test MSE (scaled) = {test_loss:.6f}")

    # Seed the forecast with the most recent `sequence_length` observations of
    # the full series (not only the training portion).
    last_sequence = city_data[-sequence_length:]
    first_week_predictions = []

    for _ in range(forecast_steps):
        # Predict the next value
        last_sequence_reshaped = np.reshape(last_sequence, (1, sequence_length, 1))
        next_prediction = model.predict(last_sequence_reshaped)

        # Store the prediction in original units
        next_prediction_rescaled = scaler.inverse_transform(next_prediction)
        first_week_predictions.append(float(next_prediction_rescaled[0][0]))

        # Slide the window: drop the oldest value and append the new (still
        # scaled) prediction so it feeds the next step.
        last_sequence = np.append(last_sequence[1:], next_prediction)

    # Attach the predictions to the city's submission rows.
    # NOTE(review): assumes the rows of submission_key for a city are already
    # in chronological order -- confirm against the file.
    city_submission_key['avg_temp_c'] = first_week_predictions
    city_prediction_frames.append(city_submission_key[['submission_ID', 'avg_temp_c']])

# Combine all per-city predictions into one DataFrame
if city_prediction_frames:
    predictions_df = pd.concat(city_prediction_frames, ignore_index=True)
else:
    predictions_df = pd.DataFrame(columns=['submission_ID', 'avg_temp_c'])

# Merge predictions with sample_submission to create the final submission file.
# Rows without a prediction (e.g. skipped cities) end up with NaN.
final_submission = sample_submission.drop(columns=['avg_temp_c']).merge(predictions_df, on='submission_ID', how='left')

# Save to CSV
# NOTE(review): this writes to '../artifacts/' whereas the inputs were read
# from '../artfacts/' -- confirm which directory spelling is intended.
final_submission.to_csv('../artifacts/sample_submission.csv', index=False)
print("Submission file saved as sample_submission.csv")