In [1]:
#pip install tensorflow

In [77]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler



In [78]:
# Load the dataset
df = pd.read_csv("air_pollution_data.csv")

# Preprocessing
# Assuming 'dt' is the timestamp column
df['dt'] = pd.to_datetime(df['dt'])
df.set_index('dt', inplace=True)


In [79]:
# Define the sequence length
sequence_length = 10

# Create overlapping sequences and corresponding target sequences
def create_sequences(data, sequence_length):
    X, y = [], []
    for i in range(len(data) - sequence_length + 1):
        X.append(data[i:i+sequence_length])
        y.append(data[i+sequence_length-1])  # Target is the last element of each sequence
    return np.array(X), np.array(y)

# Create input sequences and corresponding target sequences
X, y = create_sequences(df.values, sequence_length)

# Normalize the data
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X.reshape(-1, sequence_length * X.shape[2])).reshape(X.shape)
y_scaled = scaler.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

# Build the LSTM model
model = Sequential([
    LSTM(units=50, activation='relu', return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
    LSTM(units=50, activation='relu'),
    Dense(1)
])
model.compile(optimizer='adam', loss='mse')

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
mse = model.evaluate(X_test, y_test)
print("Mean Squared Error:", mse)

# Make predictions
predictions = model.predict(X_test)




Epoch 1/50


  super().__init__(**kwargs)


[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - loss: 0.1715 - val_loss: 0.1271
Epoch 2/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.1258 - val_loss: 0.1242
Epoch 3/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.1258 - val_loss: 0.1237
Epoch 4/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.1241 - val_loss: 0.1234
Epoch 5/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.1243 - val_loss: 0.1230
Epoch 6/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.1234 - val_loss: 0.1227
Epoch 7/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.1241 - val_loss: 0.1225
Epoch 8/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.1241 - val_loss: 0.1223
Epoch 9/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

In [80]:
predictions.shape

(215, 1)

In [81]:
print("Shape of predictions:", predictions.shape)
print("Shape of original data (X):", X.shape)

Shape of predictions: (215, 1)
Shape of original data (X): (1072, 10, 9)


In [85]:

# Reshape the predictions array to match the original number of samples
predictions_reshaped = predictions.reshape(-1, 1)

# Create a DataFrame with one column named "Predicted_AQI" containing the reshaped predictions
predictions_df = (pd.DataFrame(predictions_reshaped, columns=['Predicted_AQI']).multiply(10)).astype(int)

# Print the first few rows of the predictions DataFrame
print(predictions_df.head())

   Predicted_AQI
0              3
1              2
2              4
3              1
4              3


In [89]:
# Get the maximum predicted AQI value
max_original_aqi = df['main_aqi'].max()

print("Max Original AQI:", max_original_aqi)


# Scale up the predicted AQI values
scaled_predictions = predictions_df['Predicted_AQI']* max_original_aqi 

# Round the scaled predictions and convert them to integers
predictions_df['Predicted_AQI'] = predictions_df['Predicted_AQI'].apply(lambda x: max(1, int(round(x))))

# Print the first few rows of the updated predictions DataFrame
print(predictions_df.head())


Max Original AQI: 2
   Predicted_AQI
0              3
1              2
2              4
3              1
4              3


In [90]:
print(predictions_df.head(50))

    Predicted_AQI
0               3
1               2
2               4
3               1
4               3
5               4
6               5
7               2
8               4
9               2
10              2
11              2
12              3
13              3
14              6
15              4
16              5
17              2
18              2
19              5
20              5
21              4
22              3
23              1
24              4
25              4
26              2
27              2
28              4
29              3
30              6
31              5
32              3
33              3
34              5
35              2
36              2
37              2
38              3
39              4
40              3
41              3
42              2
43              3
44              2
45              3
46              4
47              4
48              5
49              3


In [68]:
#pip install statsmodels

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [92]:
'''from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Assuming you have a time series dataset stored in a DataFrame called 'data'
# with a datetime index
# Example:
# data = pd.read_csv('your_data.csv', index_col='Date', parse_dates=True)

# 1. Preprocess the Data
# No missing values handling for simplicity
# Example: data.dropna(inplace=True)

# Assuming 'AQI' is the column you want to use as the endogenous variable
endog = train['main_aqi']

# Define SARIMA parameters
order = (1, 1, 1)  # Non-seasonal parameters
seasonal_order = (1, 1, 1, 12)  # Seasonal parameters

# Fit SARIMA model
model = SARIMAX(endog, order=order, seasonal_order=seasonal_order)
result = model.fit()
# 2. Split the Data
train_size = int(0.8 * len(df))  # 80% train, 20% test
train, test = df[:train_size], df[train_size:]

# 3. Fit the SARIMA Model
# Define SARIMA parameters (p, d, q, P, D, Q, m)
order = (1, 1, 1)  # Non-seasonal parameters
seasonal_order = (1, 1, 1, 12)  # Seasonal parameters

# Fit SARIMA model
model = SARIMAX(train, order=order, seasonal_order=seasonal_order)
result = model.fit()

# 4. Validate the Model
# Plot ACF and PACF of residuals
residuals = result.resid
plot_acf(residuals)
plot_pacf(residuals)
plt.show()

# 5. Forecast Future Values
forecast = result.forecast(steps=len(test))  # Forecast the test set period'''

"from statsmodels.tsa.statespace.sarimax import SARIMAX\nfrom statsmodels.graphics.tsaplots import plot_acf, plot_pacf\n\n# Assuming you have a time series dataset stored in a DataFrame called 'data'\n# with a datetime index\n# Example:\n# data = pd.read_csv('your_data.csv', index_col='Date', parse_dates=True)\n\n# 1. Preprocess the Data\n# No missing values handling for simplicity\n# Example: data.dropna(inplace=True)\n\n# Assuming 'AQI' is the column you want to use as the endogenous variable\nendog = train['main_aqi']\n\n# Define SARIMA parameters\norder = (1, 1, 1)  # Non-seasonal parameters\nseasonal_order = (1, 1, 1, 12)  # Seasonal parameters\n\n# Fit SARIMA model\nmodel = SARIMAX(endog, order=order, seasonal_order=seasonal_order)\nresult = model.fit()\n# 2. Split the Data\ntrain_size = int(0.8 * len(df))  # 80% train, 20% test\ntrain, test = df[:train_size], df[train_size:]\n\n# 3. Fit the SARIMA Model\n# Define SARIMA parameters (p, d, q, P, D, Q, m)\norder = (1, 1, 1)  # Non-