In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.linear_model import Lasso, ridge_regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from skleran.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
import xgboost as xgb
from statsmodels.tsa.arima.model import ARIMA
import warnings

warnings.filterwarnings("ignore")
#pip install xgboost
# Load the time series data
df = pd.read_csv(r"C:\Users\nagashree k d\Desktop\airline-passengers.csv", header=0, parse_dates=[0], index_col=0)

# Function to create lagged features
def create_features(df, lags):
 for lag in range(1, lags + 1):
 df[f'lag_{lag}'] = df['Passengers'].shift(lag)
 df.dropna(inplace=True)
 return df

# Preprocess the data
df = create_features(df, lags=5)
X = df.drop(columns=['Passengers'])
y = df['Passengers']

# Train-test split
tscv = TimeSeriesSplit(n_splits=5)

# Model evaluation function for machine learning models
def evaluate_model(model, X_train, X_test, y_train, y_test):
 model.fit(X_train, y_train)
 y_pred = model.predict(X_test)
 mse = mean_squared_error(y_test, y_pred)
 mae = mean_absolute_error(y_test, y_pred)
 return mse, mae, y_pred

# Function to evaluate the ARIMA model
def evaluate_arima(train, test):
 history = [x for x in train['Passengers']]
 predictions = list()
 for t in range(len(test)):
 model = ARIMA(history, order=(5, 1, 0))
 model_fit = model.fit(disp=0)
 yhat = model_fit.forecast()[0]
 predictions.append(yhat)
 history.append(test['Passengers'].iloc[t])
 mse = mean_squared_error(test['Passengers'], predictions)
 mae = mean_absolute_error(test['Passengers'], predictions)
 return mse, mae, predictions

# Define models
models = {
 'Linear Regression': LinearRegression(),
 'Logistic Regression' : LogisticRegression(),
 'Lass0' : Lasso(),
 'Ridge Regression' :  ridge_regression(),
 'Decision Tree': DecisionTreeRegressor,
 'Random Forest': RandomForestRegressor(),
 'Gradient BOOSTING' : GradientBoostingRegressor(),
 'Support Vector Regressor': SVR(),
 'KNN' :KNeighborsRegressor(),
 'XGBoost': xgb.XGBRegressor(objective='reg:squarederror'),
 'ARIMA': evaluate_arima
}

results = {}

# Evaluate each model
for train_index, test_index in tscv.split(X):
 X_train, X_test = X.iloc[train_index], X.iloc[test_index]
 y_train, y_test = y.iloc[train_index], y.iloc[test_index]

 for name, model in models.items():
 try:
 if name == 'ARIMA':
 train, test = df.iloc[train_index], df.iloc[test_index]
 mse, mae, predictions = model(train, test)
 else:
 mse, mae, predictions = evaluate_model(model, X_train, X_test, y_train, y_test)
 if name not in results:
 results[name] = {'MSE': [], 'MAE': [], 'predictions': []}
 results[name]['MSE'].append(mse)
 results[name]['MAE'].append(mae)
 results[name]['predictions'].extend(predictions)
 print(f'{name} - MSE: {mse}, MAE: {mae}')
 except Exception as e:
 print(f'{name} model failed: {e}')
 if name not in results:
 results[name] = {'MSE': [], 'MAE': [], 'predictions': []}
 results[name]['MSE'].append(float('inf'))
 results[name]['MAE'].append(float('inf'))
 results[name]['predictions'].extend([float('inf')] * len(test))

# Compute average metrics for each model
for name in results:
 results[name]['MSE'] = np.mean(results[name]['MSE'])
 results[name]['MAE'] = np.mean(results[name]['MAE'])

# Select the best model
best_model_name = min(results.items(), key=lambda x: x[1]['MSE'])[0]
best_model_predictions = results[best_model_name]['predictions']

print(f'Best Model: {best_model_name}')
print(f'MSE: {results[best_model_name]["MSE"]}')
print(f'MAE: {results[best_model_name]["MAE"]}')

# Anomaly detection using the best model's predictions
y = df['Passengers']
residuals = y[len(y) - len(best_model_predictions):] - best_model_predictions
std_res = np.std(residuals)
anomalies = df[len(df) - len(best_model_predictions):][abs(residuals) > 2 * std_res]

# Plot results
plt.figure(figsize=(14, 7))
plt.plot(df.index, y, label='True values')
plt.plot(df.index[len(df) - len(best_model_predictions):], best_model_predictions, label='Predicted values', alpha=0.7)
plt.scatter(anomalies.index, anomalies['Passengers'], color='red', label='Anomalies')
plt.legend()
plt.show()

IndentationError: expected an indented block after 'for' statement on line 23 (3233502113.py, line 24)