https://www.kaggle.com/datasets/borismarjanovic/price-volume-data-for-all-us-stocks-etfs

## Without parallelism

In [1]:
import pandas as pd
import numpy as np
import time
import glob
import os
import logging
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest, f_regression

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def extract_symbol(filename):
    return os.path.splitext(os.path.basename(filename))[0].split('.')[0]

def train_and_evaluate_model(X_train, X_test, y_train, y_test, model_class, **kwargs):
    model = model_class(**kwargs)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    
    # Perform cross-validation
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    cv_rmse = np.sqrt(-cv_scores.mean())
    
    return rmse, cv_rmse, model.__class__.__name__

def process_and_analyze_file(filename):
    try:
        if os.path.getsize(filename) == 0:
            logging.warning(f"Skipping empty file: {filename}")
            return None
        
        symbol = extract_symbol(filename)
        df = pd.read_csv(filename, parse_dates=['Date'], usecols=['Date', 'Close', 'High', 'Low', 'Volume'])
        
        if df.empty:
            logging.warning(f"File {filename} is empty after reading.")
            return None
        
        # Calculate daily returns
        df['Return'] = df['Close'].pct_change()
        
        # Feature engineering
        df['MA5'] = df['Close'].rolling(window=5).mean()
        df['MA20'] = df['Close'].rolling(window=20).mean()
        df['Volatility'] = df['Return'].rolling(window=20).std()
        
        # Additional features
        df['Slope'] = (df['Close'] - df['Close'].shift(5)) / 5
        df['RSI'] = calculate_rsi(df['Close'], window=14)
        df['MACD'] = calculate_macd(df['Close'])
        
        # Prepare data for ML models
        df = df.dropna()
        features = ['Close', 'High', 'Low', 'Volume', 'Return', 'MA5', 'MA20', 'Volatility', 'Slope', 'RSI', 'MACD']
        X = df[features]
        y = df['Close'].shift(-1).dropna()
        X = X[:-1]  # Remove last row to align with y
        
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
        # Scale features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Feature selection
        selector = SelectKBest(score_func=f_regression, k=5)
        X_train_selected = selector.fit_transform(X_train_scaled, y_train)
        X_test_selected = selector.transform(X_test_scaled)
        
        selected_features = [features[i] for i in selector.get_support(indices=True)]
        
        # Train and evaluate models
        models = [
            (RandomForestRegressor, {'n_estimators': 100, 'random_state': 42}),
            (ElasticNet, {'alpha': 1.0, 'random_state': 42}),
            (SVR, {'kernel': 'rbf'}),
            (XGBRegressor, {'n_estimators': 100, 'random_state': 42})
        ]
        
        model_performances = [
            train_and_evaluate_model(X_train_selected, X_test_selected, y_train, y_test, model_class, **params) 
            for model_class, params in models
        ]
        
        # Find best model
        best_rmse, best_cv_rmse, best_model = min(model_performances, key=lambda x: x[0])
        
        # Calculate summary statistics
        summary = {
            'symbol': symbol,
            'avg_daily_return': df['Return'].mean(),
            'volatility': df['Return'].std() * np.sqrt(252),  # Annualized volatility
            'sharpe_ratio': df['Return'].mean() / df['Return'].std() * np.sqrt(252),
            'max_drawdown': (df['Close'] / df['Close'].cummax() - 1).min(),
            'avg_volume': df['Volume'].mean(),
            'best_model': best_model,
            'best_model_rmse': best_rmse,
            'best_model_cv_rmse': best_cv_rmse,
            'selected_features': ', '.join(selected_features)
        }
        
        return pd.Series(summary)
    except Exception as e:
        logging.error(f"Error processing file {filename}: {str(e)}")
        return None

def calculate_rsi(prices, window=14):
    delta = prices.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    rs = gain / loss
    return 100 - (100 / (1 + rs))

def calculate_macd(prices, slow=26, fast=12, signal=9):
    ema_slow = prices.ewm(span=slow, adjust=False).mean()
    ema_fast = prices.ewm(span=fast, adjust=False).mean()
    macd = ema_fast - ema_slow
    signal_line = macd.ewm(span=signal, adjust=False).mean()
    return macd - signal_line

def main():
    # Get list of filenames data/Stocks/*.txt and data/ETFs/*.txt
    file_pattern = 'data/**/*.txt'
    filenames = glob.glob(file_pattern)
    print(f"Processing {len(filenames)} files")

    # Process files serially
    results = []
    start_time = time.time()
    for i, filename in enumerate(filenames):
        result = process_and_analyze_file(filename)
        if result is not None:
            results.append(result)

    # Create a DataFrame from the results
    df_results = pd.DataFrame(results)

    # Write results to CSV
    df_results.to_csv('export/stock_analysis_results.csv', index=False)

    print("Analysis complete. Results written to export/stock_analysis_results.csv")

if __name__ == "__main__":
    main()



Processing 8539 files


2024-10-09 08:49:12,674 - ERROR - Error processing file data/Stocks/cor_a-cl.us.txt: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.
2024-10-09 08:49:40,661 - ERROR - Error processing file data/Stocks/aieq.us.txt: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.
2024-10-09 08:50:47,144 - ERROR - Error processing file data/Stocks/gazb.us.txt: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.
2024-10-09 08:53:50,065 - ERROR - Error processing file data/Stocks/fat.us.txt: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.


KeyboardInterrupt: 

 Time Complexity Analysis (O-notation):
 - Reading CSV: O(n), where n is the number of rows in the CSV file
 - Feature engineering: O(n) for each feature
 - Train-test split: O(n)
 - StandardScaler: O(n)
 - Feature selection: O(n * k), where k is the number of features
 - Model training: 
   - RandomForest: O(n_trees * n_features * n_samples * log(n_samples))
   - ElasticNet: O(n_features * n_samples)
   - SVR: O(n_samples^2 * n_features)
   - XGBoost: O(n_trees * n_features * n_samples * log(n_samples))
 - Cross-validation: O(k * n), where k is the number of folds
 - Overall time complexity: O(m * (n * log(n))), where m is the number of files and n is the number of samples in each file
 Space Complexity:
 - O(n) for storing the DataFrame and feature arrays
 - O(n) for storing the model parameters
 - Overall space complexity: O(n), where n is the number of samples in the largest file

Took 178 Minutes without paralelization