In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import os

def analyze_stock_feature_importance(file_path: str, stock_ticker: str) -> "pd.DataFrame | None":
    """
    Rank the features of a stock dataset by random-forest importance.

    Loads the CSV at ``file_path``, fits a ``RandomForestRegressor`` that
    predicts the 'Close' column from a fixed list of candidate features
    ('High' and 'Low' are deliberately not among them), prints the ranked
    importances and saves a bar plot of them to
    ``<stock_ticker>_feature_importance_plot_no_high_low.png`` in the
    current working directory.

    Rows whose 'Close' value is missing are dropped (the target is never
    imputed); missing feature values are replaced by the column mean.

    Args:
        file_path (str): The path to the CSV file for the stock. The file
            must contain a 'date' column and a 'Close' column.
        stock_ticker (str): The ticker symbol of the stock (e.g., 'TSLA').
            Used only in printed messages, the plot title and the plot
            file name.

    Returns:
        pandas.DataFrame with columns 'Feature' and 'Importance', sorted by
        descending importance, or None when the input cannot be analyzed
        (file missing or empty, 'date' or 'Close' column absent, no usable
        target rows, or none of the candidate features present). In every
        None case an error message is printed first.
    """
    print(f"\n--- Analyzing Feature Importance for {stock_ticker} (Excluding High/Low) ---")

    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header to parse; report it like any
        # other unusable input instead of letting the exception escape.
        print(f"Error: No data found in {file_path}")
        return None

    # The date column becomes the index; without it the frame cannot be
    # indexed, so fail with a message rather than a KeyError.
    if 'date' not in df.columns:
        print(f"Error: Required column 'date' not found in {file_path}")
        return None
    df['date'] = pd.to_datetime(df['date'])
    df = df.set_index('date')

    # Define the target variable
    target_variable = 'Close'
    if target_variable not in df.columns:
        print(f"Error: Target variable '{target_variable}' not found in {file_path}")
        return None

    # Rows without a target carry no supervision signal. Imputing them
    # would invent labels, so they are removed before anything else.
    df = df.dropna(subset=[target_variable])
    if df.empty:
        print(f"Error: No rows with a valid '{target_variable}' value in {file_path}")
        return None
    y = df[target_variable]

    # Candidate predictors. 'Close' (the target), 'High' and 'Low' are
    # intentionally left out; 'price_change' is kept as a predictor.
    features_to_use = [
        'Volume', 'price_change_pct', 'price_change',
        'MA_5', 'MA_10', 'MA_20', 'MA_50', 'MA_200',
        'RSI', 'returns',
        'volatility_5d', 'volatility_10d', 'volatility_20d',
        'momentum_5d', 'momentum_10d', 'momentum_20d',
        'volume_change', 'volume_ma_5',
        'eps', 'revenue', 'netIncomeRatio', 'PE_ratio',
        'days_since_financial_update', 'market_cap', 'price_to_sales',
    ]

    # Tolerate datasets that lack some candidates: warn, then continue
    # with whatever subset is actually present.
    missing_features = [f for f in features_to_use if f not in df.columns]
    if missing_features:
        print(f"Warning: The following features are missing from {file_path}: {missing_features}")
        features_to_use = [f for f in features_to_use if f in df.columns]
        if not features_to_use:
            print("Error: No valid features to use after removing missing ones. Exiting.")
            return None

    X = df[features_to_use]

    # Handle missing feature values (mean imputation per column)
    X = X.fillna(X.mean())

    # Scale numerical features; wrap the result back into a DataFrame so
    # column names and the date index survive the transform.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

    # 80/20 split with a fixed seed. Only the training portion is used:
    # the model is fitted on it and the held-out part is never evaluated.
    # NOTE(review): train_test_split shuffles rows by default, so this is
    # not a chronological hold-out - revisit before adding any evaluation.
    X_train, _, y_train, _ = train_test_split(
        X_scaled_df, y, test_size=0.2, random_state=42
    )

    # Train a RandomForestRegressor model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Pair every feature name with its importance, highest first
    feature_importance_df = pd.DataFrame(
        {'Feature': X.columns, 'Importance': model.feature_importances_}
    )
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

    # Display the feature importances as a list
    print(f"\nFeature Importances for {stock_ticker} (Target: {target_variable}, Excluding High/Low):")
    print(feature_importance_df)

    # Plotting the feature importances
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=feature_importance_df)
    plt.title(f'Feature Importance for {stock_ticker} (Target: {target_variable}, Excluding High/Low)')
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.tight_layout()

    # Save the plot image file
    output_filename = f'{stock_ticker}_feature_importance_plot_no_high_low.png'
    plt.savefig(output_filename)
    print(f"\nFeature importance plot saved as '{output_filename}'")
    plt.close()  # Close the plot to free memory

    return feature_importance_df

# --- Example Usage ---
if __name__ == "__main__":
    # The TSLA dataset is looked up relative to the current working directory.
    tsla_file = 'TSLA_merged_dataset_TSLA.csv'
    analyze_stock_feature_importance(file_path=tsla_file, stock_ticker='TSLA')

    # Template for running the same analysis on further tickers. Enable it
    # and adjust the mapping; every CSV must either live in the working
    # directory or be given by its full path.

    # other_stock_files = {
    #     'AAPL': 'AAPL_merged_dataset_AAPL.csv',
    #     'MSFT': 'MSFT_merged_dataset_MSFT.csv',
    #     'GOOG': 'GOOG_merged_dataset_GOOG.csv',
    #     # Add paths for your other 7 stock datasets here
    # }

    # for ticker, file_path in other_stock_files.items():
    #     analyze_stock_feature_importance(file_path, ticker)

    print("\nScript finished.")



--- Analyzing Feature Importance for TSLA (Excluding High/Low) ---

Feature Importances for TSLA (Target: Close, Excluding High/Low):
                        Feature  Importance
23                   market_cap    0.955180
3                          MA_5    0.032001
24               price_to_sales    0.006159
21                     PE_ratio    0.004469
15                 momentum_20d    0.000353
7                        MA_200    0.000183
17                  volume_ma_5    0.000164
0                        Volume    0.000135
22  days_since_financial_update    0.000134
11               volatility_10d    0.000132
8                           RSI    0.000122
16                volume_change    0.000122
5                         MA_20    0.000091
4                         MA_10    0.000078
19                      revenue    0.000077
14                 momentum_10d    0.000076
12               volatility_20d    0.000073
1              price_change_pct    0.000068
10                volatility_