In [14]:
# Ticker universe for the experiment, kept in the order the grid search visits it.
# NOTE(review): the list holds 86 symbols and has nothing alphabetically between
# MNST and PEP (e.g. MSFT, NVDA and NFLX are absent) - confirm the omission is
# intentional before treating the results as covering the full index.
nasdaq100_tickers = """
    AAPL ABNB ADBE ADI ADP ADSK AEP AMD AMAT AMGN
    AMZN APP ARM ASML AVGO AXON AZN BIIB BKNG BKR
    CCEP CDNS CDW CEG CHTR CMCSA COST CPRT CRWD CSCO
    CSGP CSX CTAS CTSH DASH DDOG DXCM EA EXC FANG
    FAST FTNT GEHC GFS GILD GOOG GOOGL HON IDXX INTC
    INTU ISRG KDP KHC KLAC LIN LRCX LULU MAR MCHP
    MDLZ MELI META MNST PEP PLTR PYPL QCOM REGN ROP
    ROST SBUX SHOP SNPS TEAM TMUS TSLA TTD TTWO TXN
    VRSK VRTX WBD WDAY XEL ZS
""".split()

# imports
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Next, I want to test Cohen's claims that:
- A degree 4 polynomial is best
- A 60 day window is best
- A band width of 2 is best

To find out whether these claims hold, I run a grid search over every combination of degree, window and band width in the ranges defined below.

In [15]:
# Parameter Grid Search
# run_full_analysis and calculate_metrics come from the project-local `utils` module.
from utils import run_full_analysis, calculate_metrics
import warnings
# NOTE(review): this suppresses every warning category for the rest of the kernel
# session, not just the ones raised by the analysis - presumably added to quiet
# noisy fitting warnings; confirm, and consider narrowing it to a specific category.
warnings.filterwarnings('ignore')

# Define parameter ranges
degrees = range(1, 10)          # polynomial degrees 1, 2, ..., 9
windows = range(20, 121, 10)    # rolling window lengths 20, 30, ..., 120

# Band widths 0.5, 0.6, ..., 4.0 (36 values). The grid is built from integers and
# divided once, so every entry is exactly the float its one-decimal label denotes.
# A float-step arange accumulates rounding error (values such as
# 1.9999999999999998), which makes exact look-ups like `bandwidth == 2.0` unreliable.
bandwidths = np.arange(5, 41) / 10

# Directory for stock data
data_dir = "nasdaq100_data"


In [16]:
# Run grid search and save all data (with resume capability)
#
# For every (degree, window, bandwidth) combination this cell either
#   * loads the cached summary.csv written by a previous run, or
#   * runs the strategy on every ticker that has a CSV in `data_dir`, then writes
#     all_metrics.csv (one row per ticker) and summary.csv (aggregate stats) into
#     Analysis_Data/degree_D/window_W/bandwidth_B/.
# The per-combination averages are collected in `results` / `results_df`.
results = []

# Create main output directory
output_dir = "Analysis_Data"
os.makedirs(output_dir, exist_ok=True)

# Read each ticker's CSV once. The price data does not depend on the grid
# parameters, so re-reading it inside the loops only repeats the same disk I/O
# for every one of the parameter combinations.
ticker_frames = {}
for ticker in nasdaq100_tickers:
    csv_path = os.path.join(data_dir, f"{ticker}.csv")
    if not os.path.exists(csv_path):
        continue
    try:
        ticker_frames[ticker] = pd.read_csv(csv_path, index_col=0, parse_dates=True)
    except Exception as e:
        print(f"Warning: Could not read {csv_path}: {e}")

total_iterations = len(degrees) * len(windows) * len(bandwidths)
current_iteration = 0
skipped_count = 0
calculated_count = 0

# Per-ticker failure bookkeeping so that errors are reported once at the end
# instead of being dropped silently (or flooding the output on every iteration).
ticker_failures = {}   # ticker -> number of parameter combinations that failed
last_errors = {}       # ticker -> repr of the most recent exception

for degree in degrees:
    for window in windows:
        for bandwidth in bandwidths:
            current_iteration += 1

            # Grid values produced with a float step can carry rounding error
            # (e.g. 1.9999999999999998). Snap to the 0.1 grid so the value that
            # is stored matches the directory label and exact look-ups later on.
            bandwidth = round(float(bandwidth), 1)

            # Create directory for this parameter combination
            param_dir = os.path.join(
                output_dir,
                f"degree_{degree}",
                f"window_{window}",
                f"bandwidth_{bandwidth:.1f}"
            )
            os.makedirs(param_dir, exist_ok=True)

            # A combination counts as cached only when both output files exist
            summary_path = os.path.join(param_dir, "summary.csv")
            all_metrics_path = os.path.join(param_dir, "all_metrics.csv")

            if os.path.exists(summary_path) and os.path.exists(all_metrics_path):
                try:
                    cached = pd.read_csv(summary_path).iloc[0]
                    # The parameters are taken from the loop (they define the
                    # directory); only the computed values come from the file.
                    results.append({
                        'degree': degree,
                        'window': window,
                        'bandwidth': bandwidth,
                        'avg_return': float(cached['avg_return']),
                        'num_stocks': int(cached['num_stocks'])
                    })
                    skipped_count += 1

                    if current_iteration % 100 == 0:
                        print(f"Progress: {current_iteration}/{total_iterations} ({100*current_iteration/total_iterations:.1f}%) - Loaded cached result")
                    continue
                except Exception as e:
                    # Corrupt or partially written cache: fall through and recompute
                    print(f"Warning: Could not load cached result for {param_dir} ({e}), recalculating...")

            # Calculate this combination (not yet cached)
            ticker_returns = []
            all_ticker_metrics = []

            for ticker, df in ticker_frames.items():
                try:
                    # Pass a copy so the shared, pre-loaded frame stays pristine
                    # even if the analysis adds columns in place.
                    df_analyzed = run_full_analysis(df.copy(), degree=degree, window=window, k=bandwidth)

                    metrics = calculate_metrics(df_analyzed)
                    metrics['ticker'] = ticker  # Add ticker to metrics
                    strategy_return = metrics['strategy_return_pct']
                except Exception as e:
                    # Best effort: skip this ticker for this combination, but remember why
                    ticker_failures[ticker] = ticker_failures.get(ticker, 0) + 1
                    last_errors[ticker] = repr(e)
                    continue

                # Append only after every step succeeded so both lists stay aligned
                all_ticker_metrics.append(metrics)
                ticker_returns.append(strategy_return)

            # Calculate average return across all tickers
            if ticker_returns:
                avg_return = np.mean(ticker_returns)

                results.append({
                    'degree': degree,
                    'window': window,
                    'bandwidth': bandwidth,
                    'avg_return': avg_return,
                    'num_stocks': len(ticker_returns)
                })

                # Save all metrics in one CSV
                pd.DataFrame(all_ticker_metrics).to_csv(all_metrics_path, index=False)

                # Save summary last: its presence marks the combination as complete
                pd.DataFrame([{
                    'degree': degree,
                    'window': window,
                    'bandwidth': bandwidth,
                    'avg_return': avg_return,
                    'num_stocks': len(ticker_returns),
                    'min_return': min(ticker_returns),
                    'max_return': max(ticker_returns),
                    'median_return': np.median(ticker_returns),
                    'std_return': np.std(ticker_returns)
                }]).to_csv(summary_path, index=False)

                calculated_count += 1

            if current_iteration % 10 == 0 and calculated_count > 0:
                print(f"Progress: {current_iteration}/{total_iterations} ({100*current_iteration/total_iterations:.1f}%) - Calculated: {calculated_count}, Cached: {skipped_count}")

# Convert to DataFrame
results_df = pd.DataFrame(results)
print(f"\n{'='*80}")
print(f"EXPERIMENT COMPLETE")
print(f"{'='*80}")
print(f"Total combinations: {len(results_df)}")
print(f"  Newly calculated: {calculated_count}")
print(f"  Loaded from cache: {skipped_count}")

if ticker_failures:
    print(f"\nTickers skipped because of errors ({len(ticker_failures)}):")
    for ticker, count in sorted(ticker_failures.items()):
        print(f"  {ticker}: failed in {count} combination(s), last error: {last_errors[ticker]}")

if results_df.empty:
    # Without this guard the column selection below raises KeyError on an empty frame
    print(f"\nNo results were produced - check that ticker CSV files exist in '{data_dir}'.")
else:
    print(f"\nBest 10 parameter combinations by average return:")
    print(results_df.nlargest(10, 'avg_return')[['degree', 'window', 'bandwidth', 'avg_return']])

Progress: 10/3564 (0.3%) - Calculated: 10, Cached: 0
Progress: 20/3564 (0.6%) - Calculated: 20, Cached: 0
Progress: 30/3564 (0.8%) - Calculated: 30, Cached: 0
Progress: 40/3564 (1.1%) - Calculated: 40, Cached: 0
Progress: 50/3564 (1.4%) - Calculated: 50, Cached: 0
Progress: 60/3564 (1.7%) - Calculated: 60, Cached: 0
Progress: 70/3564 (2.0%) - Calculated: 70, Cached: 0
Progress: 80/3564 (2.2%) - Calculated: 80, Cached: 0
Progress: 90/3564 (2.5%) - Calculated: 90, Cached: 0
Progress: 100/3564 (2.8%) - Calculated: 100, Cached: 0
Progress: 110/3564 (3.1%) - Calculated: 110, Cached: 0
Progress: 120/3564 (3.4%) - Calculated: 120, Cached: 0
Progress: 130/3564 (3.6%) - Calculated: 130, Cached: 0
Progress: 140/3564 (3.9%) - Calculated: 140, Cached: 0
Progress: 150/3564 (4.2%) - Calculated: 150, Cached: 0
Progress: 160/3564 (4.5%) - Calculated: 160, Cached: 0
Progress: 170/3564 (4.8%) - Calculated: 170, Cached: 0
Progress: 180/3564 (5.1%) - Calculated: 180, Cached: 0
Progress: 190/3564 (5.3%) - 

KeyboardInterrupt: 

In [None]:
# Create 3D surface plots for each degree
#
# One subplot per polynomial degree: the measured (bandwidth, window, avg_return)
# points are drawn as red markers and, where enough points exist, a cubic
# interpolated surface is drawn through them. The figure is saved as a PNG.
from scipy.interpolate import griddata

# Create output directory if it doesn't exist
output_dir = "Analysis_Data"
os.makedirs(output_dir, exist_ok=True)

# Determine grid layout
n_degrees = len(list(degrees))
n_cols = 3
n_rows = (n_degrees + n_cols - 1) // n_cols

fig = plt.figure(figsize=(20, 5 * n_rows))

for idx, degree in enumerate(degrees, 1):
    # Filter results for this degree
    degree_data = results_df[results_df['degree'] == degree]

    if len(degree_data) == 0:
        continue

    # Extract data
    x = degree_data['bandwidth'].values
    y = degree_data['window'].values
    z = degree_data['avg_return'].values

    # Create 3D subplot
    ax = fig.add_subplot(n_rows, n_cols, idx, projection='3d')

    # Interpolate a surface only when the points span both axes. With a single
    # window or bandwidth (e.g. after an interrupted run) the points are
    # collinear and the triangulation behind griddata cannot be built.
    zi = None
    if np.unique(x).size >= 2 and np.unique(y).size >= 2:
        xi, yi = np.meshgrid(
            np.linspace(x.min(), x.max(), 50),
            np.linspace(y.min(), y.max(), 50),
        )
        try:
            # rescale=True maps both axes to the unit square before triangulating;
            # bandwidth and window have different units and very different ranges.
            zi = griddata((x, y), z, (xi, yi), method='cubic', rescale=True)
        except (RuntimeError, ValueError) as e:
            print(f"Degree {degree}: could not interpolate surface ({e}); showing data points only")
    else:
        print(f"Degree {degree}: not enough distinct window/bandwidth values for a surface; showing data points only")

    if zi is not None:
        # Create surface plot
        surf = ax.plot_surface(xi, yi, zi, cmap='viridis', alpha=0.8, edgecolor='none')
        # Add colorbar
        fig.colorbar(surf, ax=ax, shrink=0.5, aspect=5)

    # Also plot the actual data points
    ax.scatter(x, y, z, c='red', marker='o', s=20, alpha=0.6)

    # Labels and title
    ax.set_xlabel('Bandwidth (k)', fontsize=10)
    ax.set_ylabel('Window (days)', fontsize=10)
    ax.set_zlabel('Avg Return (%)', fontsize=10)
    ax.set_title(f'Degree {degree}', fontsize=12, fontweight='bold')

    # Find best parameters for this degree
    best_idx = degree_data['avg_return'].idxmax()
    best_params = degree_data.loc[best_idx]

    # Annotate best point
    ax.text2D(0.05, 0.95,
              f"Best: W={best_params['window']:.0f}, K={best_params['bandwidth']:.1f}\nReturn={best_params['avg_return']:.2f}%",
              transform=ax.transAxes, fontsize=9, verticalalignment='top',
              bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

plt.tight_layout()
output_path = os.path.join(output_dir, 'parameter_optimization_3d.png')
plt.savefig(output_path, dpi=150, bbox_inches='tight')
plt.show()

print(f"3D plots saved as '{output_path}'")

In [None]:
# Summary Analysis and Comparison to Cohen's Parameters
#
# Prints the best overall combination, where Cohen's (degree=4, window=60,
# bandwidth=2) combination ranks, the best combination per degree, and then
# draws a window x bandwidth heatmap of average return for degree 4.
print("="*80)
print("PARAMETER OPTIMIZATION SUMMARY")
print("="*80)

# Overall best parameters
best_overall = results_df.loc[results_df['avg_return'].idxmax()]
print(f"\nOverall Best Parameters:")
print(f"  Degree: {best_overall['degree']:.0f}")
print(f"  Window: {best_overall['window']:.0f} days")
print(f"  Bandwidth: {best_overall['bandwidth']:.1f}")
print(f"  Average Return: {best_overall['avg_return']:.2f}%")
print(f"  Tested on {best_overall['num_stocks']:.0f} stocks")

# Cohen's recommended parameters. The bandwidth is matched with a tolerance:
# grid values built with a float step (or read back from CSV) may be stored as
# e.g. 1.9999999999999998, which an exact `== 2.0` would silently miss.
cohen_params = results_df[
    (results_df['degree'] == 4) &
    (results_df['window'] == 60) &
    np.isclose(results_df['bandwidth'], 2.0)
]

print(f"\nCohen's Recommended Parameters (Degree=4, Window=60, Bandwidth=2):")
if len(cohen_params) > 0:
    cohen_return = cohen_params.iloc[0]['avg_return']
    print(f"  Average Return: {cohen_return:.2f}%")
    print(f"  Rank: {(results_df['avg_return'] > cohen_return).sum() + 1} out of {len(results_df)}")
    percentile = 100 * (results_df['avg_return'] <= cohen_return).sum() / len(results_df)
    print(f"  Percentile: {percentile:.1f}%")
else:
    print("  Not tested in this grid search")

# Best parameters by degree
print(f"\nBest Parameters by Degree:")
print("-" * 80)
for degree in degrees:
    degree_data = results_df[results_df['degree'] == degree]
    if len(degree_data) > 0:
        best = degree_data.loc[degree_data['avg_return'].idxmax()]
        print(f"  Degree {degree}: Window={best['window']:.0f}, Bandwidth={best['bandwidth']:.1f}, Return={best['avg_return']:.2f}%")

# Create a heatmap for degree=4 (Cohen's degree)
print("\n" + "="*80)
print("Creating heatmap for Degree 4 (Cohen's recommended degree)...")

degree_4_data = results_df[results_df['degree'] == 4].copy()

if degree_4_data.empty:
    # Nothing to pivot or draw (e.g. the grid search was interrupted before degree 4)
    print("No results for degree 4 yet - skipping heatmap")
else:
    # Snap bandwidths to the 0.1 grid so the pivot columns and tick labels are clean
    degree_4_data['bandwidth'] = degree_4_data['bandwidth'].round(1)

    # Pivot the data for heatmap (rows: window, columns: bandwidth)
    heatmap_data = degree_4_data.pivot(index='window', columns='bandwidth', values='avg_return')

    plt.figure(figsize=(12, 8))
    im = plt.imshow(heatmap_data.values, cmap='RdYlGn', aspect='auto')

    # Set ticks and labels
    plt.xticks(range(len(heatmap_data.columns)), [f"{b:.1f}" for b in heatmap_data.columns])
    plt.yticks(range(len(heatmap_data.index)), [f"{w:.0f}" for w in heatmap_data.index])
    plt.xlabel('Bandwidth (k)', fontsize=12)
    plt.ylabel('Window (days)', fontsize=12)
    plt.title('Average Return Heatmap for Degree 4 Polynomial', fontsize=14, fontweight='bold')

    # Add colorbar
    cbar = plt.colorbar(im)
    cbar.set_label('Average Return (%)', rotation=270, labelpad=20)

    # Text colour switches at the mean of the non-missing cells; compute it once
    # rather than once per cell.
    mean_value = np.nanmean(heatmap_data.values)

    # Add text annotations
    for i in range(len(heatmap_data.index)):
        for j in range(len(heatmap_data.columns)):
            value = heatmap_data.iloc[i, j]
            if not np.isnan(value):
                color = 'white' if value < mean_value else 'black'
                plt.text(j, i, f'{value:.1f}', ha='center', va='center', color=color, fontsize=8)

    # Highlight Cohen's parameters (window=60, bandwidth=2), again matching the
    # bandwidth with a tolerance instead of exact float membership.
    window_hits = np.flatnonzero(heatmap_data.index.to_numpy() == 60)
    bandwidth_hits = np.flatnonzero(np.isclose(heatmap_data.columns.to_numpy(dtype=float), 2.0))
    if window_hits.size and bandwidth_hits.size:
        cohen_i = window_hits[0]
        cohen_j = bandwidth_hits[0]
        plt.plot(cohen_j, cohen_i, 'b*', markersize=20, markeredgewidth=2, markeredgecolor='blue',
                 markerfacecolor='none', label="Cohen's params")
        plt.legend()

    plt.tight_layout()
    heatmap_path = os.path.join(output_dir, 'degree_4_heatmap.png')
    plt.savefig(heatmap_path, dpi=150, bbox_inches='tight')
    plt.show()

    print(f"Heatmap saved as '{heatmap_path}'")

In [None]:
# Persist the combined grid-search table, then report how the per-combination
# average returns are distributed.
csv_path = os.path.join(output_dir, 'parameter_optimization_results.csv')
results_df.to_csv(csv_path, index=False)
print(f"Results saved to '{csv_path}'")

# Display summary statistics
banner = "="*80
print("\n" + banner)
print("SUMMARY STATISTICS")
print(banner)
print(f"\nTotal parameter combinations tested: {len(results_df)}")
print(f"\nAverage Return Statistics:")

avg_returns = results_df['avg_return']
return_stats = (
    ("Mean", avg_returns.mean()),
    ("Median", avg_returns.median()),
    ("Std Dev", avg_returns.std()),
    ("Min", avg_returns.min()),
    ("Max", avg_returns.max()),
)
for stat_label, stat_value in return_stats:
    print(f"  {stat_label}: {stat_value:.2f}%")

In [None]:
# Create a comprehensive index file
#
# Walks every parameter directory, collects the one-row summary.csv of each
# completed combination into master_index.csv, and prints an overview of what
# is on disk. The grid search writes exactly two files per combination
# (all_metrics.csv and summary.csv), so the counts and the tree below describe
# those files.
print("\nCreating index file...")

index_data = []
data_file_count = 0

for degree in degrees:
    for window in windows:
        for bandwidth in bandwidths:
            param_dir = os.path.join(
                output_dir,
                f"degree_{degree}",
                f"window_{window}",
                f"bandwidth_{bandwidth:.1f}"
            )

            summary_path = os.path.join(param_dir, "summary.csv")
            if not os.path.exists(summary_path):
                continue

            try:
                summary_dict = pd.read_csv(summary_path).iloc[0].to_dict()
            except Exception as e:
                # A corrupt or empty summary should not abort the whole index
                print(f"Warning: Could not read {summary_path}: {e}")
                continue

            summary_dict['directory'] = param_dir
            index_data.append(summary_dict)

            # Count the files that are really present for this combination
            data_file_count += 1
            if os.path.exists(os.path.join(param_dir, "all_metrics.csv")):
                data_file_count += 1

index_df = pd.DataFrame(index_data)
index_path = os.path.join(output_dir, 'master_index.csv')
index_df.to_csv(index_path, index=False)

print(f"Master index saved to '{index_path}'")
print(f"\nTotal data files created: {data_file_count}")  # all_metrics.csv + summary.csv per combination
print(f"Total directories created: {len(index_df)}")
print(f"Total summary files: {len(index_df)}")
print(f"\nAll analysis data is organized in '{output_dir}/' directory structure:")
print(f"  {output_dir}/")
print("    ├── degree_X/")
print("    │   ├── window_Y/")
print("    │   │   ├── bandwidth_Z/")
print("    │   │   │   ├── all_metrics.csv (per-ticker metrics for this parameter combo)")
print("    │   │   │   └── summary.csv (aggregated stats for this parameter combo)")
print("    ├── master_index.csv (searchable index of all combinations)")
print("    ├── parameter_optimization_results.csv")
print("    ├── parameter_optimization_3d.png")
print("    └── degree_4_heatmap.png")

## How to Access Saved Data

All analysis results are now saved in the `Analysis_Data/` directory. You can access any specific parameter combination without re-running the entire experiment.

### Example: Load results for specific parameters

```python
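# Example: Load results for AAPL with degree=4, window=60, bandwidth=2.0
degree, window, bandwidth = 4, 60, 2.0
ticker = "AAPL"

# Construct path
param_dir = os.path.join("Analysis_Data", f"degree_{degree}", f"window_{window}", f"bandwidth_{bandwidth:.1f}")

# Load metrics (all_metrics.csv holds one row per ticker for this parameter combination)
all_metrics = pd.read_csv(os.path.join(param_dir, "all_metrics.csv"))
aapl_metrics = all_metrics[all_metrics["ticker"] == ticker]

# Load summary for this parameter combination
combo_summary = pd.read_csv(os.path.join(param_dir, "summary.csv"))

# The full per-day analysis (bands, signals, positions) is not written to disk by the
# grid search; recompute it on demand from the raw price data
aapl_prices = pd.read_csv(os.path.join("nasdaq100_data", f"{ticker}.csv"), index_col=0, parse_dates=True)
aapl_analysis = run_full_analysis(aapl_prices, degree=degree, window=window, k=bandwidth)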
```

### Use the master index to find best combinations

```python
# Load master index
index = pd.read_csv("Analysis_Data/master_index.csv")

# Find best parameter combinations
top_10 = index.nlargest(10, 'avg_return')

# Find all combinations with degree=4
degree_4_results = index[index['degree'] == 4]
```