In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import warnings

# Make the project's helper modules (imported by later cells) importable.
# The membership check keeps this cell idempotent: re-running it no longer
# appends a duplicate '../src' entry to sys.path each time.
SRC_DIR = '../src'
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# Ignore warnings for cleaner output.
# NOTE(review): this silences *every* warning category, including
# deprecation and numerical warnings raised by the libraries used below.
# Comment it out when investigating unexpected results.
warnings.filterwarnings('ignore')

# Set plotting style: seaborn grid theme plus a default figure size that
# applies to every figure that does not pass its own figsize.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

print("Libraries imported successfully!")

## 1. Load and Explore Raw Data

In [None]:
# Read the two raw CSV inputs: the feature table and the stock price series.
data_df = pd.read_csv('../data/data.csv')
stock_price_df = pd.read_csv('../data/stock_price.csv')

# Visual separator printed between the two dataset summaries.
section_break = "\n" + "="*60

# Feature table: dimensions first, then the leading rows.
data_preview = data_df.head()
print("Data Dataset Shape:", data_df.shape)
print("\nData Dataset Head:")
print(data_preview)

print(section_break)

# Stock price table: same two-part summary.
price_preview = stock_price_df.head()
print("Stock Price Dataset Shape:", stock_price_df.shape)
print("\nStock Price Dataset Head:")
print(price_preview)

## 2. Data Statistics and Information

In [None]:
# Convert Date columns to datetime so they behave as timestamps rather than
# plain strings in the plots and statistics that follow.
data_df['Date'] = pd.to_datetime(data_df['Date'])
stock_price_df['Date'] = pd.to_datetime(stock_price_df['Date'])

print("Data Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must be called directly; wrapping it in print() emitted a stray "None" line.
data_df.info()

print("\n" + "="*60)
print("\nData Dataset Statistics:")
print(data_df.describe())

print("\n" + "="*60)
print("\nStock Price Statistics:")
print(stock_price_df.describe())

## 3. Visualize Raw Data Trends

In [None]:
# Two stacked panels: the stock price on top, a few input features below.
fig, axes = plt.subplots(2, 1, figsize=(14, 8))

# Stock price trend
axes[0].plot(stock_price_df['Date'], stock_price_df['Stock_Price'], 
             marker='o', markersize=3, linewidth=1.5, color='darkblue')
axes[0].set_title('Stock Price Over Time', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Date')
axes[0].set_ylabel('Stock Price ($)')
axes[0].grid(True, alpha=0.3)
axes[0].tick_params(axis='x', rotation=45)

# Select some key features to visualize
key_features = ['Volume', 'Market_Sentiment', 'Volatility_Index']
for feature in key_features:
    # Min-max scale each series to [0, 1] before plotting. The panel is
    # labelled 'Normalized Value', and without scaling a large-magnitude
    # column would flatten the others on the shared y-axis.
    series = data_df[feature]
    low = series.min()
    span = series.max() - low
    # A constant series has zero range; draw it as a flat line at 0 rather
    # than dividing by zero.
    scaled = (series - low) / span if span != 0 else series - low
    axes[1].plot(data_df['Date'], scaled, 
                marker='o', markersize=2, linewidth=1, label=feature, alpha=0.7)

axes[1].set_title('Selected Features Over Time', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Date')
axes[1].set_ylabel('Normalized Value')
axes[1].legend()
axes[1].grid(True, alpha=0.3)
axes[1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 4. Run Preprocessing Pipeline

In [None]:
from preprocess import preprocess_pipeline

# Hand both raw CSV paths to the project's preprocessing pipeline; the
# returned frame is what the rest of the notebook treats as the merged data.
merged_df = preprocess_pipeline(
    '../data/data.csv',
    '../data/stock_price.csv',
)

# Show the first ten rows, then the overall dimensions.
merged_preview = merged_df.head(10)
print("\nMerged Dataset:")
print(merged_preview)
print(f"\nMerged dataset shape: {merged_df.shape}")

## 5. Feature Engineering - Compute Day-over-Day Changes

In [None]:
from feature_engineering import feature_engineering_pipeline

# The pipeline returns four objects: the full frame, the feature matrix X,
# the target y, and the list of feature column names.
df_full, X, y, feature_names = feature_engineering_pipeline(merged_df)

# Visual separator reused between the printed sections.
divider = "\n" + "="*60

# Leading rows of the feature matrix.
print("Feature Matrix (X) - Day-over-Day Changes:")
print(X.head())

# Leading rows of the target.
print(divider)
print("\nTarget Vector (y) - Next Day Stock Price:")
print(y.head())

# Size summary followed by a numbered (1-based) list of the feature names.
print(divider)
print(f"\nTotal samples: {len(X)}")
print(f"Number of features: {len(feature_names)}")
print(f"\nFeature names:")
for i, name in enumerate(feature_names, start=1):
    print(f"  {i}. {name}")

## 6. Visualize Feature Changes Distribution

In [None]:
# Plot distributions of feature changes on a 2 x 3 grid (at most six features).
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
axes = axes.flatten()

plotted_features = feature_names[:6]
for ax, feature in zip(axes, plotted_features):
    # Drop missing values first: a NaN in the column makes the histogram's
    # automatic bin-range detection fail.
    ax.hist(X[feature].dropna(), bins=20, edgecolor='black', alpha=0.7, color='steelblue')
    ax.set_title(feature, fontsize=10, fontweight='bold')
    ax.set_xlabel('Change Value')
    ax.set_ylabel('Frequency')
    ax.grid(True, alpha=0.3)

# With fewer than six features some panels receive no data; hide them so the
# figure does not show empty axes.
for ax in axes[len(plotted_features):]:
    ax.set_visible(False)

plt.tight_layout()
# Figure-level title, placed slightly above the grid (y > 1) so it does not
# overlap the top row's subplot titles.
fig.suptitle('Distribution of Day-over-Day Feature Changes', 
             fontsize=14, fontweight='bold', y=1.02)
plt.show()

## 7. Correlation Analysis

In [None]:
# Put the features and the target in one frame so a single corr() call
# yields every pairwise correlation, including feature-vs-target.
correlation_df = X.assign(next_day_price=y)
correlation_matrix = correlation_df.corr()

# Annotated heatmap of the full matrix, colour scale centred on zero.
plt.figure(figsize=(12, 10))
heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
)
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix: Feature Changes vs Next Day Stock Price', 
          fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Target column only, minus its self-correlation, strongest positive first.
print("\nCorrelations with Next Day Stock Price:")
target_column = correlation_matrix['next_day_price']
target_corr = target_column.drop('next_day_price').sort_values(ascending=False)
print(target_corr)

## 8. Train Models

In [None]:
from train import training_pipeline

# Fit the models on the engineered features. test_size=0.2 is forwarded to
# the pipeline (presumably the held-out fraction; confirm in train.py).
results = training_pipeline(X, y, feature_names, test_size=0.2)

# Completion banner framed by two separator lines.
rule = "="*60
print("\n" + rule)
print("Models trained successfully!")
print(rule)

## 9. Evaluate Models

In [None]:
# Project-local helper; presumably resolved via the '../src' entry added to
# sys.path in the setup cell (verify the module location).
from evaluate import evaluation_pipeline

# Evaluate models.
# `results` is the object returned by training_pipeline in the previous
# section. Because this call is the cell's last expression, any non-None
# return value would also be displayed as the cell output.
evaluation_pipeline(results)

## 10. Visualize Predictions on Test Set

In [None]:
# Actual targets for the test rows, plus each fitted model's predictions.
y_test = results['y_test']
lr_pred = results['lr_model'].predict(results['X_test'])
rf_pred = results['rf_model'].predict(results['X_test'])

# One panel per model, side by side.
fig, axes = plt.subplots(1, 2, figsize=(16, 5))

# (axes, panel title, predictions) for each model; both panels are drawn by
# the same loop body so their styling cannot drift apart.
panels = [
    (axes[0], 'Linear Regression: Actual vs Predicted', lr_pred),
    (axes[1], 'Random Forest: Actual vs Predicted', rf_pred),
]
line_style = dict(markersize=5, linewidth=2, alpha=0.7)

for ax, panel_title, predicted in panels:
    # Actual values ('o' markers) against predictions ('x' markers), both
    # plotted against the positional index of the test sample.
    ax.plot(range(len(y_test)), y_test.values,
            label='Actual', marker='o', **line_style)
    ax.plot(range(len(predicted)), predicted,
            label='Predicted', marker='x', **line_style)
    ax.set_title(panel_title, fontsize=12, fontweight='bold')
    ax.set_xlabel('Test Sample Index')
    ax.set_ylabel('Stock Price ($)')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 11. Key Insights and Interpretation

### Linear Regression Coefficients
The coefficients tell us how a change in each feature relates to the next day's predicted stock price:

In [None]:
# Display top coefficients
coef_df = results['coef_df']

# Rank by absolute coefficient explicitly rather than relying on whatever row
# order the training step happened to produce. "Most influential" means
# largest magnitude, regardless of sign.
top_coefficients = coef_df.sort_values(
    'Coefficient', key=lambda col: col.abs(), ascending=False
).head(5)

print("Top 5 Most Influential Features (Linear Regression):")
print(top_coefficients[['Feature', 'Coefficient']])

print("\n" + "="*60)
print("\nInterpretation:")
print("- Positive coefficient → Feature increase leads to price increase")
print("- Negative coefficient → Feature increase leads to price decrease")
print("- Larger magnitude → Stronger influence")

### Random Forest Feature Importances
Feature importance shows which day-over-day changes are most predictive:

In [None]:
# Display top feature importances
importance_df = results['importance_df']

# Select the five largest importances explicitly instead of assuming the
# frame already arrives sorted in descending order.
top_importances = importance_df.nlargest(5, 'Importance')

print("Top 5 Most Important Features (Random Forest):")
print(top_importances[['Feature', 'Importance']])

print("\n" + "="*60)
print("\nInterpretation:")
print("- Higher importance → Feature contributes more to prediction accuracy")
print("- Random Forest captures non-linear relationships")

## 12. Conclusions

### Model Performance Summary:
1. **Linear Regression**: Provides interpretable coefficients showing direct relationships
2. **Random Forest**: Captures complex non-linear patterns in the data

### Key Findings:
- Day-over-day changes in market features are used to predict next-day stock prices; the evaluation metrics in Section 9 show how well they do so
- The temporal alignment (features at day t → price at day t+1) is appropriate
- The data is not shuffled, which preserves time-series integrity

### Recommendations:
- Use Random Forest if it achieves better accuracy on the test set
- Use Linear Regression for interpretability and understanding relationships
- Monitor prediction errors and retrain periodically with new data