# Feature engineering, rules-based models and ensembles 

In [24]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

### Import of clean data created in the first notebook

In [25]:
data = pd.read_csv('data/clean_data.csv')
data = data.drop(data.columns[0], axis=1)

In [23]:
def cross_validate_models(data):
    labels = data['Target']
    cvm_data = data.drop(['Target', 'Date', 'Symbol', 'Id'], axis=1)
    models = {
        'GNB': GaussianNB(),
        'LR': LogisticRegression(),
        'RFC': RandomForestClassifier(),
        'ETC': ExtraTreesClassifier(),
        'XGB': XGBClassifier()
    }

    folds = 5
    kf = KFold(n_splits=folds)
    
    print("Algorithm\t\t", "\t".join([f"Fold {i+1}" for i in range(folds)]), "\tAverage")
    for model_name, model in models.items():
        scores = []
        for train_index, test_index in kf.split(cvm_data):
            X_train, X_test = cvm_data.iloc[train_index], cvm_data.iloc[test_index]
            y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            score = f1_score(y_test, y_pred)
            scores.append(score)
    
        avg_score = np.mean(scores)
        print(f"{model_name}\t", "\t".join([f"{score:.2f}" for score in scores]), f"\t{avg_score:.2f}")
        
#cross_validate_models(data)

## Feature creation

### History related features

1. **Lagged Returns (Lagged_Return_1, Lagged_Return_5):**
   - These features capture the previous day's (Lagged_Return_1) and the return of the stock price five days ago (Lagged_Return_5).
   - `shift(1)` and `shift(5)` shift the 'Close' price downward by one and five rows, respectively, effectively capturing the past returns.

2. **Rolling Statistics (Rolling_Mean_5, Rolling_Std_5, Rolling_Min_5, Rolling_Max_5):**
   - These features calculate rolling statistics over a 5-day window for each stock symbol.
   - `x.rolling(window=5).mean()`, `x.rolling(window=5).std()`, `x.rolling(window=5).min()`, and `x.rolling(window=5).max()` compute the rolling mean, standard deviation, minimum, and maximum of the 'Close' price, respectively.

3. **Price Oscillations (Price_Oscillation):**
   - This feature computes the price oscillation, which is the ratio of the difference between the high and low prices to the closing price.
   - It provides insights into intraday price volatility and trading range.

4. **Volume Accumulation (Volume_Accumulation):**
   - This feature calculates the cumulative volume traded for each stock symbol over time.
   - `x.expanding().sum()` computes the expanding sum of the 'Volume' column, capturing the total volume accumulated until each point in time.

5. **Price to Moving Average Ratio (Price_to_SMA_20_Ratio):**
   - This feature calculates the ratio of the closing price to the 20-day Simple Moving Average (SMA).
   - It indicates whether the current price is above or below the average price trend over the specified period.

6. **Price Rate of Change (ROC):**
   - This feature computes the rate of change of the closing price over a 5-day period.
   - `x.pct_change(periods=5)` calculates the percentage change in the closing price relative to its value five days ago.

7. **Volatility Measures (Volatility_10):**
   - This feature computes the volatility of the stock price over a 10-day window.
   - It measures the standard deviation of daily percentage changes in the closing price.

8. **Price Crossings (Price_Above_SMA_50):**
   - This binary feature indicates whether the closing price is above the 50-day Simple Moving Average (SMA_50).
   - It can signal potential trend changes or breakout points.

9. **Relative Price Strength (Relative_Price_Strength):**
   - This feature calculates the ratio of the stock price to the average closing price of the market index on each date.
   - It measures the stock's relative performance compared to the broader market.

10. **Trading Volume Momentum (Volume_Momentum):**
   - This feature computes the momentum of trading volume over a 5-day period.
   - It measures the percentage change in trading volume relative to its value five days ago.

These features capture various aspects of historical price movements, volatility, and trading activity, providing valuable information for predictive modeling and trading strategies.

In [26]:
data['SMA_50'] = data.groupby('Symbol')['Close'].transform(lambda x: x.rolling(window=50).mean())
data['EMA_20'] = data.groupby('Symbol')['Close'].transform(lambda x: x.ewm(span=20, adjust=False).mean())

data['Lagged_Return_1'] = data.groupby('Symbol')['Close'].shift(1)
data['Lagged_Return_5'] = data.groupby('Symbol')['Close'].shift(5)

data['Rolling_Mean_5'] = data.groupby('Symbol')['Close'].transform(lambda x: x.rolling(window=5).mean())
data['Rolling_Std_5'] = data.groupby('Symbol')['Close'].transform(lambda x: x.rolling(window=5).std())
data['Rolling_Min_5'] = data.groupby('Symbol')['Close'].transform(lambda x: x.rolling(window=5).min())
data['Rolling_Max_5'] = data.groupby('Symbol')['Close'].transform(lambda x: x.rolling(window=5).max())

data['Price_Oscillation'] = (data['High'] - data['Low']) / data['Close']

data['Volume_Accumulation'] = data.groupby('Symbol')['Volume'].transform(lambda x: x.expanding().sum())

# Calculate SMA_20 before using it
data['SMA_20'] = data.groupby('Symbol')['Close'].transform(lambda x: x.rolling(window=20).mean())

# Now calculate Price_to_SMA_20_Ratio
data['Price_to_SMA_20_Ratio'] = data['Close'] / data['SMA_20']

data['ROC'] = data.groupby('Symbol')['Close'].pct_change(periods=5)
data['Volatility_10'] = data.groupby('Symbol')['Close'].pct_change().rolling(window=10).std()
data['Price_Above_SMA_50'] = (data['Close'] > data['SMA_50']).astype(int)

market_index_data = data.groupby('Date')['Close'].mean().rename('Market_Index').reset_index()
data = pd.merge(data, market_index_data, on='Date', how='left')
data['Relative_Price_Strength'] = data['Close'] / data['Market_Index']
data['Volume_Momentum'] = data.groupby('Symbol')['Volume'].pct_change(periods=5)

### Domain specific features

1. **Simple Moving Average (SMA_50):**
   - This line calculates the 50-day Simple Moving Average (SMA) for each stock symbol in the dataset. 
   - `data.groupby('Symbol')['Close']` groups the data by the 'Symbol' column and selects the 'Close' price for each group.
   - `.transform(lambda x: x.rolling(window=50).mean())` calculates the rolling mean (average) over a window of 50 periods for each group, which represents the 50-day SMA.

2. **Exponential Moving Average (EMA_20):**
   - This line calculates the 20-day Exponential Moving Average (EMA) for each stock symbol.
   - Similar to SMA, it calculates the rolling mean, but EMA gives more weight to recent prices.
   - `x.ewm(span=20, adjust=False).mean()` calculates the EMA with a span of 20 periods for each group.

3. **Relative Strength Index (RSI):**
   - RSI is a momentum oscillator that measures the speed and change of price movements.
   - The `calculate_rsi()` function calculates the RSI for each stock symbol using the closing prices and a specified window (default is 14 periods).
   - It calculates the average gain and loss over the specified window, computes the relative strength (RS), and then calculates the RSI using a formula.
   - `data.groupby('Symbol').apply(calculate_rsi)` applies this function to each group of data grouped by 'Symbol'.

4. **Bollinger Bands (SMA_20, std_20, upper_band, lower_band):**
   - Bollinger Bands are volatility bands placed above and below a moving average.
   - `SMA_20` calculates the 20-day SMA for each symbol.
   - `std_20` calculates the standard deviation of closing prices over a 20-day window for each symbol.
   - `upper_band` and `lower_band` are calculated by adding/subtracting two times the standard deviation from the SMA, providing a measure of volatility around the average.

5. **Volume Weighted Average Price (VWAP):**
   - VWAP is a trading benchmark used by traders that gives the average price a security has traded at throughout the day, based on both volume and price.
   - This line calculates VWAP by summing up the product of volume and average price (computed as the average of high, low, and close prices) over time, and dividing by the cumulative volume.

6. **Moving Average Convergence Divergence (MACD):**
   - MACD is a trend-following momentum indicator that shows the relationship between two moving averages of a security’s price.
   - `EMA_12` and `EMA_26` calculate the 12-day and 26-day EMA for each symbol, respectively.
   - `MACD` is computed as the difference between the 12-day and 26-day EMAs.

These features capture various aspects of price trends, volatility, and momentum, providing valuable insights for technical analysis in stock trading.

In [28]:
def calculate_rsi(close_prices, window=14):
    delta = close_prices.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    RS = gain / loss
    RSI = 100 - (100 / (1 + RS))
    return RSI

data['RSI'] = data.groupby('Symbol')['Close'].transform(lambda x: calculate_rsi(x))

data['SMA_20'] = data.groupby('Symbol')['Close'].transform(lambda x: x.rolling(window=20).mean())
data['std_20'] = data.groupby('Symbol')['Close'].transform(lambda x: x.rolling(window=20).std())
data['upper_band'] = data['SMA_20'] + (2 * data['std_20'])
data['lower_band'] = data['SMA_20'] - (2 * data['std_20'])

data['VWAP'] = (data['Volume'] * (data['High'] + data['Low'] + data['Close']) / 3).cumsum() / data['Volume'].cumsum()

data['EMA_12'] = data.groupby('Symbol')['Close'].transform(lambda x: x.ewm(span=12, adjust=False).mean())
data['EMA_26'] = data.groupby('Symbol')['Close'].transform(lambda x: x.ewm(span=26, adjust=False).mean())
data['MACD'] = data['EMA_12'] - data['EMA_26']

Unnamed: 0,Date,Symbol,Adj Close,Close,High,Low,Open,Volume,Target,Id,...,Relative_Price_Strength,Volume_Momentum,RSI,std_20,upper_band,lower_band,VWAP,EMA_12,EMA_26,MACD
0,2010-01-04,MMM,53.29538,83.019997,83.449997,82.669998,83.089996,3043700.0,0,0,...,2.109333,,,,,,83.046664,83.019997,83.019997,0.0
1,2010-01-04,CHRW,42.59129,59.34,59.650002,58.939999,59.220001,822900.0,1,87745,...,1.507683,,,,,,77.994965,59.34,59.34,0.0
2,2010-01-04,STLD,13.078124,18.290001,18.5,18.040001,18.17,3691600.0,0,416162,...,0.464704,,,,,,48.827165,18.290001,18.290001,0.0
3,2010-01-04,CAH,22.306038,32.529999,32.66,31.9,32.240002,3824400.0,1,82731,...,0.826507,,,,,,43.295539,32.529999,32.529999,0.0
4,2010-01-04,TGT,35.269444,48.549999,48.889999,48.400002,48.470001,4589100.0,1,423683,...,1.233536,,,,,,44.823485,48.549999,48.549999,0.0
