In [3]:
# Where the raw per-ticker price exports live, and which one to inspect.
data_folder = "data"
file = "AZN.L_historical.csv"

# First look at the file exactly as pandas parses it with default options.
# The preview shows two extra header-like rows ("Ticker" and "Date") sitting
# on top of the actual price rows.
csv_path = os.path.join(data_folder, file)
df = pd.read_csv(csv_path)
print(df.head())


        Price            Close               High                Low  \
0      Ticker            AZN.L              AZN.L              AZN.L   
1        Date              NaN                NaN                NaN   
2  2020-01-02    6785.84765625  6794.706465200718  6701.688971218179   
3  2020-01-03  6821.2822265625    6821.2822265625  6715.791470659677   
4  2020-01-06   6753.068359375  6803.563557654467  6684.855547664141   

                Open   Volume  
0              AZN.L    AZN.L  
1                NaN      NaN  
2  6728.265398070333  1704325  
3  6754.841165914164  1090818  
4  6793.818870267201  1348181  


In [4]:
# Re-read while skipping the first line of the file, so the "Ticker" line is
# used as the header; the "Date" line is still left over as the first data row.
csv_path = os.path.join(data_folder, file)
df = pd.read_csv(csv_path, skiprows=1)
print(df.head())


       Ticker        AZN.L      AZN.L.1      AZN.L.2      AZN.L.3    AZN.L.4
0        Date          NaN          NaN          NaN          NaN        NaN
1  2020-01-02  6785.847656  6794.706465  6701.688971  6728.265398  1704325.0
2  2020-01-03  6821.282227  6821.282227  6715.791471  6754.841166  1090818.0
3  2020-01-06  6753.068359  6803.563558  6684.855548  6793.818870  1348181.0
4  2020-01-07  6772.558594  6790.276209  6699.916369  6739.781005  1308820.0


In [6]:
# Read the file with its first line skipped, then give the columns the names
# they actually represent.
csv_path = os.path.join(data_folder, file)
df = pd.read_csv(csv_path, skiprows=1)
df.columns = ["Date", "Close", "High", "Low", "Open", "Volume"]

# Discard rows that have no value at all in the Date column.
df = df.dropna(subset=["Date"])

# Parse the dates from their string form; anything that does not match
# YYYY-MM-DD (such as the leftover "Date" label row) is coerced to NaT.
parsed_dates = pd.to_datetime(
    df["Date"].astype(str), format="%Y-%m-%d", errors="coerce"
)
df = df.assign(Date=parsed_dates)

# Drop the rows whose date failed to parse and index the frame by date.
df = df.dropna(subset=["Date"]).set_index("Date")

print(df.head())  # Check output


                  Close         High          Low         Open     Volume
Date                                                                     
2020-01-02  6785.847656  6794.706465  6701.688971  6728.265398  1704325.0
2020-01-03  6821.282227  6821.282227  6715.791471  6754.841166  1090818.0
2020-01-06  6753.068359  6803.563558  6684.855548  6793.818870  1348181.0
2020-01-07  6772.558594  6790.276209  6699.916369  6739.781005  1308820.0
2020-01-08  6755.727539  6793.820417  6714.091138  6747.754611  1256533.0


In [7]:
import pandas as pd
import os

# Make sure the default output directory exists before anything is written to
# it; exist_ok=True keeps this from raising when the cell is re-run.
os.makedirs("processed_data", exist_ok=True)

def calculate_technical_indicators(df):
    """Add moving-average, RSI, MACD and Bollinger Band columns to a price frame.

    The frame is modified in place and is also returned, so both
    ``calculate_technical_indicators(df)`` and
    ``df = calculate_technical_indicators(df)`` give the caller the new columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``"Close"`` column. Every indicator is a
        rolling or exponentially weighted window over row position, so rows
        are assumed to be in chronological order (not checked here).

    Returns
    -------
    pandas.DataFrame
        The same object, with these columns appended in this order:
        ``SMA_50``, ``SMA_200``, ``EMA_20``, ``EMA_50``, ``RSI_14``,
        ``EMA_12``, ``EMA_26``, ``MACD``, ``Bollinger_Upper``,
        ``Bollinger_Lower``. Rolling columns are NaN until their window
        is full.

    Raises
    ------
    KeyError
        If ``df`` has no ``"Close"`` column.
    """
    close = df["Close"]

    # Simple and exponential moving averages of the close.
    df["SMA_50"] = close.rolling(window=50).mean()
    df["SMA_200"] = close.rolling(window=200).mean()
    df["EMA_20"] = close.ewm(span=20, adjust=False).mean()
    df["EMA_50"] = close.ewm(span=50, adjust=False).mean()

    # RSI over 14 rows, using a plain rolling mean of the up-moves and the
    # down-moves (not Wilder's exponential smoothing).
    delta = close.diff()
    gain = delta.where(delta > 0, 0).rolling(14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
    rs = gain / loss
    df["RSI_14"] = 100 - (100 / (1 + rs))

    # MACD line: 12-span EMA minus 26-span EMA.
    df["EMA_12"] = close.ewm(span=12, adjust=False).mean()
    df["EMA_26"] = close.ewm(span=26, adjust=False).mean()
    df["MACD"] = df["EMA_12"] - df["EMA_26"]

    # Bollinger Bands: the middle band and the band width must come from the
    # SAME 20-row window. Centring on SMA_50 (as before) left the bands NaN
    # for the first 49 rows and offset them from the price they should wrap.
    window_20 = close.rolling(20)
    middle_band = window_20.mean()
    band_width = window_20.std() * 2
    df["Bollinger_Upper"] = middle_band + band_width
    df["Bollinger_Lower"] = middle_band - band_width

    return df

def process_all_stocks(data_folder="data", output_folder="processed_data"):
    """Run feature engineering on every ``*_historical.csv`` file in a folder.

    For each matching file the CSV is read with its first line skipped, the
    columns are renamed to Date/Close/High/Low/Open/Volume, rows without a
    parseable ``YYYY-MM-DD`` date are dropped, the frame is indexed by date,
    ``calculate_technical_indicators`` is applied, and the result is written
    to ``<output_folder>/<stock>_processed.csv``.

    Parameters
    ----------
    data_folder : str
        Directory scanned (non-recursively) for ``*_historical.csv`` files.
    output_folder : str
        Directory the processed CSVs are written to. It is created if it does
        not exist, so a non-default folder works without extra setup.

    Returns
    -------
    None
        Progress and per-file errors are reported with ``print``.

    Raises
    ------
    OSError
        If ``data_folder`` cannot be listed or ``output_folder`` cannot be
        created. Failures while processing an individual file are caught,
        printed, and do not stop the remaining files.
    """
    suffix = "_historical.csv"

    # Create the destination here rather than relying on module-level setup,
    # which only ever created the default "processed_data" directory.
    os.makedirs(output_folder, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort for a stable run log.
    for file in sorted(os.listdir(data_folder)):
        if not file.endswith(suffix):
            continue

        # Strip exactly the trailing suffix to get the ticker name.
        stock_name = file[: -len(suffix)]
        file_path = os.path.join(data_folder, file)
        print(f"Processing {stock_name} from {file_path}...")

        try:
            df = pd.read_csv(file_path, skiprows=1)
            df.columns = ["Date", "Close", "High", "Low", "Open", "Volume"]
            df = df.dropna(subset=["Date"])
            # Non-date values (e.g. a leftover "Date" label row) become NaT
            # and are removed by the second dropna.
            df["Date"] = pd.to_datetime(df["Date"], format="%Y-%m-%d", errors="coerce")
            df = df.dropna(subset=["Date"])
            df = df.set_index("Date")

            df = calculate_technical_indicators(df)

            output_path = os.path.join(output_folder, f"{stock_name}_processed.csv")
            df.to_csv(output_path)
            print(f"Processed {stock_name} successfully!")

        except Exception as e:
            # Deliberate best-effort: report the failing file and carry on
            # with the rest of the folder.
            print(f"Error processing {file}: {e}")

    print("Feature engineering complete.")

# Entry point: process every stock file using the default folders. The guard
# is true when this cell is executed in the notebook (the log below shows the
# run) and keeps the batch job from running if the code is imported instead.
if __name__ == "__main__":
    process_all_stocks()
    print("Processed data saved in 'processed_data/' directory.")


Processing AZN.L from data\AZN.L_historical.csv...
Processed AZN.L successfully!
Processing BATS.L from data\BATS.L_historical.csv...
Processed BATS.L successfully!
Processing BP.L from data\BP.L_historical.csv...
Processed BP.L successfully!
Processing GSK.L from data\GSK.L_historical.csv...
Processed GSK.L successfully!
Processing HSBA.L from data\HSBA.L_historical.csv...
Processed HSBA.L successfully!
Processing LSEG.L from data\LSEG.L_historical.csv...
Processed LSEG.L successfully!
Processing REL.L from data\REL.L_historical.csv...
Processed REL.L successfully!
Processing RIO.L from data\RIO.L_historical.csv...
Processed RIO.L successfully!
Processing SHEL.L from data\SHEL.L_historical.csv...
Processed SHEL.L successfully!
Processing ULVR.L from data\ULVR.L_historical.csv...
Processed ULVR.L successfully!
Feature engineering complete.
Processed data saved in 'processed_data/' directory.
