In [1]:
import pandas as pd

class SharePriceProcessor:
    """
    A class to process, transform, and save share price data.
    """

    def __init__(self):
        """
        Initialize the processor with the hardcoded file path to the raw data.
        """
        self.filepath = 'data/raw/de_share_prices_data_RAW.csv'
        self.raw_prices = None
        self.mode_shares = None


    def load_data(self):
        """
        Load the raw share price data from the file.
        """
        self.raw_prices = pd.read_csv(self.filepath)
        self.raw_prices['Date'] = pd.to_datetime(self.raw_prices['Date'])

    def extract_date_features(self, df):
        """
        Extract features from the Date column.

        Args:
            df (pd.DataFrame): The DataFrame containing the Date column.
        """
        df['Day_of_Week'] = df['Date'].dt.day_name()
        df['Month'] = df['Date'].dt.month
        df['Year'] = df['Date'].dt.year
        df['Day_of_Month'] = df['Date'].dt.day

    def drop_columns(self):
        """
        Drop unnecessary columns from the data.
        """
        if 'Dividend' in self.raw_prices.columns:
            self.raw_prices = self.raw_prices.drop(columns=['Dividend'])

    def calculate_mode_shares(self):
        """
        Calculate the mode of Shares Outstanding grouped by Ticker, Year, and Month.
        """
        self.mode_shares = (
            self.raw_prices.groupby(['Ticker', 'Year', 'Month'])['Shares Outstanding']
            .apply(lambda x: x.mode().iloc[0] if not x.mode().empty else None)
        )

    def fill_missing_values(self):
        """
        Fill missing values in the DataFrame using forward fill and backward fill.
        """
        self.raw_prices.ffill(inplace=True)
        self.raw_prices.bfill(inplace=True)


    def transform_data(self):
        """
        Transforms the extracted data by handling missing values, 
        creating a percentage change column, categorizing price movements,
        and extracting features from the Date column.
        """
        try:
            # Fill missing values
            self.fill_missing_values()

            # Extract features from Date
            self.extract_date_features(self.raw_prices)

            # Calculate percentage change
            self.raw_prices['Price_Change'] = self.raw_prices['Close'].pct_change() * 100

            # Categorize price movements
            def categorize_change(change):
                if change >= 3:
                    return 'High Rise'
                elif 0 < change < 3:
                    return 'Low Rise'
                elif -0.5 <= change <= 0.5:
                    return 'Stay'
                elif -3 < change < 0:
                    return 'Low Fall'
                else:
                    return 'High Fall'

            self.raw_prices['Category'] = self.raw_prices['Price_Change'].apply(categorize_change)
        except Exception as e:
            print(f"Error during transformation: {e}")

    def save_data(self):
        """
        Saves the transformed data into a CSV file with a hardcoded path.
        """
        output_file = 'data/processed/de_share_prices_processed.csv'

        try:
            self.raw_prices.to_csv(output_file, index=False)
            print(f"Transformed data saved to {output_file}")
        except Exception as e:
            print(f"Error during saving: {e}")


    def process_data(self):
        """
        Run all processing steps on the raw data and save the transformed data.

        Args:
            output_file (str): File path to save the transformed data.
        """
        self.load_data()
        self.extract_date_features(self.raw_prices)
        self.drop_columns()
        self.calculate_mode_shares()
        self.fill_missing_values()
        self.transform_data()
        self.save_data()



# PROCESS AND SAVE THE DATA
processor = SharePriceProcessor()
processor.process_data()


Transformed data saved to data/processed/de_share_prices_processed.csv


In [2]:
df = pd.read_csv('data/processed/de_share_prices_processed.csv')
df.head()

Unnamed: 0,Ticker,Date,SimFinId,Open,High,Low,Close,Adj. Close,Volume,Shares Outstanding,Day_of_Week,Month,Year,Day_of_Month,Price_Change,Category
0,1COV.DE,2019-04-08,854465,59.88,60.11,58.96,59.3,45.78,988273,182704602.0,Monday,4,2019,8,,High Fall
1,1COV.DE,2019-04-09,854465,59.26,59.59,58.11,58.22,44.95,947739,182704602.0,Tuesday,4,2019,9,-1.821248,Low Fall
2,1COV.DE,2019-04-10,854465,58.59,59.7,58.48,59.02,45.57,1011537,182704602.0,Wednesday,4,2019,10,1.374098,Low Rise
3,1COV.DE,2019-04-11,854465,58.78,60.81,58.46,60.51,46.72,1415119,182704602.0,Thursday,4,2019,11,2.524568,Low Rise
4,1COV.DE,2019-04-12,854465,60.7,63.14,59.68,62.62,48.35,2215258,182704602.0,Friday,4,2019,12,3.487027,High Rise
