# PREPARING THE DATA 

THE FIRST BLOCKs OF CODE PREPARES THE DATA NEEDED TO BUILD THE MODEL. HERE WE GOT DATA FROM YAHOO FINANCE FROM PUBLIC COMPANIES AND CLEANED IT EXTRACTING THE KEY DETAILS WHICH WILL BE NEEDED. HOWEVER TO ENSURE ITS ROBUST ENOUGH WE SYNTHETICALLY CREATED DATA AND ADDED IT TO THE DATA. THE TOTAL DATA IS WHAT WILL BE USED

# IMPORTING THE NECCESSARY PACKAGES TO PREPARE THE DATA AND EXECUTE/MAKE A CASH FLOW PREDICTION MODEL

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.metrics import mean_squared_error, r2_score
import pickle
import warnings
import yfinance as yf
import pandas as pd
import os
import time
import random
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.metadata import SingleTableMetadata

# CREATING THE DATASET

This block of code creates the necessary data which we will used in building the predictive model

In [2]:
# Creating the Function to fetch and save financial data from Yahoo Finance
def save_financial_data(ticker_symbol, start_year, end_year, output_dir, max_retries=3, retry_delay=5, quarterly=True):
    ticker = yf.Ticker(ticker_symbol)
    try:
        print(f"\nFetching data for ticker: {ticker_symbol}")
        company_dir = os.path.join(output_dir, ticker_symbol)
        os.makedirs(company_dir, exist_ok=True)
        company_dfs = {}
        for statement_type in ["balance_sheet", "income_stmt", "cashflow"]:
            for attempt in range(max_retries):
                try:
                    print(f"Attempting to retrieve {statement_type} (Attempt {attempt + 1}/{max_retries})...")
                    if quarterly:
                        statement = getattr(ticker, f"quarterly_{statement_type}")
                    else:
                        statement = getattr(ticker, statement_type)
                    if statement is not None and not statement.empty:
                        statement = statement.T
                        statement.index = pd.to_datetime(statement.index)
                        statement = statement.sort_index()
                        statement = statement[str(start_year):str(end_year)]
                        if not statement.empty:
                            company_dfs[statement_type] = statement
                            print(f"Retrieved {statement_type} data for {ticker_symbol}")
                            break
                        else:
                            print(f"No {statement_type} data found for {ticker_symbol}")
                            break
                except Exception as e:
                    print(f"Attempt {attempt + 1} failed for {statement_type} of {ticker_symbol}: {e}")
                    if attempt < max_retries - 1:
                        sleep_time = retry_delay + random.uniform(0, 3)
                        print(f"Retrying in {sleep_time:.2f} seconds...")
                        time.sleep(sleep_time)
            else:
                print(f"Failed to retrieve {statement_type} for {ticker_symbol} after {max_retries} attempts.")
        if company_dfs:
            combined_df = pd.concat(company_dfs, axis=1, keys=company_dfs.keys())
            combined_df.to_csv(os.path.join(company_dir, f"{ticker_symbol}_combined_financial_data.csv"))
            print(f"Combined data for {ticker_symbol} saved.")
        else:
            print(f"No financial data retrieved for {ticker_symbol}")
    except Exception as e:
        print(f"Error getting ticker data for {ticker_symbol}: {e}")

# Combining all company data into one file
def combine_all_company_data(data_dir, output_filename="combined_financial_data.csv", output_folder="csv_data"):
    os.makedirs(output_folder, exist_ok=True)
    combined_file_path = os.path.join(output_folder, output_filename)
    all_dfs = []
    for company_folder in os.listdir(data_dir):
        company_path = os.path.join(data_dir, company_folder)
        if os.path.isdir(company_path):
            for filename in os.listdir(company_path):
                if filename.endswith(".csv") and "combined_financial_data" in filename:
                    file_path = os.path.join(company_path, filename)
                    try:
                        df = pd.read_csv(file_path, index_col=0, header=[0, 1])
                        df.index = pd.to_datetime(df.index)
                        df['Company'] = company_folder
                        all_dfs.append(df)
                    except pd.errors.EmptyDataError:
                        print(f"Skipping empty file: {file_path}")
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
    if all_dfs:
        combined_df = pd.concat(all_dfs, sort=True)
        combined_df.sort_index(inplace=True)
        combined_df.to_csv(combined_file_path)
        print(f"Combined data saved to {combined_file_path}")
    else:
        print("No CSV files found to combine.")

# Cleaning the financial data
def clean_financial_data(input_file, output_file, output_folder="csv_data"):
    try:
        os.makedirs(output_folder, exist_ok=True)
        output_file_path = os.path.join(output_folder, output_file)
        df = pd.read_csv(input_file, index_col=0, header=[0, 1])
        df.index = pd.to_datetime(df.index)
        key_metrics = [
            ('balance_sheet', 'Accounts Payable'),
            ('balance_sheet', 'Accounts Receivable'),
            ('income_stmt', 'Total Revenue'),
            ('income_stmt', 'Gross Profit'),
            ('income_stmt', 'Net Income'),
            ('cashflow', 'Operating Cash Flow'),
            ('cashflow', 'Investing Cash Flow'),
            ('cashflow', 'Financing Cash Flow'),
            ('cashflow', 'Change In Working Capital'),
        ]
        key_metrics = [col for col in key_metrics if col in df.columns]
        df = df.loc[:, key_metrics]
        df.columns = ['_'.join(col).strip() for col in df.columns.values]
        df.replace([np.inf, -np.inf], np.nan, inplace=True)
        df = df.dropna()
        df.to_csv(output_file_path)
        print(f"Cleaned financial data saved to {output_file_path}")
    except Exception as e:
        print(f"Error cleaning financial data: {e}")

# Generating synthetic data with 80% random replacement
def generate_synthetic_data_with_randomness_and_range(
    cleaned_file, num_rows=5000, min_value=100_000, max_value=1_000_000_000, 
    output_file="synthetic_financial_data.csv", output_folder="csv_data"
):
    try:
        os.makedirs(output_folder, exist_ok=True)
        cleaned_data = pd.read_csv(cleaned_file)
        cleaned_data = cleaned_data.select_dtypes(include=[np.number])  # Ensure numerical-only columns
        
        # Creating the metadata and synthesizer
        metadata = SingleTableMetadata()
        metadata.detect_from_dataframe(cleaned_data)
        synthesizer = GaussianCopulaSynthesizer(metadata)
        synthesizer.fit(cleaned_data)
        
        # Generating the synthetic data
        synthetic_data = synthesizer.sample(num_rows)
        
        # Applying 80% random replacement with constraints
        for column in synthetic_data.columns:
            synthetic_data[column] = synthetic_data[column].apply(
                lambda x: np.random.randint(min_value, max_value) if np.random.rand() < 0.8 else x
            )
            synthetic_data[column] = synthetic_data[column].clip(lower=min_value, upper=max_value)
        
        synthetic_data.to_csv(os.path.join(output_folder, output_file), index=False)
        print(f"Synthetic data (80% randomness with range constraints) saved to {output_folder}/{output_file}")
    except Exception as e:
        print(f"Error generating synthetic data with randomness and range: {e}")

# Combining the real and synthetic data
def combine_real_and_synthetic_data(cleaned_file, synthetic_file, output_file="combined_real_and_synthetic_data.csv", output_folder="csv_data"):
    try:
        os.makedirs(output_folder, exist_ok=True)
        cleaned_data = pd.read_csv(cleaned_file)
        synthetic_data = pd.read_csv(synthetic_file)
        combined_data = pd.concat([cleaned_data, synthetic_data], ignore_index=True)
        combined_data.to_csv(os.path.join(output_folder, output_file), index=False)
        print(f"Combined real and synthetic data saved to {output_folder}/{output_file}")
    except Exception as e:
        print(f"Error combining real and synthetic data: {e}")

# Executining the  pipeline
if __name__ == "__main__":
    dataset_folder = "Cash Flow Prediction Dataset"
    financial_data_folder = os.path.join(dataset_folder, "financial_data")
    csv_data_folder = os.path.join(dataset_folder, "csv_data")

    output_dir = financial_data_folder
    combined_file = "combined_financial_data.csv"
    cleaned_file = "cleaned_financial_data.csv"
    synthetic_file = "synthetic_financial_data.csv"
    combined_real_and_synthetic_file = "combined_real_and_synthetic_data.csv"

    companies = [
        {"symbol": "AAPL", "industry": "Tech"},
    {"symbol": "MSFT", "industry": "Tech"},
    {"symbol": "GOOG", "industry": "Tech"},
    {"symbol": "NVDA", "industry": "Tech"},
    {"symbol": "IBM", "industry": "Tech"},
    {"symbol": "ORCL", "industry": "Tech"},
    {"symbol": "JPM", "industry": "Banks"},
    {"symbol": "BAC", "industry": "Banks"},
    {"symbol": "WFC", "industry": "Banks"},
    {"symbol": "C", "industry": "Banks"},
    {"symbol": "GS", "industry": "Banks"},
    {"symbol": "MS", "industry": "Banks"},
    {"symbol": "XOM", "industry": "Oil"},
    {"symbol": "CVX", "industry": "Oil"},
    {"symbol": "SLB", "industry": "Oil"},
    {"symbol": "COP", "industry": "Oil"},
    {"symbol": "PG", "industry": "Consumer Goods"},
    {"symbol": "PEP", "industry": "Consumer Goods"},
    {"symbol": "KO", "industry": "Consumer Goods"},
    {"symbol": "CL", "industry": "Consumer Goods"},
    {"symbol": "UL", "industry": "Consumer Goods"},
    {"symbol": "PM", "industry": "Consumer Goods"},
    {"symbol": "CAT", "industry": "Industrials"},
    {"symbol": "LMT", "industry": "Industrials"},
    {"symbol": "BA", "industry": "Industrials"},
    {"symbol": "RTX", "industry": "Industrials"},
    {"symbol": "GE", "industry": "Industrials"},
    {"symbol": "HON", "industry": "Industrials"},
    {"symbol": "SPG", "industry": "Real Estate"},
    {"symbol": "PLD", "industry": "Real Estate"},
    {"symbol": "O", "industry": "Real Estate"},
    {"symbol": "EQR", "industry": "Real Estate"},
    {"symbol": "VNO", "industry": "Real Estate"},
    {"symbol": "BXP", "industry": "Real Estate"},
    {"symbol": "TSLA", "industry": "Automotive"},
    {"symbol": "F", "industry": "Automotive"},
    {"symbol": "GM", "industry": "Automotive"},
    {"symbol": "RIVN", "industry": "Automotive"},
    {"symbol": "TM", "industry": "Automotive"},
    {"symbol": "HMC", "industry": "Automotive"},
    {"symbol": "JNJ", "industry": "Healthcare"},
    {"symbol": "PFE", "industry": "Healthcare"},
    {"symbol": "MRK", "industry": "Healthcare"},
    {"symbol": "ABBV", "industry": "Healthcare"},
    {"symbol": "TMO", "industry": "Healthcare"},
    {"symbol": "DHR", "industry": "Healthcare"},
    {"symbol": "AMZN", "industry": "Retail"},
    {"symbol": "WMT", "industry": "Retail"},
    {"symbol": "TGT", "industry": "Retail"},
    {"symbol": "COST", "industry": "Retail"},
    {"symbol": "HD", "industry": "Retail"},
    {"symbol": "LOW", "industry": "Retail"},
    {"symbol": "DIS", "industry": "Entertainment"},
    {"symbol": "NFLX", "industry": "Entertainment"},
    {"symbol": "CMCSA", "industry": "Entertainment"},
    {"symbol": "VIA", "industry": "Entertainment"},
    {"symbol": "FOX", "industry": "Entertainment"},
    {"symbol": "SPOT", "industry": "Entertainment"},
    {"symbol": "V", "industry": "Financial Services"},
    {"symbol": "MA", "industry": "Financial Services"},
    {"symbol": "PYPL", "industry": "Financial Services"},
    {"symbol": "SQ", "industry": "Financial Services"},
    {"symbol": "AXP", "industry": "Financial Services"},
    {"symbol": "COF", "industry": "Financial Services"},
    {"symbol": "INTC", "industry": "Tech"},
    {"symbol": "AMD", "industry": "Tech"},
    {"symbol": "TSM", "industry": "Tech"},
    {"symbol": "ADBE", "industry": "Tech"},
    {"symbol": "CRM", "industry": "Tech"},
    {"symbol": "NOW", "industry": "Tech"},
    {"symbol": "META", "industry": "Tech"},
    {"symbol": "SNAP", "industry": "Tech"},
    {"symbol": "TWTR", "industry": "Tech"},
    {"symbol": "SHOP", "industry": "Tech"},
    {"symbol": "NKE", "industry": "Consumer Goods"},
    {"symbol": "ADDYY", "industry": "Consumer Goods"},
    {"symbol": "RL", "industry": "Consumer Goods"},
    {"symbol": "TIF", "industry": "Consumer Goods"},
    {"symbol": "UPS", "industry": "Logistics"},
    {"symbol": "FDX", "industry": "Logistics"},
    {"symbol": "DHL", "industry": "Logistics"},
    {"symbol": "CSX", "industry": "Logistics"},
    {"symbol": "NSC", "industry": "Logistics"},
    {"symbol": "UNP", "industry": "Logistics"},
    {"symbol": "SO", "industry": "Utilities"},
    {"symbol": "DUK", "industry": "Utilities"},
    {"symbol": "NEE", "industry": "Utilities"},
    {"symbol": "D", "industry": "Utilities"},
    {"symbol": "EXC", "industry": "Utilities"},
    {"symbol": "PPL", "industry": "Utilities"},
    {"symbol": "MCD", "industry": "Restaurants"}, 
    {"symbol": "YUM", "industry": "Restaurants"}, 
    {"symbol": "SBUX", "industry": "Restaurants"}, 
    {"symbol": "CMG", "industry": "Restaurants"}, 
    {"symbol": "MELI", "industry": "E-commerce"}, 
    {"symbol": "EBAY", "industry": "E-commerce"}, 
    {"symbol": "ETSY", "industry": "E-commerce"}, 
    {"symbol": "AZO", "industry": "Retail"}, 
    {"symbol": "TJX", "industry": "Retail"}, 
    {"symbol": "ROKU", "industry": "Entertainment"}, 
    {"symbol": "DISCA", "industry": "Entertainment"}, 
    {"symbol": "CHTR", "industry": "Telecommunications"}, 
    {"symbol": "T", "industry": "Telecommunications"}, 
    {"symbol": "VZ", "industry": "Telecommunications"}, 
    {"symbol": "AAP", "industry": "Insurance"}, 
    {"symbol": "MET", "industry": "Insurance"}, 
    {"symbol": "AIG", "industry": "Insurance"}, 
    {"symbol": "BRK-B", "industry": "Conglomerate"}, 
    {"symbol": "AVGO", "industry": "Tech"}, 
    {"symbol": "NFLX", "industry": "Entertainment"}, 
    {"symbol": "DIS", "industry": "Entertainment"}, 
    {"symbol": "ABNB", "industry": "Tech"}, 
    {"symbol": "IBM", "industry": "Tech"}, 
    {"symbol": "ORCL", "industry": "Tech"}, 
    {"symbol": "WMT", "industry": "Retail"}, 
    {"symbol": "JNJ", "industry": "Healthcare"}, 
    {"symbol": "HOOD", "industry": "Banks"} 
    ]

    for company in companies:
        save_financial_data(company["symbol"], 2020, 2024, output_dir, quarterly=True)

    combine_all_company_data(financial_data_folder, combined_file, csv_data_folder)
    clean_financial_data(os.path.join(csv_data_folder, combined_file), cleaned_file, csv_data_folder)
    generate_synthetic_data_with_randomness_and_range(
        os.path.join(csv_data_folder, cleaned_file),
        num_rows=5000,
        output_file=synthetic_file,
        output_folder=csv_data_folder
    )
    combine_real_and_synthetic_data(
        os.path.join(csv_data_folder, cleaned_file),
        os.path.join(csv_data_folder, synthetic_file),
        output_file=combined_real_and_synthetic_file,
        output_folder=csv_data_folder
    )


Fetching data for ticker: AAPL
Attempting to retrieve balance_sheet (Attempt 1/3)...
Retrieved balance_sheet data for AAPL
Attempting to retrieve income_stmt (Attempt 1/3)...
Retrieved income_stmt data for AAPL
Attempting to retrieve cashflow (Attempt 1/3)...
Retrieved cashflow data for AAPL
Combined data for AAPL saved.

Fetching data for ticker: MSFT
Attempting to retrieve balance_sheet (Attempt 1/3)...
Retrieved balance_sheet data for MSFT
Attempting to retrieve income_stmt (Attempt 1/3)...
Retrieved income_stmt data for MSFT
Attempting to retrieve cashflow (Attempt 1/3)...
Retrieved cashflow data for MSFT
Combined data for MSFT saved.

Fetching data for ticker: GOOG
Attempting to retrieve balance_sheet (Attempt 1/3)...
Retrieved balance_sheet data for GOOG
Attempting to retrieve income_stmt (Attempt 1/3)...
Retrieved income_stmt data for GOOG
Attempting to retrieve cashflow (Attempt 1/3)...
Retrieved cashflow data for GOOG
Combined data for GOOG saved.

Fetching data for ticker: N



Synthetic data (80% randomness with range constraints) saved to Cash Flow Prediction Dataset/csv_data/synthetic_financial_data.csv
Combined real and synthetic data saved to Cash Flow Prediction Dataset/csv_data/combined_real_and_synthetic_data.csv


# CREATING THE CASH FLOW PREDICTION MODEL

This model should enable us predict the quarterly cash flows of companies using their bank statements

In [3]:
warnings.filterwarnings("ignore")

# Setting a consistent random seed for reproducibility
SEED = 42
np.random.seed(SEED)

# The Path to the dataset
dataset_path = r'/Users/abduljalaalabubakar/Desktop/Projects/Symply Finance/Finance Model/Cash Flow Prediction Dataset/csv_data/combined_real_and_synthetic_data.csv'

# Loading the dataset
df = pd.read_csv(dataset_path)

# Adding synthetic features if missing
if 'balance_sheet_Total Assets' not in df.columns:
    print("Adding synthetic 'balance_sheet_Total Assets' column.")
    df['balance_sheet_Total Assets'] = (
        df.get('balance_sheet_Accounts Payable', np.random.uniform(100_000, 1_000_000, len(df))) +
        df.get('balance_sheet_Accounts Receivable', np.random.uniform(100_000, 1_000_000, len(df))) +
        df.get('cashflow_Operating Cash Flow', np.random.uniform(100_000, 1_000_000, len(df)))
    )

if 'balance_sheet_Total Current Assets' not in df.columns:
    print("Adding synthetic 'balance_sheet_Total Current Assets' column.")
    df['balance_sheet_Total Current Assets'] = (
        df.get('balance_sheet_Accounts Receivable', np.random.uniform(100_000, 1_000_000, len(df))) +
        np.random.randint(100_000, 1_000_000_000, size=len(df))
    )

if 'balance_sheet_Total Current Liabilities' not in df.columns:
    print("Generating synthetic 'balance_sheet_Total Current Liabilities' column.")
    df['balance_sheet_Total Current Liabilities'] = np.random.randint(
        low=100_000, high=1_000_000_000, size=len(df)
    )

# Handling divisions by zero and invalid values
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Feature Engineering
df['Current_Ratio'] = np.where(
    df['balance_sheet_Total Current Liabilities'] > 0,
    df['balance_sheet_Total Current Assets'] / df['balance_sheet_Total Current Liabilities'],
    0
)

df['Quick_Ratio'] = np.where(
    df['balance_sheet_Total Current Liabilities'] > 0,
    (df['balance_sheet_Total Current Assets'] - df.get('balance_sheet_Inventory', 0)) / df['balance_sheet_Total Current Liabilities'],
    0
)

df['Debt_to_Equity'] = np.where(
    df['balance_sheet_Total Assets'] > 0,
    df['balance_sheet_Accounts Payable'] / df['balance_sheet_Total Assets'],
    0
)

df['Return_on_Assets'] = np.where(
    df['balance_sheet_Total Assets'] > 0,
    df['income_stmt_Net Income'] / df['balance_sheet_Total Assets'],
    0
)

df['Operating_Margin'] = np.where(
    df['income_stmt_Total Revenue'] > 0,
    df['income_stmt_Gross Profit'] / df['income_stmt_Total Revenue'],
    0
)

# Lagged Features
df['Lagged_Revenue'] = df['income_stmt_Total Revenue'].shift(1).fillna(0)
df['Lagged_Net_Income'] = df['income_stmt_Net Income'].shift(1).fillna(0)
df['Lagged_Operating_Cash_Flow'] = df['cashflow_Operating Cash Flow'].shift(1).fillna(0)

# Interaction Features
df['Interaction_Current_Quick'] = df['Current_Ratio'] * df['Quick_Ratio']
df['Interaction_Return_Debt'] = df['Return_on_Assets'] * df['Debt_to_Equity']

# Removing NaN values introduced by feature engineering
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)

# Target Variable
target_variable = 'cashflow_Operating Cash Flow'

# Handling negative or zero values in the target variable before scaling
df = df[df[target_variable] > 0]

# Normalizing Target Variable using StandardScaler
scaler_y = StandardScaler()
df[target_variable] = scaler_y.fit_transform(df[[target_variable]])

# Features to include
initial_features = [
    'Current_Ratio', 'Quick_Ratio', 'Debt_to_Equity', 'Return_on_Assets',
    'Operating_Margin', 'Lagged_Revenue', 'Lagged_Net_Income', 'Lagged_Operating_Cash_Flow',
    'Interaction_Current_Quick', 'Interaction_Return_Debt'
]

# Multicollinearity Check and Removal
correlation_matrix = df[initial_features].corr().abs()
upper_tri = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.95)]
df.drop(columns=to_drop, inplace=True)
features = [feature for feature in initial_features if feature not in to_drop]

# Splitting the dataset
X = df[features]
y = df[target_variable]

# Normalizing the features using PowerTransformer
scaler_X = PowerTransformer()
X_transformed = scaler_X.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=SEED)

# Model training with multiple regressors
models = {
    'RandomForest': RandomForestRegressor(random_state=SEED),
    'GradientBoosting': GradientBoostingRegressor(random_state=SEED),
    'AdaBoost': AdaBoostRegressor(random_state=SEED)
}

best_model = None
best_r2 = -np.inf

# Perform GridSearch and Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
for model_name, model in models.items():
    print(f"Training {model_name}...")
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [5, 10, 15],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    } if model_name == 'RandomForest' else {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
    } if model_name == 'GradientBoosting' else {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1.0],
    }
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=kf, n_jobs=-1, scoring='r2')
    grid_search.fit(X_train, y_train)
    best_estimator = grid_search.best_estimator_

    # Predictions
    y_pred = best_estimator.predict(X_test)

    # Evaluation
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"{model_name} - Mean Squared Error: {mse:.4f}, R-Squared: {r2:.4f}")

    if r2 > best_r2:
        best_r2 = r2
        best_model = best_estimator

# Saving the best model and preprocessed dataset
output_dir = 'Cash_Flow_Prediction_Model'
os.makedirs(output_dir, exist_ok=True)
df.to_csv(os.path.join(output_dir, 'preprocessed_dataset.csv'), index=False)

with open(os.path.join(output_dir, 'cash_flow_model.pkl'), 'wb') as f:
    pickle.dump(best_model, f)

print("Best model and preprocessed dataset saved successfully.")

Adding synthetic 'balance_sheet_Total Assets' column.
Adding synthetic 'balance_sheet_Total Current Assets' column.
Generating synthetic 'balance_sheet_Total Current Liabilities' column.
Training RandomForest...
RandomForest - Mean Squared Error: 0.1848, R-Squared: 0.8010
Training GradientBoosting...
GradientBoosting - Mean Squared Error: 0.3370, R-Squared: 0.6370
Training AdaBoost...
AdaBoost - Mean Squared Error: 0.1734, R-Squared: 0.8132
Best model and preprocessed dataset saved successfully.
