In [1]:
# %load improved_find_data_start_row\(file_path\).py
from openpyxl import load_workbook
import pandas as pd

def improved_find_data_start_row(file_path):
    """
    Improved version to find the data start row in an Excel file.
    This version scans the first sheet to find the likely start of the data based on a heuristic that considers
    both the number of non-empty cells in a row and the consistency of data types in the row.
    """
    workbook = load_workbook(filename=file_path, read_only=True)
    sheet = workbook.worksheets[0]  # Assuming data is in the first sheet
    
    min_filled_cells = 2  # Minimum number of non-empty cells to consider a row as potential data start
    likely_data_start_row = 0
    consecutive_data_like_rows = 0
    
    for i, row in enumerate(sheet.iter_rows(values_only=True)):
        # Count non-empty cells and types of data in the row
        non_empty_cells = [cell for cell in row if cell is not None]
        filled_cells = len(non_empty_cells)
        data_types = {type(cell) for cell in non_empty_cells}
        
        # Check for a row with enough filled cells and more than one type of data (mixed types suggest data row)
        if filled_cells >= min_filled_cells and len(data_types) > 1:
            consecutive_data_like_rows += 1
        else:
            consecutive_data_like_rows = 0  # Reset if the row doesn't seem like a data row
        
        # If we find 2 consecutive rows that look like data, it's likely the start of the data
        if consecutive_data_like_rows >= 2:
            likely_data_start_row = i - 2  # Adjusting for 0-based index and to include the first data-like row
            break
    
    return likely_data_start_row

def improved_load_excel_with_autodetect(file_path):
    """
    Load an Excel file into a pandas DataFrame, attempting to auto-detect the start of the actual data using
    an improved mechanism.
    """
    start_row = improved_find_data_start_row(file_path)
    df = pd.read_excel(file_path, sheet_name=0, skiprows=start_row)
    df.dropna(axis=1, how='all', inplace=True)
    return df

def load_all_sheets_with_data_start_detection(file_path):
    """
    Load all sheets from an Excel workbook, applying an improved mechanism to detect
    the start of actual data in each sheet.
    
    :param file_path: Path to the Excel workbook.
    :return: A dictionary of DataFrames, one for each sheet, with data start auto-detected.
    """
    # Load all sheets into a dictionary of DataFrames
    all_sheets = pd.read_excel(file_path, sheet_name=None, header=None)
    
    # Apply the improved data start detection mechanism to each sheet
    for sheet_name, df in all_sheets.items():
        # Find the likely data start row using the improved mechanism
        start_row = improved_find_data_start_row(file_path)
        # Reload the sheet with detected start row, if there is meaningful data to skip
        if start_row > 0:
            all_sheets[sheet_name] = pd.read_excel(file_path, sheet_name=sheet_name, skiprows=start_row)
    
    return all_sheets


In [2]:
financial_data=improved_load_excel_with_autodetect("/Users/myself/Desktop/Walmart USA Serching for Growth/walmartHistoricalFinancials.xlsx")
financial_data=financial_data.dropna()
financial_data=financial_data.fillna(0)

In [3]:
# Correctly identify and assign unique years to each financial data column
# Assuming the first column after 'Financial Metric' is the most recent year (2021) and decrement for each column after

number_of_years = financial_data.shape[1] - 1  # Total columns minus the 'Financial Metric' column
base_year = 2021
years = [str(base_year - i) for i in range(number_of_years)]

# Map the new year labels to the columns
financial_data.columns = ['Financial Metric'] + years

financial_data['Financial Metric']

0                               Non-current assets
1                              ∟ Intangible assets
2                          ∟ Tangible fixed assets
3                       ∟ Other non-current assets
4                                   Current assets
5                                          ∟ Stock
6                                        ∟ Debtors
7                           ∟ Other current assets
8              ∟ Of which cash and cash equivalent
9                                     Total assets
12                              Shareholders funds
13                                       ∟ Capital
14                      ∟ Other shareholders funds
15                         Non-current liabilities
16                                ∟ Long term debt
17                 ∟ Other non-current liabilities
18                           ∟ Of which provisions
19                             Current liabilities
20                       ∟ Loans & short-term debt
21                             

In [4]:
import numpy as np

financial_data['Financial Metric']=financial_data['Financial Metric'].str.replace('∟', '', regex=True)
financial_data.replace('\xa0', '', regex=True, inplace=True)
financial_data.replace('n.a.', np.nan, inplace=True)

financial_data=financial_data.dropna(thresh=3)
financial_data

Unnamed: 0,Financial Metric,2021,2020,2019,2018,2017,2016,2015,2014,2013,...,2004,2003,2002,2001,2000,1999,1998,1997,1996,1995
0,Non-current assets,162429000.0,174689000.0,157398000.0,144858000.0,141136000.0,139342000.0,140212000.0,143566000.0,143165000.0,...,70984000.0,64086000.0,55649000.0,51575000.0,45993000.0,28864000.0,26032000.0,21611000.0,20210000.0,17481000.0
1,Intangible assets,28983000.0,31073000.0,31181000.0,18242000.0,17037000.0,16695000.0,18102000.0,19510000.0,20497000.0,...,9882000.0,9389000.0,8566000.0,9059000.0,9392000.0,2538000.0,0.0,0.0,0.0,0.0
2,Tangible fixed assets,109848000.0,127049000.0,111395000.0,114818000.0,114178000.0,116516000.0,116655000.0,117907000.0,116681000.0,...,59023000.0,51374000.0,45750000.0,40934000.0,35969000.0,25973000.0,23606000.0,20324000.0,18894000.0,15874000.0
3,Other non-current assets,23598000.0,16567000.0,14822000.0,11798000.0,9921000.0,6131000.0,5455000.0,6149000.0,5987000.0,...,2079000.0,3323000.0,1333000.0,1582000.0,632000.0,353000.0,2426000.0,1287000.0,1316000.0,1607000.0
4,Current assets,90067000.0,61806000.0,61897000.0,59664000.0,57689000.0,60239000.0,63278000.0,61185000.0,59940000.0,...,34421000.0,30722000.0,27878000.0,26555000.0,24356000.0,21132000.0,19352000.0,17993000.0,17331000.0,15338000.0
5,Stock,44949000.0,44435000.0,44269000.0,43783000.0,43046000.0,44469000.0,45141000.0,44858000.0,43803000.0,...,26612000.0,24401000.0,22614000.0,21442000.0,19793000.0,17076000.0,16497000.0,15897000.0,15989000.0,14064000.0
6,Debtors,6516000.0,6284000.0,6283000.0,5614000.0,5835000.0,5624000.0,6778000.0,6677000.0,6768000.0,...,1254000.0,1569000.0,2000000.0,1768000.0,1341000.0,1118000.0,976000.0,845000.0,853000.0,700000.0
7,Other current assets,38602000.0,11087000.0,11345000.0,10267000.0,8808000.0,10146000.0,11359000.0,9650000.0,9369000.0,...,6555000.0,4752000.0,3264000.0,3345000.0,3222000.0,2938000.0,1879000.0,1251000.0,489000.0,574000.0
8,Of which cash and cash equivalent,17741000.0,9465000.0,7722000.0,6456000.0,6602000.0,8343000.0,9135000.0,6627000.0,7066000.0,...,5199000.0,2736000.0,2161000.0,2054000.0,1856000.0,1879000.0,1447000.0,883000.0,83000.0,45000.0
9,Total assets,252496000.0,236495000.0,219295000.0,204522000.0,198825000.0,199581000.0,203490000.0,204751000.0,203105000.0,...,105405000.0,94808000.0,83527000.0,78130000.0,70349000.0,49996000.0,45384000.0,39604000.0,37541000.0,32819000.0


In [5]:
df = financial_data.copy()

if df.index.name != 'Financial Metric':
    dfcopy=df
    df.set_index('Financial Metric', inplace=True)  # Setting 'Financial Metric' as index

df.index=df.index.str.strip()

# Determine categories based on keyword presence or row structure
assets = [metric for metric in df.index if "asset" in metric.lower()]
liabilities_and_equity = [metric for metric in df.index if "liabilit" in metric.lower() or "equity" in metric.lower() or "fund" in metric.lower()]
p_and_l = [metric for metric in df.index if metric not in assets and metric not in liabilities_and_equity]

# Example to show categorized lists
print("Assets:", assets)
print("Liabilities & Equity:", liabilities_and_equity)
print("P&L Items:", p_and_l)

Assets: ['Non-current assets', 'Intangible assets', 'Tangible fixed assets', 'Other non-current assets', 'Current assets', 'Other current assets', 'Total assets', 'Net current assets']
Liabilities & Equity: ['Shareholders funds', 'Other shareholders funds', 'Non-current liabilities', 'Other non-current liabilities', 'Current liabilities', 'Other current liabilities', "Total shareholders' funds and liabilities"]
P&L Items: ['Stock', 'Debtors', 'Of which cash and cash equivalent', 'Capital', 'Long term debt', 'Loans & short-term debt', 'Creditors', 'Working capital', 'Enterprise value', 'Number of employees', 'Operating revenue (Turnover)', 'Sales', 'Costs of goods sold', 'Gross profit', 'Other operating expense (income)', 'Operating profit (loss) [EBIT]', 'Financial profit (loss)', 'Financial revenue', 'Financial expenses', 'Profit (loss) before tax [PBT]', 'Income tax expenses (benefit)', 'Profit (loss) after tax [PAT]', 'Net extraordinary revenues (expenses)', 'Profit (loss) for the p

In [6]:
# Create a base mapping dictionary
base_mapping = {}

# Map assets to 'Total assets'
for item in assets:
    base_mapping[item] = 'Total assets'

# Map liabilities and equity to 'Total shareholders' funds and liabilities'
for item in liabilities_and_equity:
    base_mapping[item] = 'Total shareholders\' funds and liabilities'

# Map P&L items to 'Sales' (this could be adjusted to 'Operating revenue' if more appropriate)
for item in p_and_l:
    if 'Revenue' in item or 'Sales' in item:
        base_mapping[item] = 'Sales'
    else:
        base_mapping[item] = 'Sales'  # or 'Total operating revenue' if defined in your dataset

# Print or return the base mapping
base_mapping.items()

dict_items([('Non-current assets', 'Total assets'), ('Intangible assets', 'Total assets'), ('Tangible fixed assets', 'Total assets'), ('Other non-current assets', 'Total assets'), ('Current assets', 'Total assets'), ('Other current assets', 'Total assets'), ('Total assets', 'Total assets'), ('Net current assets', 'Total assets'), ('Shareholders funds', "Total shareholders' funds and liabilities"), ('Other shareholders funds', "Total shareholders' funds and liabilities"), ('Non-current liabilities', "Total shareholders' funds and liabilities"), ('Other non-current liabilities', "Total shareholders' funds and liabilities"), ('Current liabilities', "Total shareholders' funds and liabilities"), ('Other current liabilities', "Total shareholders' funds and liabilities"), ("Total shareholders' funds and liabilities", "Total shareholders' funds and liabilities"), ('Stock', 'Sales'), ('Debtors', 'Sales'), ('Of which cash and cash equivalent', 'Sales'), ('Capital', 'Sales'), ('Long term debt', '

In [7]:
# import pandas as pd
# from sklearn.linear_model import LinearRegression
# import numpy as np



# # Calculate historical ratios
# historical_ratios = {}
# for metric, base in base_mapping.items():
#     historical_ratios[metric] = financial_data[metric] / financial_data[base]

# # Perform regression and forecast future ratios
# future_bases = {'Sales': 600000000, 'Total assets': 270000000}  # example forecasted bases
# forecasts = {}
# for metric, ratios in historical_ratios.items():
#     model = LinearRegression()
#     X = np.array(df.columns[1:].astype(int)).reshape(-1, 1)  # Years as independent variable
#     y = ratios.values.reshape(-1, 1)
#     model.fit(X, y)
#     projected_ratio = model.predict(np.array([[2022]]))  # Example future year
#     forecasts[metric] = projected_ratio * future_bases[base_mapping[metric]]

# # Output the forecasted values
# forecasts


In [8]:
df.index.name
df.columns

Index(['2021', '2020', '2019', '2018', '2017', '2016', '2015', '2014', '2013',
       '2012', '2011', '2010', '2009', '2008', '2007', '2006', '2005', '2004',
       '2003', '2002', '2001', '2000', '1999', '1998', '1997', '1996', '1995'],
      dtype='object')

## Forecast Base Financial Metrics with Assumed YOY Rates

In [9]:
# Define the growth assumptions based on the details provided earlier
assumptions = {
    'Sales': {'type': 'YOY', 'rates': [0.0814, 0.0392, 0.0261, 0.0244, 0.0237, 0.0238]},
    'Costs of goods sold': {'type': '% of revenue', 'rates': [0.7541, 0.7513, 0.7513, 0.7513, 0.7513, 0.7513]},
    'Gross profit': {'type': 'direct', 'rates': None},  # Calculated directly as difference
    'Operating and SG&A costs': {'type': '% of sales', 'rates': [0.1863, 0.1834, 0.1834, 0.1834, 0.1834, 0.1834]}
}

# Base year for forecast
base_year = '2021'
forecast_years = ['2022F','2023F', '2024F', '2025F', '2026F', '2027F']

# Initialize a DataFrame to hold the forecasted values
forecast_df = pd.DataFrame(index=df.index, columns=forecast_years)

# Iterate over each forecast year and apply the assumptions to calculate forecast values
for idx, year in enumerate(forecast_years):
    if 'Sales' in assumptions:
        if idx == 0:  # First forecast year, base it on the last historical year
            last_sales = df.loc[df.index.str.contains("Sales"), base_year].values[0]
        growth_rate = assumptions['Sales']['rates'][idx]
        forecast_sales = last_sales * (1 + growth_rate)
        forecast_df.loc['Sales', year] = forecast_sales
        last_sales = forecast_sales  # Update for next year's calculation

    if 'Costs of goods sold' in assumptions:
        cost_rate = assumptions['Costs of goods sold']['rates'][idx]
        forecast_costs = forecast_sales * cost_rate
        forecast_df.loc['Costs of goods sold', year] = forecast_costs

    if 'Gross profit' in assumptions:
        forecast_gross_profit = forecast_sales - forecast_costs
        forecast_df.loc['Gross profit', year] = forecast_gross_profit

    if 'Operating and SG&A costs' in assumptions:
        sgna_rate = assumptions['Operating and SG&A costs']['rates'][idx]
        forecast_sgna = forecast_sales * sgna_rate
        forecast_df.loc['Operating and SG&A costs', year] = forecast_sgna

# Calculate CAGR for the forecast period
cagr_df = (forecast_df[forecast_years].astype(float).iloc[:, -1] / forecast_df[forecast_years].astype(float).iloc[:, 0]) ** (1 / (len(forecast_years) - 1)) - 1
forecast_df.loc[:, 'CAGR'] = cagr_df

# Display the forecast results
forecast_df, cagr_df





(                                                     2022F             2023F  \
 Financial Metric                                                               
 Non-current assets                                     NaN               NaN   
 Intangible assets                                      NaN               NaN   
 Tangible fixed assets                                  NaN               NaN   
 Other non-current assets                               NaN               NaN   
 Current assets                                         NaN               NaN   
 Stock                                                  NaN               NaN   
 Debtors                                                NaN               NaN   
 Other current assets                                   NaN               NaN   
 Of which cash and cash equivalent                      NaN               NaN   
 Total assets                                           NaN               NaN   
 Shareholders funds         

In [10]:
# # # Define growth assumptions based on the details provided earlier
# assumptions = {
#     'Sales': {'type': 'YOY', 'rates': [0.0814, 0.0392, 0.0261, 0.0244, 0.0237, 0.0238]},
#     'Costs of goods sold': {'type': '% of revenue', 'rates': [0.7541, 0.7513, 0.7513, 0.7513, 0.7513, 0.7513]},
#     'Gross profit': {'type': 'direct', 'rates': None},  # Calculated directly as difference
#     'Operating and SG&A costs': {'type': '% of sales', 'rates': [0.1863, 0.1834, 0.1834, 0.1834, 0.1834, 0.1834]}
# }

# # # # Calculate future bases from assumptions
# # base_year = '2021'
# # forecast_years = ['2022F', '2023F', '2024F', '2025F', '2026F', '2027F']
# # last_values = df[base_year]

# # for year in forecast_years:
# #     for metric, config in assumptions.items():
# #         if config['type'] == 'YOY':
# #             growth_rate = config['rates'][int(year[:4]) - 2022]
# #             forecast_value = last_values[metric] * (1 + growth_rate)
# #         elif config['type'] == '% of revenue':
# #             revenue_based_rate = config['rates'][int(year[:4]) - 2022]
# #             forecast_value = forecast_df.loc['Sales', year] * revenue_based_rate
# #         elif config['type'] == 'direct':
# #             forecast_value = forecast_df.loc['Sales', year] - forecast_df.loc['Costs of goods sold', year]
# #         forecast_df.loc[metric, year] = forecast_value
# #         last_values[metric] = forecast_value


## Handle NaNs

In [11]:
df = df.T

In [12]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler



# Assuming df is your DataFrame loaded with data

# Identify columns with missing values
cols_with_missing = df.columns[df.isnull().any()].tolist()

# Decide on an imputation strategy for each column
# Example: Use mean for numerical data and a model for 'Costs of employees'
imputation_strategies = {
    col: ('mean' if df[col].dtype.kind in 'biufc' else 'most_frequent')
    for col in cols_with_missing if col != 'Costs of employees'
}
imputation_strategies['Costs of employees'] = 'model'  # Deciding to use a model for 'Costs of employees'

# Apply imputation or model prediction
for col, strategy in imputation_strategies.items():
    if strategy != 'model':
        # Simple imputation
        imputer = SimpleImputer(strategy=strategy)
        df[col] = imputer.fit_transform(df[[col]])
    else:
        # Setup for predictive modeling
        # Assuming you've already identified features to use
        features = df.columns.difference([col, 'SomeOtherColumnToExclude']).tolist()
        train_data = df.dropna(subset=[col] + features)
        target = train_data[col]
        train_features = train_data[features]

        # Scaling features
        scaler = StandardScaler()
        train_features_scaled = scaler.fit_transform(train_features)
        
        # Model fitting
        model = RandomForestRegressor(random_state=0)
        model.fit(train_features_scaled, target)
        
        # Predicting missing values
        test_features = df.loc[df[col].isnull(), features]
        test_features_scaled = scaler.transform(test_features)
        predicted_values = model.predict(test_features_scaled)
        
        # Fill in the missing values
        df.loc[df[col].isnull(), col] = predicted_values

print("Missing values handled for columns:", cols_with_missing)
print(df.info())


Missing values handled for columns: ['Enterprise value', 'Financial revenue', 'Costs of employees', 'Added value']
<class 'pandas.core.frame.DataFrame'>
Index: 27 entries, 2021 to 1995
Data columns (total 46 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Non-current assets                         27 non-null     float64
 1   Intangible assets                          27 non-null     float64
 2   Tangible fixed assets                      27 non-null     float64
 3   Other non-current assets                   27 non-null     float64
 4   Current assets                             27 non-null     float64
 5   Stock                                      27 non-null     float64
 6   Debtors                                    27 non-null     float64
 7   Other current assets                       27 non-null     float64
 8   Of which cash and cash equivalent          27 non-null   

In [22]:
df['Costs of employees']

2021    1169000.0
2020     854000.0
2019     773000.0
2018     626000.0
2017     596000.0
2016     448000.0
2015     462000.0
2014     555630.0
2013     560940.0
2012     559340.0
2011     554940.0
2010     561220.0
2009     565370.0
2008     567220.0
2007     578580.0
2006     571510.0
2005     570030.0
2004     575430.0
2003     573940.0
2002     584410.0
2001     577340.0
2000     568090.0
1999     578680.0
1998     614930.0
1997     614930.0
1996     613450.0
1995     614930.0
Name: Costs of employees, dtype: float64

## Forecast all values on Historical Data

In [23]:
df = df.T
df

Unnamed: 0_level_0,2021,2020,2019,2018,2017,2016,2015,2014,2013,2012,...,2004,2003,2002,2001,2000,1999,1998,1997,1996,1995
Financial Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Non-current assets,162429000.0,174689000.0,157398000.0,144858000.0,141136000.0,139342000.0,140212000.0,143566000.0,143165000.0,138431000.0,...,70984000.0,64086000.0,55649000.0,51575000.0,45993000.0,28864000.0,26032000.0,21611000.0,20210000.0,17481000.0
Intangible assets,28983000.0,31073000.0,31181000.0,18242000.0,17037000.0,16695000.0,18102000.0,19510000.0,20497000.0,20651000.0,...,9882000.0,9389000.0,8566000.0,9059000.0,9392000.0,2538000.0,0.0,0.0,0.0,0.0
Tangible fixed assets,109848000.0,127049000.0,111395000.0,114818000.0,114178000.0,116516000.0,116655000.0,117907000.0,116681000.0,112324000.0,...,59023000.0,51374000.0,45750000.0,40934000.0,35969000.0,25973000.0,23606000.0,20324000.0,18894000.0,15874000.0
Other non-current assets,23598000.0,16567000.0,14822000.0,11798000.0,9921000.0,6131000.0,5455000.0,6149000.0,5987000.0,5456000.0,...,2079000.0,3323000.0,1333000.0,1582000.0,632000.0,353000.0,2426000.0,1287000.0,1316000.0,1607000.0
Current assets,90067000.0,61806000.0,61897000.0,59664000.0,57689000.0,60239000.0,63278000.0,61185000.0,59940000.0,54975000.0,...,34421000.0,30722000.0,27878000.0,26555000.0,24356000.0,21132000.0,19352000.0,17993000.0,17331000.0,15338000.0
Stock,44949000.0,44435000.0,44269000.0,43783000.0,43046000.0,44469000.0,45141000.0,44858000.0,43803000.0,40714000.0,...,26612000.0,24401000.0,22614000.0,21442000.0,19793000.0,17076000.0,16497000.0,15897000.0,15989000.0,14064000.0
Debtors,6516000.0,6284000.0,6283000.0,5614000.0,5835000.0,5624000.0,6778000.0,6677000.0,6768000.0,5937000.0,...,1254000.0,1569000.0,2000000.0,1768000.0,1341000.0,1118000.0,976000.0,845000.0,853000.0,700000.0
Other current assets,38602000.0,11087000.0,11345000.0,10267000.0,8808000.0,10146000.0,11359000.0,9650000.0,9369000.0,8324000.0,...,6555000.0,4752000.0,3264000.0,3345000.0,3222000.0,2938000.0,1879000.0,1251000.0,489000.0,574000.0
Of which cash and cash equivalent,17741000.0,9465000.0,7722000.0,6456000.0,6602000.0,8343000.0,9135000.0,6627000.0,7066000.0,6003000.0,...,5199000.0,2736000.0,2161000.0,2054000.0,1856000.0,1879000.0,1447000.0,883000.0,83000.0,45000.0
Total assets,252496000.0,236495000.0,219295000.0,204522000.0,198825000.0,199581000.0,203490000.0,204751000.0,203105000.0,193406000.0,...,105405000.0,94808000.0,83527000.0,78130000.0,70349000.0,49996000.0,45384000.0,39604000.0,37541000.0,32819000.0


In [103]:

from sklearn.linear_model import LinearRegression

# Forecast years and base year definition
forecast_years = ['2022F', '2023F', '2024F', '2025F', '2026F', '2027F']
historical_years = df.columns[df.columns.str.isnumeric()]

# Forecast function
def forecast_metric(values, years, forecast_years):
    years_reshaped = years.values.reshape(-1, 1)
    values_reshaped = values.values.reshape(-1, 1)
    model = LinearRegression()
    model.fit(years_reshaped, values_reshaped)
    future_years = pd.Series(forecast_years).str[:-1].astype(int).values.reshape(-1, 1)
    predictions = model.predict(future_years).flatten()
    return pd.Series(predictions, index=forecast_years)

# Apply forecasting
forecasted_values = df.apply(lambda x: forecast_metric(x, historical_years, forecast_years), axis=1)
cagr_values = (forecasted_values[forecast_years].astype(float).iloc[:, -1] / forecasted_values[forecast_years].astype(float).iloc[:, 0]) ** (1 / (len(forecast_years) - 1)) - 1
# Print results
forecasted_values,cagr_values


(                                                  2022F         2023F  \
 Financial Metric                                                        
 Non-current assets                         1.852829e+08  1.915092e+08   
 Intangible assets                          2.845098e+07  2.950043e+07   
 Tangible fixed assets                      1.434989e+08  1.481080e+08   
 Other non-current assets                   1.333308e+07  1.390081e+07   
 Current assets                             7.615254e+07  7.843727e+07   
 Stock                                      5.124477e+07  5.260874e+07   
 Debtors                                    7.464635e+06  7.736845e+06   
 Other current assets                       1.744314e+07  1.809169e+07   
 Of which cash and cash equivalent          1.130738e+07  1.171686e+07   
 Total assets                               2.614355e+08  2.699465e+08   
 Shareholders funds                         9.470595e+07  9.760225e+07   
 Capital                              

## Forecast Using Ratios of Non Base Financial Metrics to Base Metrics

In [25]:
# from sklearn.linear_model import LinearRegression
# import numpy as np
# #Assuming 'financial_data' is your DataFrame
# if df.index.name != 'Financial Metric':
#     dfcopy=df
#     df.set_index('Financial Metric', inplace=True)  # Setting 'Financial Metric' as index

# # Calculate historical ratios
# historical_ratios = {}
# for metric, base in base_mapping.items():
#     print(metric,base)
#     if metric in df.index and base in df.index:
#         historical_ratios[metric] = df.loc[metric] / df.loc[base]
# # Example of performing linear regression on one of the metrics

# print(historical_ratios)
# model = LinearRegression()
# if 'Operating profit (loss) [EBIT]' in historical_ratios:
#     ratios = historical_ratios['Operating profit (loss) [EBIT]'].dropna()  # Drop NaN to avoid fitting errors
#     years = np.array([int(year) for year in ratios.index]).reshape(-1, 1)
#     model.fit(years, ratios.values.reshape(-1, 1))
#     # Predict for a future year, e.g., 2022
#     future_year = np.array([[2022]])
#     forecast_ratio = model.predict(future_year)
#     print(f"Forecasted Ratio for 2022: {forecast_ratio[0][0]}")

# # Use similar logic for other metrics


In [None]:
for metric, ratios in historical_ratios.items():
    print(ratios.index.tolist(), ratios.values.tolist())

In [104]:

# Now apply these dynamic bases to your regression forecasts
future_bases = {
    'Sales': forecasted_values.loc['Sales'],
    'Total assets': forecasted_values.loc['Total assets']  # Assuming 'Total assets' forecast is calculated similarly
}
print(future_bases)
#Calculate historical ratios for regression
historical_ratios = {}
for metric, base in base_mapping.items():
    historical_ratios[metric] = df.loc[metric] / df.loc[base]

historical_ratios=pd.DataFrame.from_dict(historical_ratios)
historical_ratios=historical_ratios.dropna(axis='columns')


# Perform regression and forecast future ratios
forecasts = {}
projected_ratios = {}

yeardf = pd.DataFrame(df.columns)

for metric, ratios in historical_ratios.items():
    model = LinearRegression()
    X = yeardf
    y = ratios
    model.fit(X, y)


    slope = model.coef_[0]
    intercept = model.intercept_
    print(f"The {metric} model is: Value = {slope:.10f} * Year + {intercept:.5f}")
    
    # Create X and y for model fitting
    for year in forecast_years:
        projected_ratio = model.predict(np.array([[int(year[:-1])]]))  # Forecast each future year
        forecasts[metric, year] = projected_ratio * future_bases[base][year]
        projected_ratios[metric, year] = projected_ratio
        

# # Output the forecasted values
# print(future_bases)
# #forecasts
historical_ratios['Costs of employees'], projected_ratios

{'Sales': 2022F    6.065818e+08
2023F    6.258407e+08
2024F    6.450997e+08
2025F    6.643586e+08
2026F    6.836175e+08
2027F    7.028764e+08
Name: Sales, dtype: float64, 'Total assets': 2022F    2.614355e+08
2023F    2.699465e+08
2024F    2.784575e+08
2025F    2.869686e+08
2026F    2.954796e+08
2027F    3.039906e+08
Name: Total assets, dtype: float64}
The Non-current assets model is: Value = 0.0058262648 * Year + -11.03207
The Intangible assets model is: Value = 0.0032323505 * Year + -6.40610
The Tangible fixed assets model is: Value = 0.0012995196 * Year + -2.06065
The Other non-current assets model is: Value = 0.0012943947 * Year + -2.56533
The Current assets model is: Value = -0.0058262648 * Year + 12.03207
The Other current assets model is: Value = 0.0014713789 * Year + -2.90143
The Total assets model is: Value = -0.0000000000 * Year + 1.00000
The Net current assets model is: Value = -0.0068841673 * Year + 13.81792
The Shareholders funds model is: Value = -0.0023855724 * Year + 5.

(2021    0.002091
 2020    0.001630
 2019    0.001503
 2018    0.001251
 2017    0.001227
 2016    0.000929
 2015    0.000951
 2014    0.001167
 2013    0.001197
 2012    0.001253
 2011    0.001315
 2010    0.001375
 2009    0.001399
 2008    0.001504
 2007    0.001661
 2006    0.001831
 2005    0.002005
 2004    0.002224
 2003    0.002478
 2002    0.002839
 2001    0.002990
 2000    0.003406
 1999    0.004157
 1998    0.005155
 1997    0.005792
 1996    0.006473
 1995    0.007372
 Name: Costs of employees, dtype: float64,
 {('Non-current assets', '2022F'): array([0.74863305]),
  ('Non-current assets', '2023F'): array([0.75445932]),
  ('Non-current assets', '2024F'): array([0.76028558]),
  ('Non-current assets', '2025F'): array([0.76611185]),
  ('Non-current assets', '2026F'): array([0.77193811]),
  ('Non-current assets', '2027F'): array([0.77776438]),
  ('Intangible assets', '2022F'): array([0.12971392]),
  ('Intangible assets', '2023F'): array([0.13294627]),
  ('Intangible assets', '

In [35]:
# df = financial_data.copy()

# if df.index.name != 'Financial Metric':
#     dfcopy=df
#     df.set_index('Financial Metric', inplace=True)  # Setting 'Financial Metric' as index

# df.index=df.index.str.strip()
# #df.fillna(method='ffill', inplace=True)
# #df.interpolate(method='linear', axis=1, inplace=True)

# # def controlled_gradient_interpolation(df):
# #     # Column-wise linear interpolation
# #     col_interp = df.interpolate(method='linear', axis=0)

# #     # Calculate relative gradients
# #     col_gradients = col_interp.diff().fillna(method='bfill').fillna(method='ffill')
# #     col_gradients /= col_interp  # Normalize gradients by the value
    
# #     # Damping factor
# #     damping_factor = 0.5

# #     # Row-wise linear interpolation
# #     row_interp = df.interpolate(method='linear', axis=1)
    
# #     # Adjust row interpolation with controlled column gradients
# #     for column in df.columns:
# #         if df[column].isna().any():
# #             # Only adjust NaN entries
# #             nan_indices = df[column].index[df[column].isna()]
# #             for idx in nan_indices:
# #                 if idx in row_interp.index:
# #                     known_idx = df[column].notna().idxmax()  # Index of first non-NaN value
# #                     distance = (df.index.get_loc(idx) - df.index.get_loc(known_idx))
# #                     adjustment = col_gradients.at[idx, column] * distance * damping_factor
# #                     row_interp.at[idx, column] += adjustment

# #     return row_interp

# # # Apply the custom function
# # df = controlled_gradient_interpolation(df)



# df.loc['Costs of employees'], df.loc['Number of employees']
# #financial_data
# df = df.T
# df

## Time Series Regression

## Random Forest

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Assuming df is your DataFrame

# Impute missing values across the DataFrame
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns, index=df.index)

# Print how many were originally missing
print("Missing 'Costs of employees' before imputation:", df['Costs of employees'].isnull().sum())

# Scaling the entire DataFrame
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df_imputed), columns=df.columns, index=df.index)

# Specify the features and target explicitly
features = ['Total assets', 'Sales', 'Operating revenue (Turnover)', 'Added value']  # Example feature set
target = 'Costs of employees'

# Check if there are any missing values left in the target after imputation
print("Missing 'Costs of employees' after imputation:", df_scaled[target].isnull().sum())

# Filter out rows where target is not NaN to train the model
train_data = df_scaled[df_scaled[target].notna()]
X_train = train_data[features]
y_train = train_data[target]

# Fit the model
model = RandomForestRegressor(random_state=0)
model.fit(X_train, y_train)

# Identify rows where the target is NaN to apply predictions
test_data = df_scaled[df_scaled[target].isnull()]
X_test = test_data[features]

# Check how many rows are available for prediction
print("Rows available for prediction:", len(X_test))

# Condition to prevent model prediction error
if not X_test.empty:
    # Predict missing values
    predicted_costs_scaled = model.predict(X_test)

    # Inverse transform the scaled predictions back to original scale
    predicted_costs = scaler.inverse_transform(predicted_costs_scaled.reshape(-1, 1)).flatten()

    # Fill in the missing values in the original DataFrame
    df.loc[df[target].isnull(), target] = predicted_costs
    print("Missing values filled.")
else:
    print("No missing values to fill.")


## Costs of employees only

In [None]:
# import pandas as pd
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.preprocessing import StandardScaler
# from sklearn.impute import SimpleImputer

# # Assuming df is your DataFrame

# # Impute missing values for selected columns only
# columns_to_impute = df.columns.difference(['Costs of employees'])  # All columns except 'Costs of employees'
# imputer = SimpleImputer(strategy='mean')
# df_imputed = df.copy()
# df_imputed[columns_to_impute] = imputer.fit_transform(df[columns_to_impute])

# # Scaling the entire DataFrame
# scaler = StandardScaler()
# df_scaled = pd.DataFrame(scaler.fit_transform(df_imputed), columns=df.columns, index=df.index)

# # Specify the features and target explicitly
# features = ['Total assets', 'Sales', 'Operating revenue (Turnover)', 'Added value']  # Example feature set
# target = 'Costs of employees'

# # Filter out rows where target is not NaN to train the model
# train_data = df_scaled[df_scaled[target].notna()]
# X_train = train_data[features]
# y_train = train_data[target]

# # Fit the model
# model = RandomForestRegressor(random_state=0)
# model.fit(X_train, y_train)

# # Identify rows where the target is NaN to apply predictions
# test_data = df_scaled[df_scaled[target].isnull()]
# X_test = test_data[features]

# # Check how many rows are available for prediction
# print("Rows available for prediction:", len(X_test))

# # Predict missing values if available
# if not X_test.empty:
#     predicted_costs_scaled = model.predict(X_test)
#     predicted_costs = scaler.inverse_transform(predicted_costs_scaled.reshape(-1, 1)).flatten()
#     df.loc[df[target].isnull(), target] = predicted_costs
#     print("Missing values filled.")
# else:
#     print("No missing values to fill.")


## Model trained NaN fill

In [None]:
# import pandas as pd
# from sklearn.impute import SimpleImputer
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.preprocessing import StandardScaler

# # Assuming df is your DataFrame loaded with data

# # Identify columns with missing values
# cols_with_missing = df.columns[df.isnull().any()].tolist()

# # Decide on an imputation strategy for each column
# # Example: Use mean for numerical data and a model for 'Costs of employees'
# imputation_strategies = {
#     col: ('model' if df[col].dtype.kind in 'biufc' else 'model')
#     for col in cols_with_missing if col != 'Costs of employees'
# }
# imputation_strategies['Costs of employees'] = 'model'  # Deciding to use a model for 'Costs of employees'

# # Apply imputation or model prediction
# for col, strategy in imputation_strategies.items():
#     if strategy != 'model':
#         # Simple imputation
#         imputer = SimpleImputer(strategy=strategy)
#         df[col] = imputer.fit_transform(df[[col]])
#     else:
#     # Setup for predictive modeling
#     # Assuming you've already identified features to use
#         features = df.columns.difference([col] + cols_with_missing).tolist()

#     # Ensure the current column is not in the features
#         if col in features:
#             features.remove(col)

#     # Filter data to exclude rows where the target or features have missing values
#         training_data = df.dropna(subset=[col] + features)

#     # Prepare target and features
#         train_features = training_data[features]
#         target = training_data[col]

#     # Scaling features
#         scaler = StandardScaler()
#         train_features_scaled = scaler.fit_transform(train_features)

#     # Model fitting
#         model = RandomForestRegressor(random_state=0)
#         model.fit(train_features_scaled, target)

#     # Prepare features for prediction
#         mask = df[col].isnull()  # Rows where the target is missing
#         test_features = df.loc[mask, features]
#         test_features_scaled = scaler.transform(test_features)

#     # Predicting missing values
#         predicted_values = model.predict(test_features_scaled)

#     # Fill in the missing values
#         df.loc[mask, col] = predicted_values


# print("Missing values handled for columns:", cols_with_missing)
# print(df.info())