# Clean the S&P financial dataset

In [2]:
# Import necessary packages
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn import preprocessing
from scipy import stats
import statsmodels.api as sm

## Import and read financial data

In [3]:
# Path to the financials Excel workbook, relative to the notebook's working
# directory; it is passed to pd.read_excel when the sheets are loaded.
excel_file = "../input_data/financials.xls"

In [4]:
# Load the three sheets used below. For each sheet, header=13 selects the row
# that supplies the column labels and usecols limits the spreadsheet columns read.

# 'TOTAL REVENUE' sheet, spreadsheet columns A through M
df_revenue = pd.read_excel(
    io=excel_file,
    sheet_name="TOTAL REVENUE",
    header=13,
    usecols="A:M",
)

# 'EBITDA' sheet, spreadsheet columns A through M
df_ebitda = pd.read_excel(
    io=excel_file,
    sheet_name="EBITDA",
    header=13,
    usecols="A:M",
)

# 'Cost of Goods Sold' sheet, spreadsheet columns A through H
df_cogs = pd.read_excel(
    io=excel_file,
    sheet_name="Cost of Goods Sold",
    header=13,
    usecols="A:H",
)

## Identify and remove missing values

In [5]:
# Flag every COGS row in which at least one cell holds the "-" placeholder
# that marks a missing value.
mask_cogs = df_cogs.eq("-").any(axis="columns")

In [6]:
# Collect the flagged COGS rows and the names of the affected companies
filtered_rows_cogs = df_cogs.loc[mask_cogs]
company_names_cogs = filtered_rows_cogs.loc[:, 'Company Name']

In [7]:
# Drop the rows flagged by mask_cogs from all three datasets. The mask was
# built from df_cogs, so the same row labels are removed from each frame.
df_revenue, df_ebitda, df_cogs = (
    frame[~mask_cogs] for frame in (df_revenue, df_ebitda, df_cogs)
)

In [8]:
# Flag EBITDA rows that hold the "-" placeholder anywhere in the first eight
# columns (the company name plus the years before 2023)
mask_ebitda = df_ebitda.iloc[:, :8].eq("-").any(axis="columns")
filtered_rows_ebitda = df_ebitda.loc[mask_ebitda]
company_names_ebitda = filtered_rows_ebitda.loc[:, 'Company Name']

In [9]:
# Drop the rows flagged by mask_ebitda from all three datasets so they keep
# covering the same set of companies.
df_revenue, df_ebitda, df_cogs = (
    frame[~mask_ebitda] for frame in (df_revenue, df_ebitda, df_cogs)
)

## Revenue - use OLS to predict missing values for 2023-2027

In [10]:
# Label the revenue columns: the company name followed by one string label
# per year from 2016 through 2027.
col_names = ['Company name'] + [str(year) for year in range(2016, 2028)]
df_revenue.columns = col_names

In [11]:
# Reshape revenue dataframe for OLS regression.
# Only the 2016-2022 columns are kept; the result has one row per
# (company, year) with columns "Company", "Year", "Revenue".
ols_revenue_data = df_revenue[['Company name', '2016', '2017', '2018', '2019', '2020', '2021', '2022']]
revenue_history_years = list(ols_revenue_data.columns)[1:]

# Walk each row exactly once instead of re-scanning the whole "Company name"
# column for every cell: linear instead of quadratic, and every row
# contributes its own values even if a company name appears more than once.
new_df_revenue = pd.DataFrame(
    [
        [row[0], year, value]
        for row in ols_revenue_data.itertuples(index=False, name=None)
        for year, value in zip(revenue_history_years, row[1:])
    ],
    columns=["Company", "Year", "Revenue"],
)

In [12]:
# Prepare the long-format revenue data for regression:
#   - 'Year' becomes an integer column
#   - 'Value' holds a numeric copy of 'Revenue' (unparseable entries become NaN)
new_df_revenue = new_df_revenue.assign(
    Year=lambda frame: frame['Year'].astype(int),
    Value=lambda frame: pd.to_numeric(frame['Revenue'], errors='coerce'),
)
# One group per company
grouped_rev = new_df_revenue.groupby(by='Company')

In [13]:
# Fit one OLS model (revenue ~ const + year) per company; the fitted models
# are later used to predict revenue for 2023-2027.
ols_models = {}
for name, group in grouped_rev:
    # Regressors: an intercept column plus the year
    X = sm.add_constant(group['Year'])
    # Use the numeric 'Value' column prepared in the previous cell, not the raw
    # 'Revenue' column, which can still contain the "-" placeholder as text.
    y = group['Value']
    # missing='drop' ignores years whose revenue could not be parsed (NaN)
    # instead of letting them poison the fit.
    model = sm.OLS(y, X, missing='drop').fit()
    ols_models[name] = model

In [14]:
# Forecast revenue for 2023-2027 from each company's fitted model
companies = df_revenue["Company name"].tolist()
new_rev = []
for company in companies:
    model = ols_models.get(company)
    year_prediction = []
    for year in [2023, 2024, 2025, 2026, 2027]:
        # The regressor row is [constant, year]; keep the first element of the
        # returned prediction.
        year_prediction.append(model.predict([1, year])[0])
    new_rev.append(year_prediction)

In [15]:
# Tabulate the predicted revenue: one row per company, one column per forecast
# year, with the company name placed in the first column.
new_rev_df = pd.DataFrame(
    data=new_rev,
    columns=[str(year) for year in range(2023, 2028)],
)
new_rev_df.insert(loc=0, column="Company name", value=companies)

In [16]:
# Join the 2016-2022 revenue with the 2023-2027 predictions on the company name
merged_rev = ols_revenue_data.merge(new_rev_df, how='inner', on='Company name')

In [17]:
# Create final revenue dataframe with all OLS predictions, in long format:
# one row per (company, year) with columns "Company", "Year", "Revenue".
merged_rev_years = list(merged_rev.columns)[1:]

# Walk each row exactly once instead of re-scanning the whole "Company name"
# column for every cell: linear instead of quadratic, and every row
# contributes its own values even if a company name appears more than once.
df_revenue_complete = pd.DataFrame(
    [
        [row[0], year, value]
        for row in merged_rev.itertuples(index=False, name=None)
        for year, value in zip(merged_rev_years, row[1:])
    ],
    columns=["Company", "Year", "Revenue"],
)

In [18]:
# Reshape the reported revenue (df_revenue, all year columns) into long format:
# one row per (company, year) with columns "Company", "Year", "Revenue".
# Values are copied as they are; any "-" placeholders are still present here.
revenue_all_years = list(df_revenue.columns)[1:]

# Walk each row exactly once instead of re-scanning the whole "Company name"
# column for every cell: linear instead of quadratic, and every row
# contributes its own values even if a company name appears more than once.
df_revenue_real = pd.DataFrame(
    [
        [row[0], year, value]
        for row in df_revenue.itertuples(index=False, name=None)
        for year, value in zip(revenue_all_years, row[1:])
    ],
    columns=["Company", "Year", "Revenue"],
)

In [19]:
# Substitute missing values in the initial dataframe with OLS predictions.
# Both long-format frames carry the default 0..n-1 row labels, so a row label
# of df_revenue_real addresses the corresponding row of df_revenue_complete.
missing_revenue_rows = df_revenue_real.index[df_revenue_real["Revenue"] == "-"]

# Write through a single .loc call rather than df["Revenue"][i] = ...:
# chained assignment can modify a temporary copy (SettingWithCopyWarning,
# and always under pandas Copy-on-Write), leaving the "-" placeholders in place.
df_revenue_real.loc[missing_revenue_rows, "Revenue"] = df_revenue_complete.loc[
    missing_revenue_rows, "Revenue"
]

In [20]:
# Write the long-format revenue table to CSV, leaving out the row index
df_revenue_real.to_csv(path_or_buf='../clean_data/revenue_complete.csv', index=False)

## EBITDA - use OLS to predict missing values for 2023-2027

In [21]:
# Rename columns for the EBITDA dataframe: reuse col_names (company name plus
# the year labels '2016'-'2027') so EBITDA shares the revenue column labels.
df_ebitda.columns = col_names

In [22]:
# Reshape EBITDA dataframe for OLS regression.
# Only the 2016-2022 columns are kept; the result has one row per
# (company, year) with columns "Company", "Year", "Ebitda".
ols_ebitda_data = df_ebitda[['Company name', '2016', '2017', '2018', '2019', '2020', '2021', '2022']]
ebitda_history_years = list(ols_ebitda_data.columns)[1:]

# Walk each row exactly once instead of re-scanning the whole "Company name"
# column for every cell: linear instead of quadratic, and every row
# contributes its own values even if a company name appears more than once.
new_df_ebitda = pd.DataFrame(
    [
        [row[0], year, value]
        for row in ols_ebitda_data.itertuples(index=False, name=None)
        for year, value in zip(ebitda_history_years, row[1:])
    ],
    columns=["Company", "Year", "Ebitda"],
)

In [23]:
# Prepare the long-format EBITDA data for regression:
#   - 'Year' becomes an integer column
#   - 'Ebitda' is converted to numeric (unparseable entries become NaN)
new_df_ebitda = new_df_ebitda.assign(
    Year=lambda frame: frame['Year'].astype(int),
    Ebitda=lambda frame: pd.to_numeric(frame['Ebitda'], errors='coerce'),
)
# One group per company
grouped_ebi = new_df_ebitda.groupby(by='Company')

In [24]:
# Fit one OLS model (EBITDA ~ const + year) per company, keyed by company name
ols_models_ebi = {
    company_name: sm.OLS(
        company_rows['Ebitda'],
        sm.add_constant(company_rows['Year']),
    ).fit()
    for company_name, company_rows in grouped_ebi
}

In [25]:
# Forecast EBITDA for 2023-2027 from each company's fitted model.
# The company list is the one built in the revenue section.
new_ebi = []
for company in companies:
    model = ols_models_ebi.get(company)
    year_prediction = []
    for year in [2023, 2024, 2025, 2026, 2027]:
        # The regressor row is [constant, year]; keep the first element of the
        # returned prediction.
        year_prediction.append(model.predict([1, year])[0])
    new_ebi.append(year_prediction)

In [26]:
# Tabulate the predicted EBITDA: one row per company, one column per forecast
# year, with the company name placed in the first column.
new_ebi_df = pd.DataFrame(
    data=new_ebi,
    columns=[str(year) for year in range(2023, 2028)],
)
new_ebi_df.insert(loc=0, column="Company name", value=companies)

In [27]:
# Join the 2016-2022 EBITDA with the 2023-2027 predictions on the company name
merged_ebi = ols_ebitda_data.merge(new_ebi_df, how='inner', on='Company name')

In [28]:
# Create EBITDA dataframe with all OLS predictions for 2023-2027, in long
# format: one row per (company, year) with columns "Company", "Year", "Ebitda".
merged_ebi_years = list(merged_ebi.columns)[1:]

# Walk each row exactly once instead of re-scanning the whole "Company name"
# column for every cell: linear instead of quadratic, and every row
# contributes its own values even if a company name appears more than once.
df_ebitda_complete = pd.DataFrame(
    [
        [row[0], year, value]
        for row in merged_ebi.itertuples(index=False, name=None)
        for year, value in zip(merged_ebi_years, row[1:])
    ],
    columns=["Company", "Year", "Ebitda"],
)

In [29]:
# Reshape the reported EBITDA (df_ebitda, all year columns) into long format:
# one row per (company, year) with columns "Company", "Year", "Ebitda".
# Values are copied as they are; any "-" placeholders are still present here.
ebitda_all_years = list(df_ebitda.columns)[1:]

# Walk each row exactly once instead of re-scanning the whole "Company name"
# column for every cell: linear instead of quadratic, and every row
# contributes its own values even if a company name appears more than once.
df_ebitda_real = pd.DataFrame(
    [
        [row[0], year, value]
        for row in df_ebitda.itertuples(index=False, name=None)
        for year, value in zip(ebitda_all_years, row[1:])
    ],
    columns=["Company", "Year", "Ebitda"],
)

In [30]:
# Substitute missing values in the initial EBITDA dataframe with OLS predictions.
# Both long-format frames carry the default 0..n-1 row labels, so a row label
# of df_ebitda_real addresses the corresponding row of df_ebitda_complete.
missing_ebitda_rows = df_ebitda_real.index[df_ebitda_real["Ebitda"] == "-"]

# Write through a single .loc call rather than df["Ebitda"][i] = ...:
# chained assignment can modify a temporary copy (SettingWithCopyWarning,
# and always under pandas Copy-on-Write), leaving the "-" placeholders in place.
df_ebitda_real.loc[missing_ebitda_rows, "Ebitda"] = df_ebitda_complete.loc[
    missing_ebitda_rows, "Ebitda"
]

In [31]:
# Write the long-format EBITDA table to CSV, leaving out the row index
df_ebitda_real.to_csv(path_or_buf='../clean_data/ebitda_complete.csv', index=False)

## COGS - use OLS to predict values for 2023-2027

In [32]:
# Column labels for the COGS dataframe: the company name followed by one
# string label per year from 2016 through 2022.
new_column_names = ['Company name'] + [str(year) for year in range(2016, 2023)]

# Apply the labels to the COGS dataframe
df_cogs.columns = new_column_names

In [33]:
# Reshape the COGS data into a list of [company, year, value] records, one per
# (company, year) pair; the next cell turns this list into a DataFrame.
cogs_history_years = list(df_cogs.columns)[1:]

# Walk each row exactly once instead of re-scanning the whole "Company name"
# column for every cell: linear instead of quadratic, and every row
# contributes its own values even if a company name appears more than once.
new_df_cogs = [
    [row[0], year, value]
    for row in df_cogs.itertuples(index=False, name=None)
    for year, value in zip(cogs_history_years, row[1:])
]

In [34]:
# Build the long-format COGS frame and fix its dtypes in one chain:
#   - 'Year' becomes an integer column for the regression
#   - 'Value' is converted to numeric (unparseable entries become NaN)
new_df_cogs = pd.DataFrame(new_df_cogs, columns=["Company", "Year", "Value"]).assign(
    Year=lambda frame: frame['Year'].astype(int),
    Value=lambda frame: pd.to_numeric(frame['Value'], errors='coerce'),
)

# One group per company, so each company gets its own regression model
grouped = new_df_cogs.groupby(by='Company')

In [35]:
# Fit one OLS model (COGS ~ const + year) per company, keyed by company name.
# Note: this rebinds ols_models, the same name the revenue section used for
# its models.
ols_models = {
    company_name: sm.OLS(
        # Dependent variable: the numeric COGS value
        company_rows['Value'],
        # Regressors: an intercept column plus the year
        sm.add_constant(company_rows['Year']),
    ).fit()
    for company_name, company_rows in grouped
}

In [36]:
# Company names in COGS row order (taken as-is from the column)
companies = df_cogs["Company name"].tolist()

# Forecast COGS for 2023-2027: one inner list of five predictions per company.
# The regressor row is [constant, year]; the first element of the returned
# prediction is kept.
new_cogs = [
    [
        ols_models.get(company).predict([1, year])[0]
        for year in [2023, 2024, 2025, 2026, 2027]
    ]
    for company in companies
]

In [37]:
# Tabulate the predictions: one row per company, one column per forecast year
new_cogs = pd.DataFrame(
    data=new_cogs,
    columns=[str(year) for year in range(2023, 2028)],
)

# Put the company name in the first column so the frame can be joined on it
new_cogs.insert(loc=0, column="Company name", value=companies)

# Join the 2016-2022 COGS with the 2023-2027 predictions on the company name
merged_cogs = df_cogs.merge(new_cogs, how='inner', on='Company name')

In [38]:
# Restructure the merged COGS data (reported years plus predictions) into long
# format: one row per (company, year) with columns "Company", "Year", "Value".
merged_cogs_years = list(merged_cogs.columns)[1:]

# Walk each row exactly once instead of re-scanning the whole "Company name"
# column for every cell: linear instead of quadratic, and every row
# contributes its own values even if a company name appears more than once.
cogs_final = pd.DataFrame(
    [
        [row[0], year, value]
        for row in merged_cogs.itertuples(index=False, name=None)
        for year, value in zip(merged_cogs_years, row[1:])
    ],
    columns=["Company", "Year", "Value"],
)

In [39]:
# Write the long-format COGS table to CSV, leaving out the row index
cogs_final.to_csv(path_or_buf='../clean_data/cogs_complete_test.csv', index=False)

In [41]:
# Distinct company names from the final COGS dataframe, in order of first appearance
unique_companies_new = cogs_final['Company'].unique()

# Store them in a one-column dataframe and write it to CSV without the row index
df_unique_companies = pd.DataFrame({'Company': unique_companies_new})
df_unique_companies.to_csv(path_or_buf='../clean_data/unique_companies.csv', index=False)