# Cleaning Financial Data

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
df_revenue = pd.read_excel('data/total_revenue.xlsx')
df_ebitda = pd.read_excel('data/ebitda.xlsx')
df_cogs = pd.read_excel('data/cogs.xlsx')

df_cogs

Unnamed: 0,Company Name,CY2016 Cost Of Goods Sold,CY2017 Cost Of Goods Sold,CY2018 Cost Of Goods Sold,CY2019 Cost Of Goods Sold,CY2020 Cost Of Goods Sold,CY2021 Cost Of Goods Sold,CY2022 Cost Of Goods Sold
0,ABB Ltd (SWX:ABBN),17270,17278,19059,19018,18123,19407,19712
1,"Amazon.com, Inc. (NasdaqGS:AMZN)",88265,111934,139156,165536,-,272344,288831
2,"Cisco Systems, Inc. (NasdaqGS:CSCO)",17663,18121,19225,18575,17206,18918,20242
3,Intel Corporation (NasdaqGS:INTC),22767,23663,27111,29825,34255,35209,36188
4,Kyocera Corporation (TSE:6971),6945.44,7520.28,8239.57,7702.25,7392.65,8551.53,9614.2
...,...,...,...,...,...,...,...,...
267,"Veoneer, Inc.",1794,1857,1798,1591,1191,1384,-
268,"Shenzhen Transsion Holdings Co., Ltd. (SHSE:68...",1266.49,2175.57,2349.93,2532.39,3970.87,5344.51,5038.19
269,Giesecke+Devrient Currency Technology Gb Ltd,-,-,-,-,-,-,-
270,Lutron Electronics Gmbh,-,-,-,-,-,-,-


In [None]:
na_revenue = (df_revenue == "-").any(axis=1).sum()
na_ebitda = (df_ebitda == "-").any(axis=1).sum()
na_cogs = (df_cogs =="-").any(axis=1).sum()

na_cogs-na_revenue

In [3]:
 # Create a Boolean mask for rows with "-" values
mask_revenue = (df_revenue == "-").any(axis=1)
mask_ebitda = (df_ebitda == "-").any(axis=1)
mask_cogs = (df_cogs == "-").any(axis=1)

sum(mask_cogs!=mask_revenue)

36

In [4]:
# Drop the rows with "-" values from the original DataFrame
df_revenue = df_revenue[~mask_cogs]
df_ebitda = df_ebitda[~mask_cogs]
df_cogs = df_cogs[~mask_cogs]

In [5]:
mask_ebitda = (df_ebitda == "-").any(axis=1)
df_revenue = df_revenue[~mask_ebitda]
df_ebitda = df_ebitda[~mask_ebitda]
df_cogs = df_cogs[~mask_ebitda]
#Save the new clean df in the directory
df_revenue.to_csv('clean_df_revenue.csv', index=False)
df_ebitda.to_csv('clean_df_ebitda.csv', index=False)
df_cogs.to_csv('clean_df_cogs.csv', index=False)

In [6]:
new_column_names = ['Company name', '2016', '2017', '2018', '2019', '2020', '2021', '2022']

# Assign new column names to the DataFrame
df_cogs.columns = new_column_names
df_cogs

Unnamed: 0,Company name,2016,2017,2018,2019,2020,2021,2022
0,ABB Ltd (SWX:ABBN),17270,17278,19059,19018,18123,19407,19712
2,"Cisco Systems, Inc. (NasdaqGS:CSCO)",17663,18121,19225,18575,17206,18918,20242
3,Intel Corporation (NasdaqGS:INTC),22767,23663,27111,29825,34255,35209,36188
4,Kyocera Corporation (TSE:6971),6945.44,7520.28,8239.57,7702.25,7392.65,8551.53,9614.2
5,Microsoft Corporation (NasdaqGS:MSFT),33446,35447,41339,43346,48510,57642,64984
...,...,...,...,...,...,...,...,...
260,Fortive Corporation (NYSE:FTV),2685.9,2832.7,1612.6,2066.9,2017.7,2244.8,2462.3
261,Xiaomi Corporation (SEHK:1810),8364.29,13598.16,20878.13,24235.77,28586.98,36917.05,31779.47
262,Adient plc (NYSE:ADNT),14992,15077,16464,15420,11948,12692,13507
266,"Resideo Technologies, Inc. (NYSE:REZI)",3090,3203,3404,3698,3727,4262,4604


In [7]:
new_df_cogs = []
for x in df_cogs["Company name"]:
    for y in list(df_cogs.columns)[1:]:
        row = [x, y, df_cogs[["Company name", y]][(df_cogs["Company name"] == x)].values[0][1]]
        new_df_cogs.append(row)

new_df_cogs = pd.DataFrame(new_df_cogs, columns = ["Company", "Year", "Value"])
new_df_cogs       

Unnamed: 0,Company,Year,Value
0,ABB Ltd (SWX:ABBN),2016,17270.00
1,ABB Ltd (SWX:ABBN),2017,17278.00
2,ABB Ltd (SWX:ABBN),2018,19059.00
3,ABB Ltd (SWX:ABBN),2019,19018.00
4,ABB Ltd (SWX:ABBN),2020,18123.00
...,...,...,...
1234,"Shenzhen Transsion Holdings Co., Ltd. (SHSE:68...",2018,2349.93
1235,"Shenzhen Transsion Holdings Co., Ltd. (SHSE:68...",2019,2532.39
1236,"Shenzhen Transsion Holdings Co., Ltd. (SHSE:68...",2020,3970.87
1237,"Shenzhen Transsion Holdings Co., Ltd. (SHSE:68...",2021,5344.51


In [8]:
new_df_cogs['Year'] = new_df_cogs['Year'].astype(int)  # Convert 'Year' to integers
new_df_cogs['Value'] = pd.to_numeric(new_df_cogs['Value'], errors='coerce')  # Convert 'Value' to numeric, handle non-numeric values as NaN

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn import preprocessing
from scipy import stats
import statsmodels.api as sm

In [11]:
grouped = new_df_cogs.groupby('Company')

In [12]:
# Iterate over each group and create a regression model
ols_models = {}
for name, group in grouped:
    X = sm.add_constant(group['Year'])  # Add a constant term for the intercept
    y = group['Value']

    # Fit the OLS model
    model = sm.OLS(y, X).fit()

    # Store the model in the dictionary with the company name as the key
    ols_models[name] = model

In [23]:
companies = df_cogs["Company name"]
companies = companies.tolist()

['ABB Ltd (SWX:ABBN)',
 'Cisco Systems, Inc. (NasdaqGS:CSCO)',
 'Intel Corporation (NasdaqGS:INTC)',
 'Kyocera Corporation (TSE:6971)',
 'Microsoft Corporation (NasdaqGS:MSFT)',
 'Oracle Corporation (NYSE:ORCL)',
 'Roper Technologies, Inc. (NasdaqGS:ROP)',
 'Sony Group Corporation (TSE:6758)',
 'Thales S.A. (ENXTPA:HO)',
 'Apple Inc. (NasdaqGS:AAPL)',
 'ASUSTeK Computer Inc. (TWSE:2357)',
 'BorgWarner Inc. (NYSE:BWA)',
 'Ciena Corporation (NYSE:CIEN)',
 'CommScope Holding Company, Inc. (NasdaqGS:COMM)',
 'Alphabet Inc. (NasdaqGS:GOOGL)',
 'Intuitive Surgical, Inc. (NasdaqGS:ISRG)',
 'Itron, Inc. (NasdaqGS:ITRI)',
 'Juniper Networks, Inc. (NYSE:JNPR)',
 'Lam Research Corporation (NasdaqGS:LRCX)',
 'Lear Corporation (NYSE:LEA)',
 'Mattel, Inc. (NasdaqGS:MAT)',
 'Medtronic plc (NYSE:MDT)',
 'Mettler-Toledo International Inc. (NYSE:MTD)',
 'AMETEK, Inc. (NYSE:AME)',
 'NEC Corporation (TSE:6701)',
 'NetApp, Inc. (NasdaqGS:NTAP)',
 'Textron Inc. (NYSE:TXT)',
 'Westinghouse Air Brake Technolo

In [26]:
new_cogs = []
for company, group in grouped:
    model = ols_models.get(company)
    year_prediction = []
    for year in [2023, 2024, 2025, 2026, 2027]:
        prediction = model.predict([1, year])  # Use .predict() to make predictions
        year_prediction.append(prediction[0])  # The prediction is a one-element array, so extract the value
    new_cogs.append(year_prediction)

new_cogs = pd.DataFrame(new_cogs, columns = ["2023", "2024", "2025", "2026", "2027"])
new_cogs.insert(0, "Company name", companies)
new_cogs

Unnamed: 0,Company name,2023,2024,2025,2026,2027
0,ABB Ltd (SWX:ABBN),20073.571429,20453.857143,20834.142857,21214.428571,21594.714286
1,"Cisco Systems, Inc. (NasdaqGS:CSCO)",4047.158571,4251.738929,4456.319286,4660.899643,4865.480000
2,Intel Corporation (NasdaqGS:INTC),11603.825714,12707.399643,13810.973571,14914.547500,16018.121429
3,Kyocera Corporation (TSE:6971),13105.228571,13559.108571,14012.988571,14466.868571,14920.748571
4,Microsoft Corporation (NasdaqGS:MSFT),7569.530000,7371.533571,7173.537143,6975.540714,6777.544286
...,...,...,...,...,...,...
172,Fortive Corporation (NYSE:FTV),41278.860000,45728.580357,50178.300714,54628.021071,59077.741429
173,Xiaomi Corporation (SEHK:1810),3608.428571,3752.678571,3896.928571,4041.178571,4185.428571
174,Adient plc (NYSE:ADNT),1504.857143,1504.372500,1503.887857,1503.403214,1502.918571
175,"Resideo Technologies, Inc. (NYSE:REZI)",3289.142857,3498.321429,3707.500000,3916.678571,4125.857143


In [28]:
merged_cogs = pd.merge(df_cogs, new_cogs, on='Company name', how='inner')
merged_cogs

Unnamed: 0,Company name,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025,2026,2027
0,ABB Ltd (SWX:ABBN),17270,17278,19059,19018,18123,19407,19712,20073.571429,20453.857143,20834.142857,21214.428571,21594.714286
1,"Cisco Systems, Inc. (NasdaqGS:CSCO)",17663,18121,19225,18575,17206,18918,20242,4047.158571,4251.738929,4456.319286,4660.899643,4865.480000
2,Intel Corporation (NasdaqGS:INTC),22767,23663,27111,29825,34255,35209,36188,11603.825714,12707.399643,13810.973571,14914.547500,16018.121429
3,Kyocera Corporation (TSE:6971),6945.44,7520.28,8239.57,7702.25,7392.65,8551.53,9614.2,13105.228571,13559.108571,14012.988571,14466.868571,14920.748571
4,Microsoft Corporation (NasdaqGS:MSFT),33446,35447,41339,43346,48510,57642,64984,7569.530000,7371.533571,7173.537143,6975.540714,6777.544286
...,...,...,...,...,...,...,...,...,...,...,...,...,...
172,Fortive Corporation (NYSE:FTV),2685.9,2832.7,1612.6,2066.9,2017.7,2244.8,2462.3,41278.860000,45728.580357,50178.300714,54628.021071,59077.741429
173,Xiaomi Corporation (SEHK:1810),8364.29,13598.16,20878.13,24235.77,28586.98,36917.05,31779.47,3608.428571,3752.678571,3896.928571,4041.178571,4185.428571
174,Adient plc (NYSE:ADNT),14992,15077,16464,15420,11948,12692,13507,1504.857143,1504.372500,1503.887857,1503.403214,1502.918571
175,"Resideo Technologies, Inc. (NYSE:REZI)",3090,3203,3404,3698,3727,4262,4604,3289.142857,3498.321429,3707.500000,3916.678571,4125.857143


### Final cogs data set, by company and by year with ols forecasts till 2027

In [29]:
cogs_final = []

for x in merged_cogs["Company name"]:
    for y in list(merged_cogs.columns)[1:]:
        row = [x, y, merged_cogs[["Company name", y]][(merged_cogs["Company name"] == x)].values[0][1]]
        cogs_final.append(row)

cogs_final = pd.DataFrame(cogs_final, columns = ["Company", "Year", "Value"])
cogs_final     

Unnamed: 0,Company,Year,Value
0,ABB Ltd (SWX:ABBN),2016,17270.000000
1,ABB Ltd (SWX:ABBN),2017,17278.000000
2,ABB Ltd (SWX:ABBN),2018,19059.000000
3,ABB Ltd (SWX:ABBN),2019,19018.000000
4,ABB Ltd (SWX:ABBN),2020,18123.000000
...,...,...,...
2119,"Shenzhen Transsion Holdings Co., Ltd. (SHSE:68...",2023,35119.290000
2120,"Shenzhen Transsion Holdings Co., Ltd. (SHSE:68...",2024,35600.404286
2121,"Shenzhen Transsion Holdings Co., Ltd. (SHSE:68...",2025,36081.518571
2122,"Shenzhen Transsion Holdings Co., Ltd. (SHSE:68...",2026,36562.632857


In [33]:
cogs_final.to_csv('cogs_final.csv', index=False)