# Get the full dataset

### Imports

In [2]:
import pandas as pd
import numpy as np

## Merging both datasets

In [3]:
# Load the cleaned spend export and sum it down to one row per
# (company, device, market, submarket, period). The source header for the
# submarket column has a leading space (" SubMarket"), which is normalised here.
cleaned_data = (
    pd.read_csv("../clean_data/cleaned_spend_data.csv", index_col=0)
    .groupby(["CompanyName", "Device", "Market", " SubMarket", "PeriodCode"])
    .sum()
    .reset_index()
    .rename(columns={" SubMarket": "SubMarket", "PeriodCode": "Year"})
)
# Drop the first character of each period code and keep the remainder as an integer year.
cleaned_data["Year"] = cleaned_data["Year"].map(lambda period_code: int(period_code[1:]))
cleaned_data

Unnamed: 0,CompanyName,Device,Market,SubMarket,Year,Value
0,ABB,Amplifier/Comparator,Industrial,Manufacturing Equipment,2018,7.951500
1,ABB,Amplifier/Comparator,Industrial,Manufacturing Equipment,2019,8.616500
2,ABB,Amplifier/Comparator,Industrial,Manufacturing Equipment,2020,8.565350
3,ABB,Amplifier/Comparator,Industrial,Manufacturing Equipment,2021,12.353845
4,ABB,Amplifier/Comparator,Industrial,Manufacturing Equipment,2022,13.879158
...,...,...,...,...,...,...
36010,iRobot,Voltage Regulator/Reference,Consumer,Appliance,2020,10.295800
36011,iRobot,Voltage Regulator/Reference,Consumer,Appliance,2021,17.639800
36012,iRobot,Voltage Regulator/Reference,Consumer,Appliance,2022,23.717500
36013,iRobot,Voltage Regulator/Reference,Consumer,Appliance,2023,23.067700


In [4]:
# Load the raw market-size table and display it.
market_spend = pd.read_csv("../input_data/market_data.csv")
market_spend

Unnamed: 0,Market,Submarket,Device,Year,Region,Market size
0,Automotive Electronics Categories,ADAS,Amplifier/Comparator,2008,Americas,0.424
1,Automotive Electronics Categories,ADAS,Amplifier/Comparator,2008,Asia & Oceania (excl. Japan),0.246
2,Automotive Electronics Categories,ADAS,Amplifier/Comparator,2008,EMEA,0.678
3,Automotive Electronics Categories,ADAS,Amplifier/Comparator,2008,Japan,0.421
4,Automotive Electronics Categories,ADAS,Amplifier/Comparator,2008,Worldwide,1.769
...,...,...,...,...,...,...
32495,Wireless Communications Categories,Wireless LAN Equipment,Voltage Regulator/Reference,2027,Americas,0.000
32496,Wireless Communications Categories,Wireless LAN Equipment,Voltage Regulator/Reference,2027,Asia & Oceania (excl. Japan),0.000
32497,Wireless Communications Categories,Wireless LAN Equipment,Voltage Regulator/Reference,2027,EMEA,0.000
32498,Wireless Communications Categories,Wireless LAN Equipment,Voltage Regulator/Reference,2027,Japan,0.000


In [5]:
# Lookup table used by `remapping`: keys are (Market, Submarket) pairs as they
# appear in the market-size data, values are the (Market, SubMarket) pair they
# are renamed to. Several source submarkets collapse onto the same target
# (e.g. 'LCD TV' and 'OLED TV' both become ('Consumer', 'TV')).
mapping_dict = {('Automotive Electronics Categories', 'ADAS'): ('Automotive', 'Auto ADAS'),
    ('Automotive Electronics Categories', 'Body & Convenience'): ('Automotive', 'Other Auto & Aftermarket'),
    ('Automotive Electronics Categories', 'Chassis & Safety'): ('Automotive', 'Other Auto & Aftermarket'),
    ('Automotive Electronics Categories', 'Connectivity & Telematics'): ('Automotive', 'Other Auto & Aftermarket'),
    ('Automotive Electronics Categories', 'Hybrid & Electric Drive Train'): ('Automotive', 'Other Auto & Aftermarket'),
    ('Automotive Electronics Categories', 'Infotainment & Cluster'): ('Automotive', 'Auto Infotainment'),
    ('Automotive Electronics Categories', 'Other Automotive'): ('Automotive', 'Other Auto & Aftermarket'),
    ('Automotive Electronics Categories', 'Powertrain & Vehicle Dynamics'): ('Automotive', 'Auto Powertrain'),
    ('Computing & Data Storage Categories', 'Data Center Servers'): ('Computer Platforms', 'Data Center Servers'),
    ('Computing & Data Storage Categories', 'Desktop PCs'): ('Computer Platforms', 'Desktop PCs'),
    ('Computing & Data Storage Categories', 'Flash Storage Cards'): ('Computer Peripherals & Storage', 'Flash Cards/Drives'),
    ('Computing & Data Storage Categories', 'Hard Disk Drives'): ('Computer Peripherals & Storage', 'HDD'),
    ('Computing & Data Storage Categories', 'Notebook PCs'): ('Computer Platforms', 'Notebook PCs'),
    ('Computing & Data Storage Categories', 'Other Computing'): ('Computer Platforms', 'Other Computer Products'),
    ('Computing & Data Storage Categories', 'Other Data Storage'): ('Computer Peripherals & Storage', 'Other Storage'),
    ('Computing & Data Storage Categories', 'Other Peripherals'): ('Computer Peripherals & Storage', 'Other Peripherals'),
    ('Computing & Data Storage Categories', 'Smart Cards'): ('Computer Peripherals & Storage', 'Smart Cards'),
    ('Computing & Data Storage Categories', 'Solid-State Drives'): ('Computer Peripherals & Storage', 'Flash Cards/Drives'),
    ('Computing & Data Storage Categories', 'Tablet PCs'): ('Computer Platforms', 'Tablet PCs'),
    ('Computing & Data Storage Categories', 'USB Flash Drive'): ('Computer Peripherals & Storage', 'Flash Cards/Drives'),
    ('Consumer Electronics Categories', 'Fitness & Wellness Wearable Electronics'): ('Consumer', 'Other Consumer'),
    ('Consumer Electronics Categories', 'LCD TV'): ('Consumer', 'TV'),
    ('Consumer Electronics Categories', 'Major Home Appliances'): ('Consumer', 'Appliance'),
    ('Consumer Electronics Categories', 'OLED TV'): ('Consumer', 'TV'),
    ('Consumer Electronics Categories', 'Other Audio/Video'): ('Consumer', 'Audio'),
    ('Consumer Electronics Categories', 'Other Consumer Electronics'): ('Consumer', 'Other Consumer'),
    ('Consumer Electronics Categories', 'Set-Top Boxes'): ('Consumer', 'STB'),
    ('Consumer Electronics Categories', 'Smart Speakers & Digital Assistants'): ('Consumer', 'Connected Consumer'),
    ('Consumer Electronics Categories', 'Smart Watches'): ('Consumer', 'Other Consumer'),
    ('Consumer Electronics Categories', 'VR Headsets'): ('Consumer', 'Other Consumer'),
    ('Consumer Electronics Categories', 'Video Game Consoles'): ('Consumer', 'Video Games'),
    ('Industrial Electronics Categories', 'Automation'): ('Industrial', 'Other Industrial'),
    ('Industrial Electronics Categories', 'Building & Home Control'): ('Industrial', 'Other Industrial'),
    ('Industrial Electronics Categories', 'Lighting'): ('Industrial', 'Other Industrial'),
    ('Industrial Electronics Categories', 'Medical Electronics'): ('Industrial', 'Medical'),
    ('Industrial Electronics Categories', 'Military & Civil Aerospace'): ('Industrial', 'Military/Aerospace'),
    ('Industrial Electronics Categories', 'Other Industrial'): ('Industrial', 'Other Industrial'),
    ('Industrial Electronics Categories', 'Power & Energy'): ('Industrial', 'Power & Energy'),
    ('Industrial Electronics Categories', 'Security & Video Surveillance'): ('Industrial', 'Other Industrial'),
    ('Industrial Electronics Categories', 'Test & Measurement'): ('Industrial', 'Test & Measurement'),
    ('Wired Communications Categories', 'Broadcast & Streaming Video'): ('Wired Communications', 'Other Wired'),
    ('Wired Communications Categories', 'Cable Aggregation Equipment'): ('Wired Communications', 'Carrier'),
    ('Wired Communications Categories', 'Cable CPE'): ('Wired Communications', 'Carrier'),
    ('Wired Communications Categories', 'Carrier Ethernet Switches & Routers'): ('Wired Communications', 'Carrier'),
    ('Wired Communications Categories', 'DSL Aggregation Equipment'): ('Wired Communications', 'Carrier'),
    ('Wired Communications Categories', 'DSL CPE'): ('Wired Communications', 'Carrier'),
    ('Wired Communications Categories', 'Data Center Network Switches'): ('Wired Communications', 'DC Network & Threat Mitigation'),
    ('Wired Communications Categories', 'Enterprise Ethernet Switches & Routers'): ('Wired Communications', 'Enterprise/SOHO'),
    ('Wired Communications Categories', 'Enterprise UC & Voice'): ('Wired Communications', 'Enterprise/SOHO'),
    ('Wired Communications Categories', 'FTTH Aggregation Equipment'): ('Wired Communications', 'Carrier'),
    ('Wired Communications Categories', 'FTTH CPE'): ('Wired Communications', 'Carrier'),
    ('Wired Communications Categories', 'Low-Tier Consumer/SOHO Routers'): ('Wired Communications', 'Enterprise/SOHO'),
    ('Wired Communications Categories', 'Optical Equipment'): ('Wired Communications', 'Other Wired'),
    ('Wired Communications Categories', 'Other Wired Communications'): ('Wired Communications', 'Other Wired'),
    ('Wired Communications Categories', 'Threat Mitigation Products'): ('Wired Communications', 'DC Network & Threat Mitigation'),
    ('Wireless Communications Categories', 'Gray Market Handsets'): ('Wireless Communications', 'Handset'),
    ('Wireless Communications Categories', 'High-Tier Smartphone'): ('Wireless Communications', 'Handset'),
    ('Wireless Communications Categories', 'Low-Tier Smartphone'): ('Wireless Communications', 'Handset'),
    ('Wireless Communications Categories', 'M2M Modules'): ('Wireless Communications', 'Other Wireless'),
    ('Wireless Communications Categories', 'Media Tablets'): ('Wireless Communications', 'Media Tablets'),
    ('Wireless Communications Categories', 'Mid-Tier Smartphone'): ('Wireless Communications', 'Handset'),
    ('Wireless Communications Categories', 'Mobile Comm Infrastructure'): ('Wireless Communications', 'Infrastructure'),
    ('Wireless Communications Categories', 'Mobile Phone (ULCH, Entry, Feature)'): ('Wireless Communications', 'Handset'),
    ('Wireless Communications Categories', 'Other Wireless Communications'): ('Wireless Communications', 'Other Wireless'),
    ('Wireless Communications Categories', 'Wireless LAN Equipment'): ('Wireless Communications', 'Infrastructure')}

In [6]:
# Second (Market, Submarket) -> (Market, SubMarket) table with the same keys as
# mapping_dict. It is NOT the table applied to the data (`remapping` reads
# mapping_dict); in this notebook only its key count is printed.
# NOTE(review): many targets here disagree with mapping_dict (e.g. 'VR Headsets'
# -> 'Handset'); this looks like an earlier draft of the mapping — confirm
# before relying on it.
# The literal was previously wrapped in the middle of the string
# 'Computing & Data Storage Categories', which made it unparseable; it is now
# laid out one entry per line.
dict_map = {
    ('Automotive Electronics Categories', 'ADAS'): ('Automotive', 'Auto ADAS'),
    ('Automotive Electronics Categories', 'Body & Convenience'): ('Automotive', 'Connected Consumer'),
    ('Automotive Electronics Categories', 'Chassis & Safety'): ('Automotive', 'Test & Measurement'),
    ('Automotive Electronics Categories', 'Connectivity & Telematics'): ('Automotive', 'Test & Measurement'),
    ('Automotive Electronics Categories', 'Hybrid & Electric Drive Train'): ('Automotive', 'DC Network & Threat Mitigation'),
    ('Automotive Electronics Categories', 'Infotainment & Cluster'): ('Automotive', 'Auto Infotainment'),
    ('Automotive Electronics Categories', 'Other Automotive'): ('Automotive', 'Other Storage'),
    ('Automotive Electronics Categories', 'Powertrain & Vehicle Dynamics'): ('Automotive', 'Power & Energy'),
    ('Computing & Data Storage Categories', 'Data Center Servers'): ('Computer Peripherals & Storage', 'Data Center Servers'),
    ('Computing & Data Storage Categories', 'Desktop PCs'): ('Computer Peripherals & Storage', 'Desktop PCs'),
    ('Computing & Data Storage Categories', 'Flash Storage Cards'): ('Computer Peripherals & Storage', 'Flash Cards/Drives'),
    ('Computing & Data Storage Categories', 'Hard Disk Drives'): ('Computer Peripherals & Storage', 'Flash Cards/Drives'),
    ('Computing & Data Storage Categories', 'Notebook PCs'): ('Computer Peripherals & Storage', 'Notebook PCs'),
    ('Computing & Data Storage Categories', 'Other Computing'): ('Computer Peripherals & Storage', 'Other Computer Products'),
    ('Computing & Data Storage Categories', 'Other Data Storage'): ('Computer Peripherals & Storage', 'Other Storage'),
    ('Computing & Data Storage Categories', 'Other Peripherals'): ('Computer Peripherals & Storage', 'Other Peripherals'),
    ('Computing & Data Storage Categories', 'Smart Cards'): ('Computer Peripherals & Storage', 'Smart Cards'),
    ('Computing & Data Storage Categories', 'Solid-State Drives'): ('Computer Peripherals & Storage', 'Data Center Servers'),
    ('Computing & Data Storage Categories', 'Tablet PCs'): ('Computer Peripherals & Storage', 'Tablet PCs'),
    ('Computing & Data Storage Categories', 'USB Flash Drive'): ('Computer Peripherals & Storage', 'Flash Cards/Drives'),
    ('Consumer Electronics Categories', 'Fitness & Wellness Wearable Electronics'): ('Computer Platforms', 'Test & Measurement'),
    ('Consumer Electronics Categories', 'LCD TV'): ('Computer Platforms', 'TV'),
    ('Consumer Electronics Categories', 'Major Home Appliances'): ('Computer Platforms', 'Appliance'),
    ('Consumer Electronics Categories', 'OLED TV'): ('Computer Platforms', 'TV'),
    ('Consumer Electronics Categories', 'Other Audio/Video'): ('Computer Platforms', 'Other Storage'),
    ('Consumer Electronics Categories', 'Other Consumer Electronics'): ('Computer Platforms', 'Other Consumer'),
    ('Consumer Electronics Categories', 'Set-Top Boxes'): ('Computer Platforms', 'Desktop PCs'),
    ('Consumer Electronics Categories', 'Smart Speakers & Digital Assistants'): ('Computer Platforms', 'Smart Cards'),
    ('Consumer Electronics Categories', 'Smart Watches'): ('Computer Platforms', 'Smart Cards'),
    ('Consumer Electronics Categories', 'VR Headsets'): ('Computer Platforms', 'Handset'),
    ('Consumer Electronics Categories', 'Video Game Consoles'): ('Computer Platforms', 'Video Games'),
    ('Industrial Electronics Categories', 'Automation'): ('Industrial', 'Auto Powertrain'),
    ('Industrial Electronics Categories', 'Building & Home Control'): ('Industrial', 'Manufacturing Equipment'),
    ('Industrial Electronics Categories', 'Lighting'): ('Industrial', 'Manufacturing Equipment'),
    ('Industrial Electronics Categories', 'Medical Electronics'): ('Industrial', 'Media Tablets'),
    ('Industrial Electronics Categories', 'Military & Civil Aerospace'): ('Industrial', 'Military/Aerospace'),
    ('Industrial Electronics Categories', 'Other Industrial'): ('Industrial', 'Other Industrial'),
    ('Industrial Electronics Categories', 'Power & Energy'): ('Industrial', 'Power & Energy'),
    ('Industrial Electronics Categories', 'Security & Video Surveillance'): ('Industrial', 'Test & Measurement'),
    ('Industrial Electronics Categories', 'Test & Measurement'): ('Industrial', 'Test & Measurement'),
    ('Wired Communications Categories', 'Broadcast & Streaming Video'): ('Wired Communications', 'Test & Measurement'),
    ('Wired Communications Categories', 'Cable Aggregation Equipment'): ('Wired Communications', 'Manufacturing Equipment'),
    ('Wired Communications Categories', 'Cable CPE'): ('Wired Communications', 'Tablet PCs'),
    ('Wired Communications Categories', 'Carrier Ethernet Switches & Routers'): ('Wired Communications', 'Other Computer Products'),
    ('Wired Communications Categories', 'DSL Aggregation Equipment'): ('Wired Communications', 'Manufacturing Equipment'),
    ('Wired Communications Categories', 'DSL CPE'): ('Wired Communications', 'Desktop PCs'),
    ('Wired Communications Categories', 'Data Center Network Switches'): ('Wired Communications', 'Data Center Servers'),
    ('Wired Communications Categories', 'Enterprise Ethernet Switches & Routers'): ('Wired Communications', 'Enterprise/SOHO'),
    ('Wired Communications Categories', 'Enterprise UC & Voice'): ('Wired Communications', 'Enterprise/SOHO'),
    ('Wired Communications Categories', 'FTTH Aggregation Equipment'): ('Wired Communications', 'Manufacturing Equipment'),
    ('Wired Communications Categories', 'FTTH CPE'): ('Wired Communications', 'Tablet PCs'),
    ('Wired Communications Categories', 'Low-Tier Consumer/SOHO Routers'): ('Wired Communications', 'Other Consumer'),
    ('Wired Communications Categories', 'Optical Equipment'): ('Wired Communications', 'Manufacturing Equipment'),
    ('Wired Communications Categories', 'Other Wired Communications'): ('Wired Communications', 'Other Wired'),
    ('Wired Communications Categories', 'Threat Mitigation Products'): ('Wired Communications', 'DC Network & Threat Mitigation'),
    ('Wireless Communications Categories', 'Gray Market Handsets'): ('Wireless Communications', 'Handset'),
    ('Wireless Communications Categories', 'High-Tier Smartphone'): ('Wireless Communications', 'Other Peripherals'),
    ('Wireless Communications Categories', 'Low-Tier Smartphone'): ('Wireless Communications', 'Power & Energy'),
    ('Wireless Communications Categories', 'M2M Modules'): ('Wireless Communications', 'Media Tablets'),
    ('Wireless Communications Categories', 'Media Tablets'): ('Wireless Communications', 'Media Tablets'),
    ('Wireless Communications Categories', 'Mid-Tier Smartphone'): ('Wireless Communications', 'Video Games'),
    ('Wireless Communications Categories', 'Mobile Comm Infrastructure'): ('Wireless Communications', 'Infrastructure'),
    ('Wireless Communications Categories', 'Mobile Phone (ULCH, Entry, Feature)'): ('Wireless Communications', 'Military/Aerospace'),
    ('Wireless Communications Categories', 'Other Wireless Communications'): ('Wireless Communications', 'Other Wireless'),
    ('Wireless Communications Categories', 'Wireless LAN Equipment'): ('Wireless Communications', 'Manufacturing Equipment'),
}

In [7]:
# Number of keys in dict_map (the remapping further down reads mapping_dict, not dict_map).
print(len(dict_map))

#Rename the market and submarket

def remapping(market, submarket, mapping=None):
    """Translate a (market, submarket) pair through a lookup table.

    Parameters
    ----------
    market, submarket
        Source labels; together they form the lookup key ``(market, submarket)``.
    mapping : dict, optional
        Table keyed by ``(market, submarket)`` tuples whose values hold the new
        market at position 0 and the new submarket at position 1. When omitted,
        the notebook-level ``mapping_dict`` is used.

    Returns
    -------
    tuple
        ``(new_market, new_submarket)``.

    Raises
    ------
    KeyError
        If the pair has no entry in the table.
    """
    if mapping is None:
        mapping = mapping_dict
    key = (market, submarket)
    try:
        # Single lookup instead of indexing the table once per returned element.
        target = mapping[key]
    except KeyError:
        raise KeyError(f"no (Market, Submarket) mapping defined for {key!r}") from None
    return target[0], target[1]

# Rename every (Market, Submarket) pair through `remapping`, then align the
# column name with the spend data ("SubMarket").
# The two key columns are walked directly rather than using
# DataFrame.apply(axis=1), which builds a Series for every row.
remapped_pairs = [
    remapping(market, submarket)
    for market, submarket in zip(market_spend.Market, market_spend.Submarket)
]
market_spend = market_spend.assign(
    Market=[pair[0] for pair in remapped_pairs],
    Submarket=[pair[1] for pair in remapped_pairs],
).rename(columns={"Submarket": "SubMarket"})
market_spend

65


Unnamed: 0,Market,SubMarket,Device,Year,Region,Market size
0,Automotive,Auto ADAS,Amplifier/Comparator,2008,Americas,0.424
1,Automotive,Auto ADAS,Amplifier/Comparator,2008,Asia & Oceania (excl. Japan),0.246
2,Automotive,Auto ADAS,Amplifier/Comparator,2008,EMEA,0.678
3,Automotive,Auto ADAS,Amplifier/Comparator,2008,Japan,0.421
4,Automotive,Auto ADAS,Amplifier/Comparator,2008,Worldwide,1.769
...,...,...,...,...,...,...
32495,Wireless Communications,Infrastructure,Voltage Regulator/Reference,2027,Americas,0.000
32496,Wireless Communications,Infrastructure,Voltage Regulator/Reference,2027,Asia & Oceania (excl. Japan),0.000
32497,Wireless Communications,Infrastructure,Voltage Regulator/Reference,2027,EMEA,0.000
32498,Wireless Communications,Infrastructure,Voltage Regulator/Reference,2027,Japan,0.000


In [8]:
# Keep only the "Worldwide" rows, then discard the now-constant Region column.
# The drop is done on the way out of the filter (returning a new frame) rather
# than in place on the filtered slice, which is what raised
# SettingWithCopyWarning.
market_spend = market_spend.loc[market_spend.Region == "Worldwide"].drop(columns="Region")
market_spend

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  market_spend.drop("Region", axis = 1, inplace = True)


Unnamed: 0,Market,SubMarket,Device,Year,Market size
4,Automotive,Auto ADAS,Amplifier/Comparator,2008,1.769
9,Automotive,Auto ADAS,Amplifier/Comparator,2009,1.390
14,Automotive,Auto ADAS,Amplifier/Comparator,2010,3.401
19,Automotive,Auto ADAS,Amplifier/Comparator,2011,5.911
24,Automotive,Auto ADAS,Amplifier/Comparator,2012,7.250
...,...,...,...,...,...
32479,Wireless Communications,Infrastructure,Voltage Regulator/Reference,2023,0.000
32484,Wireless Communications,Infrastructure,Voltage Regulator/Reference,2024,0.000
32489,Wireless Communications,Infrastructure,Voltage Regulator/Reference,2025,0.000
32494,Wireless Communications,Infrastructure,Voltage Regulator/Reference,2026,0.000


In [10]:
# Yearly "Market size" totals at three levels of aggregation:
# per Market, per SubMarket and per Device.
marketspendings = market_spend.groupby(["Market", "Year"])["Market size"].sum().reset_index()
submarketspendings = market_spend.groupby(["SubMarket", "Year"])["Market size"].sum().reset_index()
devicespendings = market_spend.groupby(["Device", "Year"])["Market size"].sum().reset_index()

In [17]:
# Attach the market size to each spend row. Several source submarkets map to
# the same (Market, SubMarket), so the left merge can yield more than one match
# per spend row; the groupby sums those matches back into a single row.
merged_dataset = (
    cleaned_data
    .merge(market_spend, how="left", on=["Market", "SubMarket", "Device", "Year"])
    .groupby(["CompanyName", "Device", "Market", "SubMarket", "Year", "Value"])
    .sum()
    .reset_index()
    .rename(columns={"Market size": "Spendings"})
)
# Rows whose summed market size is exactly 0 are discarded.
merged_dataset = merged_dataset.loc[merged_dataset["Spendings"] != 0]
merged_dataset

Unnamed: 0,CompanyName,Device,Market,SubMarket,Year,Value,Spendings
7,ABB,Amplifier/Comparator,Industrial,Power & Energy,2018,7.466000,200.530
8,ABB,Amplifier/Comparator,Industrial,Power & Energy,2019,8.471900,189.490
9,ABB,Amplifier/Comparator,Industrial,Power & Energy,2020,8.661150,195.081
10,ABB,Amplifier/Comparator,Industrial,Power & Energy,2021,10.090255,240.245
11,ABB,Amplifier/Comparator,Industrial,Power & Energy,2022,10.040442,257.635
...,...,...,...,...,...,...,...
36010,iRobot,Voltage Regulator/Reference,Consumer,Appliance,2020,10.295800,282.999
36011,iRobot,Voltage Regulator/Reference,Consumer,Appliance,2021,17.639800,399.001
36012,iRobot,Voltage Regulator/Reference,Consumer,Appliance,2022,23.717500,469.001
36013,iRobot,Voltage Regulator/Reference,Consumer,Appliance,2023,23.067700,412.999


In [18]:
# Add the yearly totals per Market, per SubMarket and per Device. Each lookup
# table contributes a "Market size" column, which is renamed right after its
# merge so the next merge does not collide with it.
merged_dataset = (
    merged_dataset
    .merge(marketspendings, how="left", on=["Market", "Year"])
    .rename(columns={"Market size": "MarketSpends"})
    .merge(submarketspendings, how="left", on=["SubMarket", "Year"])
    .rename(columns={"Market size": "SubMarketSpends"})
    .merge(devicespendings, how="left", on=["Device", "Year"])
    .rename(columns={"Market size": "DevSpends"})
)
merged_dataset

Unnamed: 0,CompanyName,Device,Market,SubMarket,Year,Value,Spendings,MarketSpends,SubMarketSpends,DevSpends
0,ABB,Amplifier/Comparator,Industrial,Power & Energy,2018,7.466000,200.530,11486.855,2038.700,3990.996
1,ABB,Amplifier/Comparator,Industrial,Power & Energy,2019,8.471900,189.490,10750.418,1947.505,3800.001
2,ABB,Amplifier/Comparator,Industrial,Power & Energy,2020,8.661150,195.081,10966.167,2004.751,3786.004
3,ABB,Amplifier/Comparator,Industrial,Power & Energy,2021,10.090255,240.245,13599.835,2321.554,4785.001
4,ABB,Amplifier/Comparator,Industrial,Power & Energy,2022,10.040442,257.635,15244.412,2525.705,5469.003
...,...,...,...,...,...,...,...,...,...,...
31735,iRobot,Voltage Regulator/Reference,Consumer,Appliance,2020,10.295800,282.999,5679.186,558.000,11956.010
31736,iRobot,Voltage Regulator/Reference,Consumer,Appliance,2021,17.639800,399.001,7289.565,722.000,15765.008
31737,iRobot,Voltage Regulator/Reference,Consumer,Appliance,2022,23.717500,469.001,7755.155,887.001,17307.004
31738,iRobot,Voltage Regulator/Reference,Consumer,Appliance,2023,23.067700,412.999,6292.004,674.000,16419.001


## Addition of the Financial Data

In [71]:
# The financial tables are keyed on "Company"; rename the spend column to match.
merged_dataset = merged_dataset.rename(columns={"CompanyName": "Company"})

In [72]:
# Dictionary from the S&P Global company tag to the company name used in the
# spend tracker, plus a view of the tags it covers.
comp_map = (
    pd.read_csv(
        "../input_data/financials_map.csv",
        delimiter=",",
        encoding="unicode_escape",
        index_col=0,
    )
    .set_index("Company Tag from S&P Global")["Company Name from Spend Tracker"]
    .to_dict()
)
companies_available = comp_map.keys()

In [73]:
def mapping_companies(x):
    """Return the spend-tracker company name for tag ``x``.

    Looks ``x`` up in the notebook-level ``comp_map``; tags that are not among
    ``companies_available`` yield ``None``.
    """
    if x not in companies_available:
        return None
    return comp_map[x]

In [146]:
# Load EBITDA, translate company tags to spend-tracker names and drop the
# companies that have no translation.
ebitda = pd.read_csv("../clean_data/ebitda_complete.csv")
ebitda["Company"] = ebitda["Company"].map(mapping_companies)
# .copy() detaches the filtered rows so the new columns below are not written
# onto a slice of the original frame (SettingWithCopyWarning).
ebitda = ebitda[ebitda["Company"].notna()].copy()

# One- and two-step lags per company. The rows are ordered by Year (stable
# sort) for the shift so the lags do not depend on the CSV row order; the
# shifted values keep their original index labels and realign on assignment.
ebitda_by_company = ebitda.sort_values("Year", kind="mergesort").groupby("Company")["Ebitda"]
ebitda["Ebitda_1"] = ebitda_by_company.shift(1)
ebitda["Ebitda_2"] = ebitda_by_company.shift(2)

# Companies with a negative value in the current or either lagged column.
has_negative_ebitda = ebitda[["Ebitda", "Ebitda_1", "Ebitda_2"]].lt(0).any(axis=1)
remove_ebitda = ebitda.loc[has_negative_ebitda, "Company"].unique()

In [145]:
# Load COGS, translate company tags to spend-tracker names and drop the
# companies that have no translation.
cogs = pd.read_csv("../clean_data/cogs_complete.csv")
cogs["Company"] = cogs["Company"].map(mapping_companies)
# .copy() detaches the filtered rows so the new columns below are not written
# onto a slice of the original frame (SettingWithCopyWarning).
cogs = cogs[cogs["Company"].notna()].copy().rename(columns={"Value": "Cogs"})

# One- and two-step lags per company. The rows are ordered by Year (stable
# sort) for the shift so the lags do not depend on the CSV row order; the
# shifted values keep their original index labels and realign on assignment.
cogs_by_company = cogs.sort_values("Year", kind="mergesort").groupby("Company")["Cogs"]
cogs["Cogs_1"] = cogs_by_company.shift(1)
cogs["Cogs_2"] = cogs_by_company.shift(2)

# Companies with a negative value in the current or either lagged column.
has_negative_cogs = cogs[["Cogs", "Cogs_1", "Cogs_2"]].lt(0).any(axis=1)
remove_cogs = cogs.loc[has_negative_cogs, "Company"].unique()
remove_cogs

array(['Wabtec', 'Hewlett-Packard', 'Johnson Controls', 'Schlumberger',
       'LG Display', 'Nabtesco Corporation'], dtype=object)

In [147]:
# Load revenue, translate company tags to spend-tracker names and drop the
# companies that have no translation.
revenue = pd.read_csv("../clean_data/revenue_complete.csv")
revenue["Company"] = revenue["Company"].map(mapping_companies)
# .copy() detaches the filtered rows so the new columns below are not written
# onto a slice of the original frame (SettingWithCopyWarning).
revenue = revenue[revenue["Company"].notna()].copy()

# One- and two-step lags per company. The rows are ordered by Year (stable
# sort) for the shift so the lags do not depend on the CSV row order; the
# shifted values keep their original index labels and realign on assignment.
revenue_by_company = revenue.sort_values("Year", kind="mergesort").groupby("Company")["Revenue"]
revenue["Revenue_1"] = revenue_by_company.shift(1)
revenue["Revenue_2"] = revenue_by_company.shift(2)

# Companies with a negative value in the current or either lagged column.
has_negative_revenue = revenue[["Revenue", "Revenue_1", "Revenue_2"]].lt(0).any(axis=1)
remove_revenue = revenue.loc[has_negative_revenue, "Company"].unique()
revenue

Unnamed: 0,Company,Year,Revenue,Revenue_1,Revenue_2
0,ABB,2016,24929.00,,
1,ABB,2017,25196.00,24929.00,
2,ABB,2018,27662.00,25196.00,24929.00
3,ABB,2019,27978.00,27662.00,25196.00
4,ABB,2020,26134.00,27978.00,27662.00
...,...,...,...,...,...
2539,Transsion,2023,8270.25,6371.40,6756.40
2540,Transsion,2024,9718.26,8270.25,6371.40
2541,Transsion,2025,11490.58,9718.26,8270.25
2542,Transsion,2026,10273.06,11490.58,9718.26


In [69]:
# Inspect the rows that report a negative revenue.
revenue.loc[revenue["Revenue"] < 0]

Unnamed: 0,Company,Year,Revenue,Revenue_1,Revenue_2
779,Deere & Company,2027,-6696.428571,13299.48,57084.85
1241,Rolls-Royce Holding,2021,-417.7,13952.7,20140.4
2170,LG Display,2026,-971.485714,20149.21,18165.62
2171,LG Display,2027,-1309.957143,-971.485714,20149.21
2227,TomTom,2023,-59.14,569.1,537.9
2254,HTC,2026,-2.35,398.09,254.37


In [150]:
# Join the three financial tables onto the spend data, then keep only rows with
# a COGS value and a complete EBITDA history (current + two lags).
final_df = (
    merged_dataset
    .merge(cogs, how="left", on=["Company", "Year"])
    .merge(revenue, how="left", on=["Company", "Year"])
    .merge(ebitda, how="left", on=["Company", "Year"])
    .dropna(subset=["Cogs", "Ebitda", "Ebitda_1", "Ebitda_2"])
    .reset_index(drop=True)
)
# Drop zero-spend rows and every company flagged for negative financials.
rows_to_keep = (
    (final_df["Value"] != 0)
    & ~final_df["Company"].isin(remove_ebitda)
    & ~final_df["Company"].isin(remove_cogs)
    & ~final_df["Company"].isin(remove_revenue)
)
final_df = final_df[rows_to_keep]
final_df

Unnamed: 0,Company,Device,Market,SubMarket,Year,Value,Spendings,MarketSpends,SubMarketSpends,DevSpends,Cogs,Cogs_1,Cogs_2,Revenue,Revenue_1,Revenue_2,Ebitda,Ebitda_1,Ebitda_2
0,ABB,Amplifier/Comparator,Industrial,Power & Energy,2018,7.466000,200.530,11486.855,2038.700,3990.996,19059.000000,17278.000000,17270.0,27662.00,25196.00,24929.0,3227.000000,2929.000000,2987.0
1,ABB,Amplifier/Comparator,Industrial,Power & Energy,2019,8.471900,189.490,10750.418,1947.505,3800.001,19018.000000,19059.000000,17278.0,27978.00,27662.00,25196.0,3347.000000,3227.000000,2929.0
2,ABB,Amplifier/Comparator,Industrial,Power & Energy,2020,8.661150,195.081,10966.167,2004.751,3786.004,18123.000000,19018.000000,19059.0,26134.00,27978.00,27662.0,2668.000000,3347.000000,3227.0
3,ABB,Amplifier/Comparator,Industrial,Power & Energy,2021,10.090255,240.245,13599.835,2321.554,4785.001,19407.000000,18123.000000,19018.0,28945.00,26134.00,27978.0,4641.000000,2668.000000,3347.0
4,ABB,Amplifier/Comparator,Industrial,Power & Energy,2022,10.040442,257.635,15244.412,2525.705,5469.003,19712.000000,19407.000000,18123.0,29446.00,28945.00,26134.0,4477.000000,4641.000000,2668.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25911,Zebra Technologies,Voltage Regulator/Reference,Wireless Communications,Other Wireless,2020,0.809700,454.000,25838.124,4852.175,11956.010,2445.000000,2385.000000,2237.0,4448.00,4485.00,4218.0,831.000000,899.000000,817.0
25912,Zebra Technologies,Voltage Regulator/Reference,Wireless Communications,Other Wireless,2021,1.271038,584.000,33129.151,5885.459,15765.008,2999.000000,2445.000000,2385.0,5627.00,4448.00,4485.0,1198.000000,831.000000,899.0
25913,Zebra Technologies,Voltage Regulator/Reference,Wireless Communications,Other Wireless,2022,1.229600,555.000,37356.594,5729.355,17307.004,3157.000000,2999.000000,2445.0,5781.00,5627.00,4448.0,1140.000000,1198.000000,831.0
25914,Zebra Technologies,Voltage Regulator/Reference,Wireless Communications,Other Wireless,2023,1.182734,474.999,28959.587,4701.997,16419.001,7385.362857,3157.000000,2999.0,4536.76,5781.00,5627.0,3668.885714,1140.000000,1198.0


In [151]:
# Persist the merged spend + financials table (the DataFrame index is written too,
# since to_csv is called with its default arguments).
final_df.to_csv("../clean_data/final_df.csv")

## New Target

In [152]:
def log_diff(x, y):
    """Return ``log(1 + x) - log(1 + y)``.

    Works elementwise on scalars, NumPy arrays and pandas Series.
    ``np.log1p`` is used instead of ``np.log(1 + ...)`` so that inputs close to
    zero are not rounded away when 1 is added to them.

    Parameters
    ----------
    x, y : float or array-like
        Current and previous level. Values of -1 or below have no finite
        logarithm and give ``-inf``/``nan`` (NumPy emits a RuntimeWarning).

    Returns
    -------
    float or array-like
        The log-difference, with the same shape as the broadcast inputs.
    """
    return np.log1p(x) - np.log1p(y)

In [153]:
# Build the modelling table: every level column is replaced by its
# log-difference (see log_diff) against the previous observation.
# The log-differences are computed on whole columns at once instead of calling
# a lambda per row through DataFrame.apply(axis=1).
series_keys = ["Company", "Device", "Market", "SubMarket"]
spend_cols = ["Value", "Spendings", "MarketSpends", "SubMarketSpends", "DevSpends"]
financial_cols = ["Cogs", "Revenue", "Ebitda"]

new_df = final_df.copy()

# Previous row of each spend column within a (company, device, market, submarket) series.
# NOTE(review): shift(1) is the previous *row* of the series; if a year was filtered
# out upstream the difference spans more than one year — confirm this is acceptable.
previous_spend = new_df.groupby(series_keys)[spend_cols].shift(1)

# Rows with no previous Value cannot produce a target, so they are dropped.
has_previous = previous_spend["Value"].notna()
new_df = new_df[has_previous].reset_index(drop=True)
previous_spend = previous_spend[has_previous].reset_index(drop=True)

# Target = log-difference of the company spend; the other spend columns become features.
new_df["Target"] = log_diff(new_df["Value"], previous_spend["Value"])
for col in spend_cols[1:]:
    new_df["LogDiff" + col] = log_diff(new_df[col], previous_spend[col])
new_df = new_df.drop(columns=spend_cols)

# Financial features: current-vs-lag-1 first, then lag-1-vs-lag-2 (this order
# fixes the column order of the saved file).
for col in financial_cols:
    new_df["LogDiff" + col] = log_diff(new_df[col], new_df[col + "_1"])
for col in financial_cols:
    new_df["LogDiff" + col + "_1"] = log_diff(new_df[col + "_1"], new_df[col + "_2"])

new_df = new_df.drop(
    columns=[col + suffix for col in financial_cols for suffix in ("", "_1", "_2")]
)
new_df

Unnamed: 0,Company,Device,Market,SubMarket,Year,Target,LogDiffSpendings,LogDiffMarketSpends,LogDiffSubMarketSpends,LogDiffDevSpends,LogDiffCogs,LogDiffRevenue,LogDiffEbitda,LogDiffCogs_1,LogDiffRevenue_1,LogDiffEbitda_1
0,ABB,Amplifier/Comparator,Industrial,Power & Energy,2019,0.112271,-0.056339,-0.066253,-0.045740,-0.049027,-0.002153,0.011358,0.036500,0.098100,0.093371,0.096860
1,ABB,Amplifier/Comparator,Industrial,Power & Energy,2020,0.019783,0.028928,0.019868,0.028956,-0.003689,-0.048201,-0.068179,-0.226659,-0.002153,0.011358,0.036500
2,ABB,Amplifier/Comparator,Industrial,Power & Energy,2021,0.137954,0.207285,0.215225,0.146649,0.234120,0.068448,0.102157,0.553441,-0.048201,-0.068179,-0.226659
3,ABB,Amplifier/Comparator,Industrial,Power & Energy,2022,-0.004502,0.069605,0.114147,0.084249,0.133584,0.015593,0.017160,-0.035969,0.068448,0.102157,0.553441
4,ABB,Amplifier/Comparator,Industrial,Power & Energy,2023,0.077302,0.123734,0.068025,0.129541,-0.063393,-0.690697,0.088614,0.260177,0.015593,0.017160,-0.035969
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17261,Zebra Technologies,Voltage Regulator/Reference,Wireless Communications,Other Wireless,2020,0.058855,-0.004677,0.153879,0.049009,0.085060,0.024836,-0.008282,-0.078562,0.064036,0.061363,0.095532
17262,Zebra Technologies,Voltage Regulator/Reference,Wireless Communications,Other Wireless,2021,0.227076,0.251314,0.248554,0.193021,0.276539,0.204158,0.235075,0.365411,0.024836,-0.008282,-0.078562
17263,Zebra Technologies,Voltage Regulator/Reference,Wireless Communications,Other Wireless,2022,-0.018415,-0.050844,0.120092,-0.026877,0.093313,0.051327,0.026996,-0.049583,0.204158,0.235075,0.365411
17264,Zebra Technologies,Voltage Regulator/Reference,Wireless Communications,Other Wireless,2023,-0.021244,-0.155353,-0.254600,-0.197578,-0.052669,0.849697,-0.242316,1.168255,0.051327,0.026996,-0.049583


In [155]:
# Persist the log-difference table (the DataFrame index is written too, since
# to_csv is called with its default arguments).
new_df.to_csv("../clean_data/newtarget_df.csv")