In [5]:
%matplotlib inline
import matplotlib
import seaborn as sns
matplotlib.rcParams['savefig.dpi'] = 144
import pandas as pd

### Problem Statement

1. Which company type has the highest market share, and how do the company types rank against each other (the hierarchy)?

To do this:
- Find the total revenue across all company types
- Find the revenue sub-total of each company type
- Divide the sub-total of each company type by the total revenue to get its market share

Let's web-scrape the data from Wikipedia using `pd.read_html`.
- Note: `pd.read_html` scrapes all the tables on the page.

In [6]:
# Scrape the tables from the Wikipedia page listing the largest companies
# by revenue; the result is indexed per table further down (df_all[0]).
df_all = pd.read_html(
    'https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue',
    header=0,
    skiprows=1,
)

Let's see the first five rows of the first table

In [7]:
# Preview the first scraped table (head() shows the first five rows by default).
df_all[0].head()

Unnamed: 0,Rank.,Name.,Industry under which the companies are,USD millions,USD millions.1,Employed members of the Company are:,Headquarters[note 1],State-owned,Ref.
0,1,Walmart,Retail,"$572,754","$13,673",2300000,United States,,[1]
1,2,"Amazon.com, Inc.",Retail,"$469,822","$33,364",1608000,United States,,[4]
2,3,State Grid Corporation of China,Electricity,"$460,616.9","$7,137.8",871145,China,,[5]
3,4,China National Petroleum Corporation,Oil and gas,"$411,692.9","$9,637.5",1090345,China,,[6]
4,5,China Petrochemical Corporation,Oil and gas,"$401,313.5","$8,316.1",542286,China,,[7]


In [61]:
# Bind the first scraped table to the shorter name `df`.
# This is a plain assignment, not a copy: `df` and `df_all[0]` are the same
# object, so any in-place change made through `df` is also visible in df_all[0].
df = df_all[0]

First, let's rename the columns

In [10]:
#df.columns

In [11]:
# Rename the scraped column headers to short, code-friendly names.
# The result is assigned back instead of using inplace=True, so the table
# still held in df_all[0] is left untouched and the earlier preview of
# df_all[0] stays reproducible when cells are re-run.
# NOTE: the 'Headquaters' spelling is kept because the displayed outputs and
# the exported records use that exact key.
df = df.rename(columns={
    'Name.': 'Company',
    'Industry under which the companies are': 'Company_type',
    'USD millions': 'Revenue',
    'USD millions.1': 'Profit',
    'Employed members of the Company are:': 'Employees_number',
    'Headquarters[note 1]': 'Headquaters',
})

In [12]:
# Confirm that the new column names are in place.
df.head()

Unnamed: 0,Rank.,Company,Company_type,Revenue,Profit,Employees_number,Headquaters,State-owned,Ref.
0,1,Walmart,Retail,"$572,754","$13,673",2300000,United States,,[1]
1,2,"Amazon.com, Inc.",Retail,"$469,822","$33,364",1608000,United States,,[4]
2,3,State Grid Corporation of China,Electricity,"$460,616.9","$7,137.8",871145,China,,[5]
3,4,China National Petroleum Corporation,Oil and gas,"$411,692.9","$9,637.5",1090345,China,,[6]
4,5,China Petrochemical Corporation,Oil and gas,"$401,313.5","$8,316.1",542286,China,,[7]


Let's drop the columns we don't really need

In [62]:
# Remove the columns that are not used in the rest of the notebook.
# errors='ignore' keeps this cell re-runnable: `df` is reassigned here, so
# without it a second run would raise KeyError for the columns that were
# already removed the first time.
df = df.drop(columns=['Rank.', 'State-owned', 'Ref.'], errors='ignore')

Now check out the new data

In [63]:
# Preview the trimmed table.
df.head()

Unnamed: 0,Company,Company_type,Revenue,Profit,Employees_number,Headquaters
0,Walmart,Retail,"$572,754","$13,673",2300000,United States
1,"Amazon.com, Inc.",Retail,"$469,822","$33,364",1608000,United States
2,State Grid Corporation of China,Electricity,"$460,616.9","$7,137.8",871145,China
3,China National Petroleum Corporation,Oil and gas,"$411,692.9","$9,637.5",1090345,China
4,China Petrochemical Corporation,Oil and gas,"$401,313.5","$8,316.1",542286,China


This is much better

Let's convert this to JSON-style records (a list of dictionaries, one per row)

In [64]:
import json

# Serialise the frame to a JSON string with one object per row, then parse
# that string back so each row becomes a plain Python dict.
records_json = df.to_json(orient='records')
company_type = json.loads(records_json)

Now let's check the first few records

In [65]:
# Show the first three records of the list.
company_type[:3]

[{'Company': 'Walmart',
  'Company_type': 'Retail',
  'Revenue': '$572,754',
  'Profit': '$13,673',
  'Employees_number': 2300000,
  'Headquaters': 'United States'},
 {'Company': 'Amazon.com, Inc.',
  'Company_type': 'Retail',
  'Revenue': '$469,822',
  'Profit': '$33,364',
  'Employees_number': 1608000,
  'Headquaters': 'United States'},
 {'Company': 'State Grid Corporation of China',
  'Company_type': 'Electricity',
  'Revenue': '$460,616.9',
  'Profit': '$7,137.8',
  'Employees_number': 871145,
  'Headquaters': 'China'}]

Could we check all the company types we have?

In [66]:
# Collect the distinct company types once and reuse the same set for both
# the count and the printout.
company_type_names = {record['Company_type'] for record in company_type}
Number_of_company_type = len(company_type_names)

print(company_type_names)
print("--------")
print(f"We have {Number_of_company_type} company types")

{'Information technology', 'Electronics', 'Construction', 'Commodities', 'Chemicals', 'Automotive', 'Insurance', 'Oil and gas', 'Steel', 'Retail', 'Conglomerate', 'Telecommunications', 'Healthcare', 'Financials', 'Electricity'}
--------
We have 15 company types


Now, step 1: let's find the total of all the revenues

We need to strip the dollar sign and replace each ',' with '_' so that the value can be converted to a float and summed. Note: Python accepts '_' between digits as a thousands (digit-group) separator, so `float('572_754')` gives `572754.0`.

In [67]:
# Parse every revenue string ('$' stripped, ',' swapped for the '_' digit
# separator that float() accepts) and add the values up.
total_rev = sum(
    float(record['Revenue'].strip('$').replace(',', '_'))
    for record in company_type
)
total_rev

11627790.100000001

Now that we have the total revenue across all 15 company types, let's find the sub-total for each type

In [68]:
# The distinct company types again. A set has no inherent order; the
# notebook display below happens to list the names alphabetically.
groups = {record['Company_type'] for record in company_type}
groups

{'Automotive',
 'Chemicals',
 'Commodities',
 'Conglomerate',
 'Construction',
 'Electricity',
 'Electronics',
 'Financials',
 'Healthcare',
 'Information technology',
 'Insurance',
 'Oil and gas',
 'Retail',
 'Steel',
 'Telecommunications'}

In [124]:
def _parse_revenue(raw):
    """Convert a revenue string such as '$572,754' into a float."""
    return float(raw.strip('$').replace(',', ''))


def distinct_group(company_type, total_rev=None):
    """Rank company types by their share of the total revenue.

    Parameters
    ----------
    company_type : iterable of dict
        Records that each provide a 'Company_type' key and a 'Revenue' key
        whose value is a string like '$572,754'.
    total_rev : float, optional
        Denominator used for the shares. When left as None it is computed
        as the sum of every revenue in ``company_type``. The default is None
        rather than a notebook-level variable so the function does not
        capture a value frozen at definition time and can be defined before
        any total has been calculated.

    Returns
    -------
    list of (str, float)
        ``(company type, market share)`` pairs with the share rounded to two
        decimals, sorted from the largest share to the smallest. Types with
        equal shares keep the order in which they first appear in the input.
    """
    # Parse each revenue exactly once, keeping the input order.
    parsed = [
        (record['Company_type'], _parse_revenue(record['Revenue']))
        for record in company_type
    ]

    # Revenue sub-total per company type.
    group_sum = {}
    for name, revenue in parsed:
        group_sum[name] = group_sum.get(name, 0.0) + revenue

    if total_rev is None:
        total_rev = sum(revenue for _, revenue in parsed)

    # Market share of each company type, rounded for display.
    group_sum_mkt_share = {
        name: round(subtotal / total_rev, 2)
        for name, subtotal in group_sum.items()
    }

    return sorted(group_sum_mkt_share.items(), key=lambda item: item[1], reverse=True)

In [125]:
# Rank the company types by market share, using the function's default total_rev.
distinct_group(company_type)

[('Oil and gas', 0.21),
 ('Retail', 0.14),
 ('Healthcare', 0.13),
 ('Financials', 0.13),
 ('Automotive', 0.08),
 ('Electronics', 0.07),
 ('Construction', 0.05),
 ('Electricity', 0.04),
 ('Information technology', 0.04),
 ('Commodities', 0.04),
 ('Chemicals', 0.01),
 ('Telecommunications', 0.01),
 ('Insurance', 0.01),
 ('Conglomerate', 0.01),
 ('Steel', 0.01)]

From the result above, the Oil and gas industry has the highest market share (about 21% of the total revenue of the listed companies), according to the Fortune Global 500 (2022) data