In [1]:
import pandas as pd

# Step 1: Loading the dataset
df = pd.read_csv("fortune1000_2024.csv")

# first 5 rows
print("===== HEAD =====")
display(df.head())

#  column names 
print("\n===== COLUMNS =====")
print(df.columns.tolist())

#  basic info
print("\n===== INFO =====")
df.info()
df.shape


===== HEAD =====


Unnamed: 0,Rank,Company,Ticker,Sector,Industry,Profitable,Founder_is_CEO,FemaleCEO,Growth_in_Jobs,Change_in_Rank,...,Assets_M,CEO,Country,HeadquartersCity,HeadquartersState,Website,CompanyType,Footnote,MarketCap_Updated_M,Updated
0,1,Walmart,WMT,Retailing,General Merchandisers,yes,no,no,no,0.0,...,252399.0,C. Douglas McMillon,U.S.,Bentonville,Arkansas,https://www.stock.walmart.com,Public,"Figures are for fiscal year ended Jan. 31, 202...",559911.0,2024-08-05
1,2,Amazon,AMZN,Retailing,Internet Services and Retailing,yes,no,no,no,0.0,...,527854.0,Andrew R. Jassy,U.S.,Seattle,Washington,https://www.amazon.com,Public,"Market value as of July 15, 2024.",2005565.0,2024-08-05
2,3,Apple,AAPL,Technology,"Computers, Office Equipment",yes,no,no,no,1.0,...,352583.0,Timothy D. Cook,U.S.,Cupertino,California,https://www.apple.com,Public,"Figures are for fiscal year ended Sept. 30, 20...",3594309.0,2024-08-05
3,4,UnitedHealth Group,UNH,Health Care,Health Care: Insurance and Managed Care,yes,no,no,yes,1.0,...,273720.0,Andrew P. Witty,U.S.,Minnetonka,Minnesota,https://www.unitedhealthgroup.com,Public,"Market value as of July 15, 2024.",474339.0,2024-08-05
4,5,Berkshire Hathaway,BRKA,Financials,Insurance: Property and Casualty (Stock),yes,no,no,yes,2.0,...,1069978.0,Warren E. Buffett,U.S.,Omaha,Nebraska,https://www.berkshirehathaway.com,Public,"Market value as of July 15, 2024.",937028.0,2024-08-05



===== COLUMNS =====
['Rank', 'Company', 'Ticker', 'Sector', 'Industry', 'Profitable', 'Founder_is_CEO', 'FemaleCEO', 'Growth_in_Jobs', 'Change_in_Rank', 'Gained_in_Rank', 'Dropped_in_Rank', 'Newcomer_to_the_Fortune500', 'Global500', 'Worlds_Most_Admired_Companies', 'Best_Companies_to_Work_For', 'Number_of_employees', 'MarketCap_March28_M', 'Revenues_M', 'RevenuePercentChange', 'Profits_M', 'ProfitsPercentChange', 'Assets_M', 'CEO', 'Country', 'HeadquartersCity', 'HeadquartersState', 'Website', 'CompanyType', 'Footnote', 'MarketCap_Updated_M', 'Updated']

===== INFO =====
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 32 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Rank                           1000 non-null   int64  
 1   Company                        1000 non-null   object 
 2   Ticker                         959 non-null    object 
 3   Sector    

(1000, 32)

In [2]:
# STEP 2 — CLEAN BOOLEAN + TEXT COLUMNS

#  Standardizing lowercase + remove extra spaces
bool_cols = ["Profitable", "Founder_is_CEO", "FemaleCEO",
             "Gained_in_Rank", "Dropped_in_Rank",
             "Newcomer_to_the_Fortune500"]

for col in bool_cols:
    df[col] = df[col].astype(str).str.strip().str.lower()

#  Converting yes/no true/false into 1/0
binary_map = {
    "yes": 1,
    "no": 0,
    "true": 1,
    "false": 0,
    "nan": None,     # Let missing stay missing for now
    "": None
}

for col in bool_cols:
    df[col] = df[col].map(binary_map)

# Cleaning 'Growth_in_Jobs'
# Converting numbers → numeric
# Convert 'no' → 0
df["Growth_in_Jobs"] = df["Growth_in_Jobs"].str.strip().str.lower()
df["Growth_in_Jobs"] = pd.to_numeric(df["Growth_in_Jobs"], errors="coerce").fillna(0)

#⃣ Cleaning 'Country'
df["Country"] = df["Country"].str.strip().str.upper()

In [3]:
df[["Profitable", "Founder_is_CEO", "FemaleCEO", "Growth_in_Jobs"]].head(10)



Unnamed: 0,Profitable,Founder_is_CEO,FemaleCEO,Growth_in_Jobs
0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0
5,1.0,0.0,1.0,0.0
6,1.0,0.0,0.0,0.0
7,1.0,0.0,0.0,0.0
8,1.0,0.0,0.0,0.0
9,1.0,0.0,0.0,0.0


In [4]:
df["Country"].unique()[:20]


array(['U.S.'], dtype=object)

In [5]:
# STEP 3 — CLEANING IMPORTANT TEXT COLUMNS

text_cols = ["Company", "Sector", "Industry", "CEO",
             "HeadquartersCity", "HeadquartersState",
             "Website", "CompanyType"]

for col in text_cols:
    df[col] = df[col].astype(str).str.strip().str.title()


In [6]:
df[["Company", "Sector", "Industry", "CEO"]].head(10)


Unnamed: 0,Company,Sector,Industry,CEO
0,Walmart,Retailing,General Merchandisers,C. Douglas Mcmillon
1,Amazon,Retailing,Internet Services And Retailing,Andrew R. Jassy
2,Apple,Technology,"Computers, Office Equipment",Timothy D. Cook
3,Unitedhealth Group,Health Care,Health Care: Insurance And Managed Care,Andrew P. Witty
4,Berkshire Hathaway,Financials,Insurance: Property And Casualty (Stock),Warren E. Buffett
5,Cvs Health,Health Care,Health Care: Pharmacy And Other Services,Karen S. Lynch
6,Exxon Mobil,Energy,Petroleum Refining,Darren W. Woods
7,Alphabet,Technology,Internet Services And Retailing,Sundar Pichai
8,Mckesson,Health Care,Wholesalers: Health Care,Brian S. Tyler
9,Cencora,Health Care,Wholesalers: Health Care,Steven H. Collis


In [7]:
# STEP 4 — REMOVEING DUPLICATES
df = df.drop_duplicates(subset=["Company"])

# STEP 4.1 — CLEANING NUMERIC COLUMNS

numeric_cols = [
    "Rank", "Change_in_Rank", "Number_of_employees",
    "MarketCap_March28_M", "Revenues_M",
    "RevenuePercentChange", "Profits_M",
    "ProfitsPercentChange", "Assets_M",
    "MarketCap_Updated_M"
]

# Converting to numeric 
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

In [8]:
#  summary to verify cleaning
df[numeric_cols].head(10)

Unnamed: 0,Rank,Change_in_Rank,Number_of_employees,MarketCap_March28_M,Revenues_M,RevenuePercentChange,Profits_M,ProfitsPercentChange,Assets_M,MarketCap_Updated_M
0,1,0.0,2100000,484852.8,648125.0,6.0,15511.0,32.8,252399.0,559911.0
1,2,0.0,1525000,1873675.8,574785.0,11.8,30425.0,,527854.0,2005565.0
2,3,1.0,161000,2647973.8,383285.0,-2.8,96995.0,-2.8,352583.0,3594309.0
3,4,1.0,440000,456080.8,371622.0,14.6,22381.0,11.2,273720.0,474339.0
4,5,2.0,396500,908919.7,364482.0,20.7,96223.0,,1069978.0,937028.0
5,6,0.0,259500,100373.9,357776.0,10.9,8344.0,101.1,249728.0,73653.0
6,7,-4.0,61500,461222.2,344582.0,-16.7,36010.0,-35.4,376317.0,516824.0
7,8,0.0,182502,1884633.0,307394.0,8.7,73795.0,23.0,402392.0,2315182.0
8,9,0.0,48000,70546.5,276711.0,4.8,3560.0,219.6,62320.0,75007.0
9,10,1.0,44000,48472.1,262173.4,9.9,1745.3,2.7,62558.7,43758.0


In [9]:
# STEP 5 — HANDLEING MISSING NUMERIC VALUES

fill_zero_cols = [
    "MarketCap_March28_M",
    "Profits_M",
    "ProfitsPercentChange",
    "RevenuePercentChange",
    "MarketCap_Updated_M"
]

# Filling all missing numeric values with 0
df[fill_zero_cols] = df[fill_zero_cols].fillna(0)

# cleaned columns
df[fill_zero_cols].head(10)


Unnamed: 0,MarketCap_March28_M,Profits_M,ProfitsPercentChange,RevenuePercentChange,MarketCap_Updated_M
0,484852.8,15511.0,32.8,6.0,559911.0
1,1873675.8,30425.0,0.0,11.8,2005565.0
2,2647973.8,96995.0,-2.8,-2.8,3594309.0
3,456080.8,22381.0,11.2,14.6,474339.0
4,908919.7,96223.0,0.0,20.7,937028.0
5,100373.9,8344.0,101.1,10.9,73653.0
6,461222.2,36010.0,-35.4,-16.7,516824.0
7,1884633.0,73795.0,23.0,8.7,2315182.0
8,70546.5,3560.0,219.6,4.8,75007.0
9,48472.1,1745.3,2.7,9.9,43758.0


In [10]:
# STEP 6 — TRANSFORMATION

# 1️ Rank Score (Reverse ranking: 1 becomes highest score)
df["Rank_Score"] = df["Rank"].max() + 1 - df["Rank"]

# 2️ Converting  Assets and Market Cap to Billions
df["Assets_B"] = df["Assets_M"] / 1000
df["MarketCap_B"] = df["MarketCap_Updated_M"] / 1000

#  results
df[["Rank", "Rank_Score", "Assets_M", "Assets_B", "MarketCap_Updated_M", "MarketCap_B"]].head(10)


Unnamed: 0,Rank,Rank_Score,Assets_M,Assets_B,MarketCap_Updated_M,MarketCap_B
0,1,1000,252399.0,252.399,559911.0,559.911
1,2,999,527854.0,527.854,2005565.0,2005.565
2,3,998,352583.0,352.583,3594309.0,3594.309
3,4,997,273720.0,273.72,474339.0,474.339
4,5,996,1069978.0,1069.978,937028.0,937.028
5,6,995,249728.0,249.728,73653.0,73.653
6,7,994,376317.0,376.317,516824.0,516.824
7,8,993,402392.0,402.392,2315182.0,2315.182
8,9,992,62320.0,62.32,75007.0,75.007
9,10,991,62558.7,62.5587,43758.0,43.758


In [11]:
# STEP 7 — NORMALIZATION

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

df["Assets_norm"] = scaler.fit_transform(df[["Assets_M"]])
df["MarketCap_norm"] = scaler.fit_transform(df[["MarketCap_Updated_M"]])
df["Rank_norm"] = scaler.fit_transform(df[["Rank"]])

# Show normalized values
df[[
    "Assets_M", "Assets_norm",
    "MarketCap_Updated_M", "MarketCap_norm",
    "Rank", "Rank_norm"
]].head(10)


Unnamed: 0,Assets_M,Assets_norm,MarketCap_Updated_M,MarketCap_norm,Rank,Rank_norm
0,252399.0,0.058274,559911.0,0.155777,1,0.0
1,527854.0,0.121961,2005565.0,0.557983,2,0.001001
2,352583.0,0.081437,3594309.0,1.0,3,0.002002
3,273720.0,0.063203,474339.0,0.131969,4,0.003003
4,1069978.0,0.247306,937028.0,0.260698,5,0.004004
5,249728.0,0.057656,73653.0,0.020492,6,0.005005
6,376317.0,0.086925,516824.0,0.14379,7,0.006006
7,402392.0,0.092953,2315182.0,0.644124,8,0.007007
8,62320.0,0.014325,75007.0,0.020868,9,0.008008
9,62558.7,0.014381,43758.0,0.012174,10,0.009009


In [12]:
# STEP 8.2 — COMPANY SIZE CLASSIFICATION

def size_class(assets):
    if assets > 200000:
        return "Mega"
    elif assets > 50000:
        return "Large"
    elif assets > 10000:
        return "Medium"
    else:
        return "Small"

df["Company_Size"] = df["Assets_M"].apply(size_class)

df[["Company", "Assets_M", "Company_Size"]].head(10)


Unnamed: 0,Company,Assets_M,Company_Size
0,Walmart,252399.0,Mega
1,Amazon,527854.0,Mega
2,Apple,352583.0,Mega
3,Unitedhealth Group,273720.0,Mega
4,Berkshire Hathaway,1069978.0,Mega
5,Cvs Health,249728.0,Mega
6,Exxon Mobil,376317.0,Mega
7,Alphabet,402392.0,Mega
8,Mckesson,62320.0,Large
9,Cencora,62558.7,Large


In [13]:
# STEP 9 — FINAL QUALITY CHECK

print("Missing values per column:")
print(df.isnull().sum())

print("\nData types:")
print(df.dtypes)

print("\nSample enriched data:")
df.head(10)


Missing values per column:
Rank                               0
Company                            0
Ticker                            41
Sector                             0
Industry                           0
Profitable                         0
Founder_is_CEO                     0
FemaleCEO                          0
Growth_in_Jobs                     0
Change_in_Rank                     0
Gained_in_Rank                     0
Dropped_in_Rank                    0
Newcomer_to_the_Fortune500       500
Global500                          0
Worlds_Most_Admired_Companies      0
Best_Companies_to_Work_For         0
Number_of_employees                0
MarketCap_March28_M                0
Revenues_M                         0
RevenuePercentChange               0
Profits_M                          0
ProfitsPercentChange               0
Assets_M                           0
CEO                                0
Country                            0
HeadquartersCity                   0
Headquarter

Unnamed: 0,Rank,Company,Ticker,Sector,Industry,Profitable,Founder_is_CEO,FemaleCEO,Growth_in_Jobs,Change_in_Rank,...,Footnote,MarketCap_Updated_M,Updated,Rank_Score,Assets_B,MarketCap_B,Assets_norm,MarketCap_norm,Rank_norm,Company_Size
0,1,Walmart,WMT,Retailing,General Merchandisers,1.0,0.0,0.0,0.0,0.0,...,"Figures are for fiscal year ended Jan. 31, 202...",559911.0,2024-08-05,1000,252.399,559.911,0.058274,0.155777,0.0,Mega
1,2,Amazon,AMZN,Retailing,Internet Services And Retailing,1.0,0.0,0.0,0.0,0.0,...,"Market value as of July 15, 2024.",2005565.0,2024-08-05,999,527.854,2005.565,0.121961,0.557983,0.001001,Mega
2,3,Apple,AAPL,Technology,"Computers, Office Equipment",1.0,0.0,0.0,0.0,1.0,...,"Figures are for fiscal year ended Sept. 30, 20...",3594309.0,2024-08-05,998,352.583,3594.309,0.081437,1.0,0.002002,Mega
3,4,Unitedhealth Group,UNH,Health Care,Health Care: Insurance And Managed Care,1.0,0.0,0.0,0.0,1.0,...,"Market value as of July 15, 2024.",474339.0,2024-08-05,997,273.72,474.339,0.063203,0.131969,0.003003,Mega
4,5,Berkshire Hathaway,BRKA,Financials,Insurance: Property And Casualty (Stock),1.0,0.0,0.0,0.0,2.0,...,"Market value as of July 15, 2024.",937028.0,2024-08-05,996,1069.978,937.028,0.247306,0.260698,0.004004,Mega
5,6,Cvs Health,CVS,Health Care,Health Care: Pharmacy And Other Services,1.0,0.0,1.0,0.0,0.0,...,"Market value as of July 15, 2024.",73653.0,2024-08-05,995,249.728,73.653,0.057656,0.020492,0.005005,Mega
6,7,Exxon Mobil,XOM,Energy,Petroleum Refining,1.0,0.0,0.0,0.0,-4.0,...,Excise taxes have been deducted. Market value ...,516824.0,2024-08-05,994,376.317,516.824,0.086925,0.14379,0.006006,Mega
7,8,Alphabet,GOOGL,Technology,Internet Services And Retailing,1.0,0.0,0.0,0.0,0.0,...,"Market value as of July 15, 2024.",2315182.0,2024-08-05,993,402.392,2315.182,0.092953,0.644124,0.007007,Mega
8,9,Mckesson,MCK,Health Care,Wholesalers: Health Care,1.0,0.0,0.0,0.0,0.0,...,"Figures are for fiscal year ended March 31, 20...",75007.0,2024-08-05,992,62.32,75.007,0.014325,0.020868,0.008008,Large
9,10,Cencora,COR,Health Care,Wholesalers: Health Care,1.0,0.0,0.0,0.0,1.0,...,"Figures are for fiscal year ended Sept. 30, 20...",43758.0,2024-08-05,991,62.5587,43.758,0.014381,0.012174,0.009009,Large


In [15]:
df.to_csv("fortune1000_cleaned.csv", index=False)
print("Saved as fortune1000_cleaned2.csv")

Saved as fortune1000_cleaned2.csv
