In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


In [2]:
data = pd.read_csv('Fortune_1000.csv')

In [3]:
data

Unnamed: 0,company,rank,rank_change,revenue,profit,num. of employees,sector,city,state,newcomer,ceo_founder,ceo_woman,profitable,prev_rank,CEO,Website,Ticker,Market Cap
0,Walmart,1,0.0,572754.0,13673.0,2300000.0,Retailing,Bentonville,AR,no,no,no,yes,1.0,C. Douglas McMillon,https://www.stock.walmart.com,WMT,352037
1,Amazon,2,0.0,469822.0,33364.0,1608000.0,Retailing,Seattle,WA,no,no,no,yes,2.0,Andrew R. Jassy,www.amazon.com,AMZN,1202717
2,Apple,3,0.0,365817.0,94680.0,154000.0,Technology,Cupertino,CA,no,no,no,yes,3.0,Timothy D. Cook,www.apple.com,AAPL,2443962
3,CVS Health,4,0.0,292111.0,7910.0,258000.0,Health Care,Woonsocket,RI,no,no,yes,yes,4.0,Karen Lynch,https://www.cvshealth.com,CVS,125204
4,UnitedHealth Group,5,0.0,287597.0,17285.0,350000.0,Health Care,Minnetonka,MN,no,no,no,yes,5.0,Andrew P. Witty,www.unitedhealthgroup.com,UNH,500468
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Vizio Holding,996,0.0,2124.0,-39.4,800.0,Industrials,Irvine,CA,no,yes,no,no,,William W. Wang,https://www.vizio.com,VZIO,1705.1
996,1-800-Flowers.com,997,0.0,2122.2,118.7,4800.0,Retailing,Jericho,NY,no,no,no,yes,,Christopher G. McCann,https://www.1800flowers.com,FLWS,830
997,Cowen,998,0.0,2112.8,295.6,1534.0,Financials,New York,NY,no,no,no,yes,,Jeffrey Solomon,https://www.cowen.com,COWN,1078
998,Ashland,999,0.0,2111.0,220.0,4100.0,Chemicals,Wilmington,DE,no,no,no,yes,,Guillermo Novo,https://www.ashland.com,ASH,5601.9


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   company            1000 non-null   object 
 1   rank               1000 non-null   int64  
 2   rank_change        1000 non-null   float64
 3   revenue            1000 non-null   float64
 4   profit             997 non-null    float64
 5   num. of employees  999 non-null    float64
 6   sector             1000 non-null   object 
 7   city               1000 non-null   object 
 8   state              1000 non-null   object 
 9   newcomer           1000 non-null   object 
 10  ceo_founder        1000 non-null   object 
 11  ceo_woman          1000 non-null   object 
 12  profitable         1000 non-null   object 
 13  prev_rank          1000 non-null   object 
 14  CEO                1000 non-null   object 
 15  Website            1000 non-null   object 
 16  Ticker             951 no

In [3]:
def preprocess_inputs(df):
    df = df.copy()
    
    # Drop unused columns
    df = df.drop(['rank', 'rank_change', 'company', 'newcomer', 'prev_rank', 'CEO', 'Website', 'Ticker'], axis=1)
    
    # Encode missing values
    df['Market Cap'] = df['Market Cap'].replace('-', np.NaN).astype(np.float)
    
    # Drop missing target rows
    missing_target_rows = df[df['Market Cap'].isna()].index
    df = df.drop(missing_target_rows, axis=0).reset_index(drop=True)
    
    # Fill remaining missing values
    df['profit'] = df['profit'].fillna(df['profit'].mean())
    
    # Binary encoding
    for column in ['ceo_founder', 'ceo_woman', 'profitable']:
        df[column] = df[column].replace({'no': 0, 'yes': 1})
    
    # One-hot encoding
    for column in ['sector', 'city', 'state']:
        dummies = pd.get_dummies(df[column], prefix=column)
        df = pd.concat([df, dummies], axis=1)
        df = df.drop(column, axis=1)
    
    # Split df into X and y
    y = df['Market Cap']
    X = df.drop('Market Cap', axis=1)
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)
    
    # Scale X
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)
    
    return X_train, X_test, y_train, y_test

In [5]:
X_train, X_test, y_train, y_test = preprocess_inputs(data)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  df['Market Cap'] = df['Market Cap'].replace('-', np.NaN).astype(np.float)


Unnamed: 0,revenue,profit,num. of employees,ceo_founder,ceo_woman,profitable,sector_Aerospace & Defense,sector_Apparel,sector_Business Services,sector_Chemicals,...,state_PA,state_PR,state_RI,state_SC,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI
733,-0.344776,-0.325398,-0.272494,-0.237915,-0.301511,0.390716,-0.11651,-0.140452,-0.234404,5.435573,...,-0.227251,-0.038605,-0.054636,-0.054636,-0.1659,-0.332782,-0.077382,-0.179605,-0.122905,-0.13484
719,-0.342639,-0.353432,-0.220156,-0.237915,-0.301511,0.390716,-0.11651,-0.140452,-0.234404,-0.183973,...,-0.227251,-0.038605,-0.054636,-0.054636,-0.1659,3.004971,-0.077382,-0.179605,-0.122905,-0.13484
341,-0.192506,-0.512706,-0.204612,-0.237915,-0.301511,-2.559407,-0.11651,-0.140452,-0.234404,-0.183973,...,-0.227251,-0.038605,-0.054636,-0.054636,-0.1659,-0.332782,-0.077382,-0.179605,-0.122905,-0.13484
291,-0.132206,-0.134335,-0.190378,-0.237915,-0.301511,0.390716,-0.11651,-0.140452,-0.234404,-0.183973,...,-0.227251,-0.038605,-0.054636,-0.054636,-0.1659,-0.332782,-0.077382,-0.179605,-0.122905,-0.13484
46,1.362617,-1.737510,1.083651,-0.237915,-0.301511,-2.559407,-0.11651,-0.140452,-0.234404,-0.183973,...,-0.227251,-0.038605,-0.054636,-0.054636,-0.1659,-0.332782,-0.077382,-0.179605,-0.122905,-0.13484
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
767,-0.349622,-0.405703,-0.174619,-0.237915,-0.301511,-2.559407,-0.11651,-0.140452,-0.234404,-0.183973,...,-0.227251,-0.038605,-0.054636,-0.054636,-0.1659,-0.332782,-0.077382,5.567764,-0.122905,-0.13484
72,0.744031,0.307195,2.510298,-0.237915,-0.301511,0.390716,-0.11651,-0.140452,-0.234404,-0.183973,...,-0.227251,-0.038605,-0.054636,-0.054636,-0.1659,-0.332782,-0.077382,-0.179605,-0.122905,-0.13484
908,-0.370173,-0.337725,-0.292367,-0.237915,-0.301511,0.390716,-0.11651,-0.140452,-0.234404,5.435573,...,-0.227251,-0.038605,-0.054636,-0.054636,-0.1659,-0.332782,-0.077382,5.567764,-0.122905,-0.13484
235,-0.073811,0.064236,-0.293794,-0.237915,-0.301511,0.390716,-0.11651,-0.140452,-0.234404,-0.183973,...,-0.227251,-0.038605,-0.054636,-0.054636,-0.1659,3.004971,-0.077382,-0.179605,-0.122905,-0.13484


Unnamed: 0,rank_change,revenue,profit,num. of employees,sector,city,state,newcomer,ceo_founder,ceo_woman,profitable,Market Cap
473,-97.0,6411.0,,15640.0,Telecommunications,Norwalk,CT,no,no,no,no,6763.1
509,0.0,5809.0,,1300.0,Energy,Oklahoma City,OK,no,no,no,no,11135.9
704,0.0,3633.0,,7600.0,Motor Vehicles & Parts,Plymouth,MI,no,no,no,no,463.9


In [34]:
X['profit'].mode()

0    1995.0
Name: profit, dtype: float64

In [39]:
X.dtypes

rank_change          float64
revenue              float64
profit               float64
num. of employees    float64
sector                object
city                  object
state                 object
newcomer              object
ceo_founder           object
ceo_woman             object
profitable            object
Market Cap           float64
dtype: object

In [7]:
 
models = {
    "     Linear Regression": LinearRegression(),
    "Linear Regression (L2)": Ridge(),
    "Linear Regression (L1)": Lasso(),
    "         Decision Tree": DecisionTreeRegressor(),
    "        Neural Network": MLPRegressor(),
    "         Random Forest": RandomForestRegressor(),
    "     Gradient Boosting": GradientBoostingRegressor()
}

for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")

     Linear Regression trained.
Linear Regression (L2) trained.
Linear Regression (L1) trained.
         Decision Tree trained.


  model = cd_fast.enet_coordinate_descent(


        Neural Network trained.
         Random Forest trained.
     Gradient Boosting trained.


In [8]:
for name, model in models.items():
    y_pred = model.predict(X_test)
    rmse = np.sqrt(np.mean((y_test - y_pred)**2))
    print(name + " RMSE: {:.2f}".format(rmse))

     Linear Regression RMSE: 7808445097466954752.00
Linear Regression (L2) RMSE: 117076.85
Linear Regression (L1) RMSE: 123321.16
         Decision Tree RMSE: 130253.87
        Neural Network RMSE: 197773.63
         Random Forest RMSE: 116999.03
     Gradient Boosting RMSE: 116340.47


In [9]:
for name, model in models.items():
    r2 = model.score(X_test, y_test)
    print(name + " R^2 Score: {:.5f}".format(r2))

     Linear Regression R^2 Score: -1655109980793177000588083200.00000
Linear Regression (L2) R^2 Score: 0.62792
Linear Regression (L1) R^2 Score: 0.58717
         Decision Tree R^2 Score: 0.53945
        Neural Network R^2 Score: -0.06178
         Random Forest R^2 Score: 0.62841
     Gradient Boosting R^2 Score: 0.63258
