Capstone Two - Pre-processing and Training Data Development

In [9]:
import pandas as pd

# Load the dataset
file_path = 'Largest companies in world.csv'
df = pd.read_csv(file_path)

# Display the first few rows of the dataframe and its summary
df.head(), df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   rank              2050 non-null   object
 1   organizationName  2050 non-null   object
 2   country           2050 non-null   object
 3   revenue           2049 non-null   object
 4   profits           2049 non-null   object
 5   assets            2049 non-null   object
 6   marketValue       2049 non-null   object
dtypes: object(7)
memory usage: 112.3+ KB


(  rank                          organizationName        country   revenue  \
 0    1                            JPMorgan Chase  United States  179.93 B   
 1    2  Saudi Arabian Oil Company (Saudi Aramco)   Saudi Arabia  589.47 B   
 2    3                                      ICBC          China  216.77 B   
 3    4                   China Construction Bank          China  203.08 B   
 4    5                Agricultural Bank of China          China  186.14 B   
 
     profits      assets marketValue  
 0    41.8 B   3,744.3 B    399.59 B  
 1  156.36 B    660.99 B  2,055.22 B  
 2   52.47 B  6,116.82 B    203.01 B  
 3   48.25 B  4,977.48 B    172.99 B  
 4   37.92 B  5,356.86 B    141.82 B  ,
 None)

In [10]:
# Function to convert financial metrics to numeric
def convert_to_numeric(value):
    if pd.isnull(value):
        return None
    try:
        # Remove 'B' and convert to billions
        return float(value.replace('B', '')) * 1e9
    except:
        return None

# Apply conversion to the relevant columns
for column in ['revenue', 'profits', 'assets', 'marketValue']:
    df[column] = df[column].apply(convert_to_numeric)

# Check the conversion and updated data types
df.head(), df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   rank              2050 non-null   object 
 1   organizationName  2050 non-null   object 
 2   country           2050 non-null   object 
 3   revenue           2010 non-null   float64
 4   profits           1086 non-null   float64
 5   assets            1987 non-null   float64
 6   marketValue       1995 non-null   float64
dtypes: float64(4), object(3)
memory usage: 112.3+ KB


(  rank                          organizationName        country       revenue  \
 0    1                            JPMorgan Chase  United States  1.799300e+11   
 1    2  Saudi Arabian Oil Company (Saudi Aramco)   Saudi Arabia  5.894700e+11   
 2    3                                      ICBC          China  2.167700e+11   
 3    4                   China Construction Bank          China  2.030800e+11   
 4    5                Agricultural Bank of China          China  1.861400e+11   
 
         profits        assets   marketValue  
 0  4.180000e+10           NaN  3.995900e+11  
 1  1.563600e+11  6.609900e+11           NaN  
 2  5.247000e+10           NaN  2.030100e+11  
 3  4.825000e+10           NaN  1.729900e+11  
 4  3.792000e+10           NaN  1.418200e+11  ,
 None)

In [11]:
# Create dummy variables for the 'country' column
df_dummies = pd.get_dummies(df, columns=['country'], drop_first=True)

# Display the first few rows to verify dummy creation
df_dummies.head()


Unnamed: 0,rank,organizationName,revenue,profits,assets,marketValue,country_Australia,country_Austria,country_Belgium,country_Bermuda,...,country_Sweden,country_Switzerland,country_Taiwan,country_Thailand,country_Turkey,country_United Arab Emirates,country_United Kingdom,country_United States,country_Uruguay,country_Vietnam
0,1,JPMorgan Chase,179930000000.0,41800000000.0,,399590000000.0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,2,Saudi Arabian Oil Company (Saudi Aramco),589470000000.0,156360000000.0,660990000000.0,,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,ICBC,216770000000.0,52470000000.0,,203010000000.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,China Construction Bank,203080000000.0,48250000000.0,,172990000000.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Agricultural Bank of China,186140000000.0,37920000000.0,,141820000000.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# We exclude 'marketValue' from features to be standardized because it's our target
features_to_scale = ['revenue', 'profits', 'assets']
scaler = StandardScaler()
df[features_to_scale] = scaler.fit_transform(df[features_to_scale].fillna(0))

# Prepare features (X) and target (y) for the model
X = df.drop(['rank', 'organizationName', 'marketValue'], axis=1)  # Exclude the target
y = df['marketValue']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Print shapes of the splits for verification
print("Training features shape:", X_train.shape)
print("Testing features shape:", X_test.shape)
print("Training target shape:", y_train.shape)
print("Testing target shape:", y_test.shape)

Training features shape: (1435, 4)
Testing features shape: (616, 4)
Training target shape: (1435,)
Testing target shape: (616,)
