In [4]:
import pandas as pd

# Sample data; the 'Administration' column contains one missing value (NaN).
data = {
    'R&D Spend': [45, 10, 20, 32, 40, 14, 6],
    'Administration': [1000.23, float('NaN'), 76253.86, 63408.86, 28754.33, 119943.24, 134615.46],
    'Marketing Spend': [124153.04, 110594.11, 113867.30, 129219.61, 118546.05, 156547.42, 147198.87],
    'State': ['New York', 'Florida', 'California', 'California', 'California', 'Florida', 'California'],
    'Profit': [64926.08, 146121.95, 118474.03, 97427.84, 78239.91, 132602.65, 156122.51]
}

# Build the DataFrame
df = pd.DataFrame(data)

# Identify the columns that contain at least one missing value
columns_with_nan = df.columns[df.isnull().any()].tolist()

# Fill the missing values with the column mean.
# The result is assigned back to the frame instead of calling
# `df[column].fillna(..., inplace=True)`: that chained in-place call operates
# on an intermediate Series, triggers a FutureWarning on pandas 2.x and leaves
# `df` unchanged when Copy-on-Write is enabled.
for column in columns_with_nan:
    mean_value = df[column].mean()  # NaN entries are skipped by Series.mean()
    df[column] = df[column].fillna(mean_value)

# Show the DataFrame after preprocessing
print(df)


   R&D Spend  Administration  Marketing Spend       State     Profit
0         45     1000.230000        124153.04    New York   64926.08
1         10    70662.663333        110594.11     Florida  146121.95
2         20    76253.860000        113867.30  California  118474.03
3         32    63408.860000        129219.61  California   97427.84
4         40    28754.330000        118546.05  California   78239.91
5         14   119943.240000        156547.42     Florida  132602.65
6          6   134615.460000        147198.87  California  156122.51


In [5]:
import pandas as pd

# Initial data. The None entry in 'Administration' is left unfilled in this
# cell; it shows up as NaN in the printed frame.
data = {
    'R&D Spend': [45, 10, 20, 32, 40, 14, 6],
    'Administration': [
        1000.23, None, 76253.86, 63408.86, 28754.33, 119943.24, 134615.46,
    ],
    'Marketing Spend': [
        124153.04, 110594.11, 113867.30, 129219.61, 118546.05, 156547.42, 147198.87,
    ],
    'State': [
        'New York', 'Florida', 'California', 'California',
        'California', 'Florida', 'California',
    ],
    'Profit': [
        64926.08, 146121.95, 118474.03, 97427.84, 78239.91, 132602.65, 156122.51,
    ],
}

# Turn the column dictionary into a DataFrame
df = pd.DataFrame(data=data)

# One-hot encode the categorical 'State' column: it is replaced by one
# indicator column per distinct state, named 'State_<value>'.
df_encoded = pd.get_dummies(data=df, columns=['State'])

# Display the encoded result
print(df_encoded)


   R&D Spend  Administration  Marketing Spend     Profit  State_California  \
0         45         1000.23        124153.04   64926.08             False   
1         10             NaN        110594.11  146121.95             False   
2         20        76253.86        113867.30  118474.03              True   
3         32        63408.86        129219.61   97427.84              True   
4         40        28754.33        118546.05   78239.91              True   
5         14       119943.24        156547.42  132602.65             False   
6          6       134615.46        147198.87  156122.51              True   

   State_Florida  State_New York  
0          False            True  
1           True           False  
2          False           False  
3          False           False  
4          False           False  
5           True           False  
6          False           False  


In [6]:
import pandas as pd

# Build the DataFrame from the given data; 'Administration' has one NaN entry.
data = {
    'R&D Spend': [45, 10, 20, 32, 40, 14, 6],
    'Administration': [1000.23, float('nan'), 76253.86, 63408.86, 28754.33, 119943.24, 134615.46],
    'Marketing Spend': [124153.04, 110594.11, 113867.30, 129219.61, 118546.05, 156547.42, 147198.87],
    'State': ['New York', 'Florida', 'California', 'California', 'California', 'Florida', 'California'],
    'Profit': [64926.08, 146121.95, 118474.03, 97427.84, 78239.91, 132602.65, 156122.51]
}

df = pd.DataFrame(data)

# Fill the NaN in 'Administration' with the column median.
# The filled Series is assigned back to the frame rather than using
# `df['Administration'].fillna(..., inplace=True)`: that chained in-place call
# works on an intermediate Series, triggers a FutureWarning on pandas 2.x and
# does not update `df` when Copy-on-Write is enabled.
administration_median = df['Administration'].median()
df['Administration'] = df['Administration'].fillna(administration_median)

# Add the new 'Tax' column: a fixed rate applied to the sum of
# Profit, Marketing Spend and Administration.
TAX_RATE = 0.05
df['Tax'] = (df['Profit'] + df['Marketing Spend'] + df['Administration']) * TAX_RATE

print(df)


   R&D Spend  Administration  Marketing Spend       State     Profit  \
0         45         1000.23        124153.04    New York   64926.08   
1         10        69831.36        110594.11     Florida  146121.95   
2         20        76253.86        113867.30  California  118474.03   
3         32        63408.86        129219.61  California   97427.84   
4         40        28754.33        118546.05  California   78239.91   
5         14       119943.24        156547.42     Florida  132602.65   
6          6       134615.46        147198.87  California  156122.51   

          Tax  
0   9503.9675  
1  16327.3710  
2  15429.7595  
3  14502.8155  
4  11277.0145  
5  20454.6655  
6  21896.8420  


In [10]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Load the data into a DataFrame; 'Administration' has one missing value.
data = {
    'R&D Spend': [45, 10, 20, 32, 40, 14, 6],
    'Administration': [1000.23, np.nan, 76253.86, 63408.86, 28754.33, 119943.24, 134615.46],
    'Marketing Spend': [124153.04, 110594.11, 113867.30, 129219.61, 118546.05, 156547.42, 147198.87],
    'State': ['New York', 'Florida', 'California', 'California', 'California', 'Florida', 'California'],
    'Profit': [64926.08, 146121.95, 118474.03, 97427.84, 78239.91, 132602.65, 156122.51]
}

df = pd.DataFrame(data)

# Fill the missing value in 'Administration' with the column mean.
# Assign the filled Series back to the frame: the chained form
# `df['Administration'].fillna(..., inplace=True)` acts on an intermediate
# Series, triggers a FutureWarning on pandas 2.x and leaves `df` unchanged
# when Copy-on-Write is enabled.
df['Administration'] = df['Administration'].fillna(df['Administration'].mean())

# Numeric columns to be scaled; the categorical 'State' column is left as-is.
numeric_cols = ['R&D Spend', 'Administration', 'Marketing Spend', 'Profit']

# Standardise the numeric columns with StandardScaler: fit the scaler on the
# selected columns first, then write the transformed values back in place of
# the originals.
scaler = StandardScaler()
scaler.fit(df[numeric_cols])
df[numeric_cols] = scaler.transform(df[numeric_cols])

print(df)


   R&D Spend  Administration  Marketing Spend       State    Profit
0   1.496707       -1.603505        -0.277473    New York -1.519197
1  -0.980950        0.000000        -1.125502     Florida  1.024661
2  -0.273048        0.128699        -0.920784  California  0.158454
3   0.576434       -0.166970         0.039410  California -0.500921
4   1.142756       -0.964655        -0.628156  California -1.102077
5  -0.697789        1.134351         1.748600     Florida  0.601102
6  -1.264110        1.472079         1.163905  California  1.337977
