In [25]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder


# 가상 데이터 생성
data = {
    'TransactionID': range(1, 21),
    'CustomerID': [101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110],
    'PurchaseAmount': [250, -50, 3000000, 450, 0, 300, 200, 150, -10, 800, 50, 75, 400, np.nan, 600, 1000, 20, 5000, 150, 80],
    'PurchaseDate': pd.date_range(start='2024-01-01', periods=20, freq='ME').tolist(),
    'ProductCategory': ['Electronics', 'Clothing', 'Electronics', 'Home', 'Electronics', 'Home', 'Clothing', 'Home', 'Clothing', 'Electronics', 'Electronics', 'Home', 'Clothing', 'Electronics', 'Home', 'Home', 'Clothing', 'Electronics', 'Home', 'Electronics'],
    'CustomerAge': [25, 35, 45, np.nan, 22, 29, 33, 41, 27, 36, 28, 34, 42, 39, 24, 30, 32, 40, 38, 26],
    'CustomerGender': ['Male', 'Female', 'Female', 'Male', 'Female', 'Male', 'Female', np.nan, 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'],
    'ReviewScore': [5, np.nan, 4, 3, 2, 5, 3, 4, 1, 2, np.nan, 4, 5, 3, 4, np.nan, 1, 5, 2, 4]
}

df = pd.DataFrame(data)


In [27]:
# 1. 결측값 처리
df['PurchaseAmount'].fillna(df['PurchaseAmount'].mean(), inplace=True)
df['CustomerAge'].fillna(df['CustomerAge'].median(), inplace=True)
df['CustomerGender'].fillna(df['CustomerGender'].mode()[0], inplace=True)
df['ReviewScore'].fillna(df['ReviewScore'].mean(), inplace=True)

# 2. 이상치 처리
Q1 = df['PurchaseAmount'].quantile(0.25)
Q3 = df['PurchaseAmount'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df['PurchaseAmount'] = df['PurchaseAmount'].clip(lower_bound, upper_bound)
df['PurchaseAmount'] = df['PurchaseAmount'].clip(lower=0)  # 음수 값을 0으로 처리

# 3. 중복 데이터 제거
df.drop_duplicates(subset=['TransactionID'], keep='first', inplace=True)

# 4. 데이터 타입 변환
df['PurchaseDate'] = pd.to_datetime(df['PurchaseDate'])

# 5. 정규화
scaler = MinMaxScaler()
df['PurchaseAmount_Normalized'] = scaler.fit_transform(df[['PurchaseAmount']])

# 6. 범주형 데이터 인코딩
le = LabelEncoder()
df['ProductCategory_Encoded'] = le.fit_transform(df['ProductCategory'])
df['CustomerGender_Encoded'] = le.fit_transform(df['CustomerGender'])

# 7. 샘플링
sample_df = df.sample(n=5, random_state=42)

print(sample_df)

    TransactionID  CustomerID  PurchaseAmount PurchaseDate ProductCategory  \
0               1         101         250.000   2024-01-31     Electronics   
17             18         108        1521.875   2025-06-30     Electronics   
15             16         106        1000.000   2025-04-30            Home   
1               2         102           0.000   2024-02-29        Clothing   
8               9         109           0.000   2024-09-30        Clothing   

    CustomerAge CustomerGender  ReviewScore  PurchaseAmount_Normalized  \
0          25.0           Male     5.000000                   0.164271   
17         40.0         Female     5.000000                   1.000000   
15         30.0         Female     3.352941                   0.657084   
1          35.0         Female     3.352941                   0.000000   
8          27.0           Male     1.000000                   0.000000   

    ProductCategory_Encoded  CustomerGender_Encoded  
0                         1     

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['PurchaseAmount'].fillna(df['PurchaseAmount'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['CustomerAge'].fillna(df['CustomerAge'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the interm