In [1]:
import pandas as pd
import numpy as np

In [2]:
# Toy sales table. Customer 106 has no country and customer 104 has no
# revenue, so the missing-value cells further down have something to act on.
sales = pd.DataFrame(dict(
    customer_id=[101, 102, 103, 104, 105, 106],
    country=['Korea', 'USA', 'Korea', 'Japan', 'China', np.nan],
    product=['Shoes', 'Shoes', 'Hat', 'Hat', 'Bag', 'Bag'],
    quantity=[3, 1, 2, 5, 1, 2],
    revenue=[120000, 60000, 40000, np.nan, 80000, 50000],
))

# Customer lookup table. Its ids only partially overlap with `sales`:
# 106 exists only in `sales`, 107 only here.
customers = pd.DataFrame(dict(
    customer_id=[101, 102, 103, 104, 105, 107],
    name=['Kim', 'Smith', 'Lee', 'Tanaka', 'Li', 'Wang'],
    age=[32, 45, 28, 38, 41, 36],
))

# Write both tables to CSV in the working directory; index=False keeps the
# row index out of the files.
customers.to_csv('customers.csv', index=False)
sales.to_csv('sales.csv', index=False)

In [3]:
# Reload the sales table from the CSV file written earlier in the notebook.
df = pd.read_csv('sales.csv')
# Preview the first five rows (five is head()'s default).
df.head(5)

Unnamed: 0,customer_id,country,product,quantity,revenue
0,101,Korea,Shoes,3,120000.0
1,102,USA,Shoes,1,60000.0
2,103,Korea,Hat,2,40000.0
3,104,Japan,Hat,5,
4,105,China,Bag,1,80000.0


In [4]:
# Select a single column by label; the result is a Series.
df.loc[:, 'revenue']

0    120000.0
1     60000.0
2     40000.0
3         NaN
4     80000.0
5     50000.0
Name: revenue, dtype: float64

In [5]:
# Select several columns at once; passing a list of labels yields a DataFrame.
df.loc[:, ['product', 'revenue']]

Unnamed: 0,product,revenue
0,Shoes,120000.0
1,Shoes,60000.0
2,Hat,40000.0
3,Hat,
4,Bag,80000.0
5,Bag,50000.0


In [7]:
# Keep only the rows whose revenue is strictly above 80,000. A missing
# revenue compares as False, so that row is left out as well.
df_high = df.loc[df['revenue'].gt(80000)]
df_high

Unnamed: 0,customer_id,country,product,quantity,revenue
0,101,Korea,Shoes,3,120000.0


In [9]:
# Combine two row conditions with & (element-wise AND): country is Korea
# and revenue is at least 80,000.
df.loc[df['country'].eq('Korea') & df['revenue'].ge(80000)]

Unnamed: 0,customer_id,country,product,quantity,revenue
0,101,Korea,Shoes,3,120000.0


In [10]:
# Count missing values per column: isna() marks each missing cell as True,
# and summing down the rows (axis=0) counts the True values in each column.
df.isna().sum(axis=0)

customer_id    0
country        1
product        0
quantity       0
revenue        1
dtype: int64

In [12]:
# Drop rows with a missing value. Only `revenue` is checked, so a row whose
# country is missing is still kept.
df_clean = df.dropna(axis='index', how='any', subset=['revenue'])
df_clean

Unnamed: 0,customer_id,country,product,quantity,revenue
0,101,Korea,Shoes,3,120000.0
1,102,USA,Shoes,1,60000.0
2,103,Korea,Hat,2,40000.0
4,105,China,Bag,1,80000.0
5,106,,Bag,2,50000.0


In [14]:
# Average revenue; missing values are skipped rather than propagated.
df['revenue'].mean(skipna=True)

np.float64(70000.0)

In [13]:
# Replace the missing revenue with the column mean. The mean is evaluated
# before the assignment and skips missing values, so it is the average of
# the revenues that are actually present.
# NOTE(review): this overwrites df['revenue'] in place, so any output that
# displayed df before this cell no longer matches the current df.
df['revenue'] = df['revenue'].fillna(df['revenue'].mean())
df

Unnamed: 0,customer_id,country,product,quantity,revenue
0,101,Korea,Shoes,3,120000.0
1,102,USA,Shoes,1,60000.0
2,103,Korea,Hat,2,40000.0
3,104,Japan,Hat,5,70000.0
4,105,China,Bag,1,80000.0
5,106,,Bag,2,50000.0


In [19]:
# Step 4. Grouping and aggregation
# Mean revenue per country. Rows whose group key is missing are excluded
# (dropna=True is the default), so the row without a country is not counted.
df.groupby(by='country', dropna=True)['revenue'].mean()

country
China    80000.0
Japan    70000.0
Korea    80000.0
USA      60000.0
Name: revenue, dtype: float64

In [20]:
# Group by two keys and compute several aggregates at once; the result has a
# (country, product) MultiIndex and one column per aggregate. Rows with a
# missing key are excluded (dropna=True is the default).
df.groupby(by=['country', 'product'], dropna=True)['revenue'].agg(['mean', 'sum'])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,sum
country,product,Unnamed: 2_level_1,Unnamed: 3_level_1
China,Bag,80000.0,80000.0
Japan,Hat,70000.0,70000.0
Korea,Hat,40000.0,40000.0
Korea,Shoes,120000.0,120000.0
USA,Shoes,60000.0,60000.0


In [21]:
# Order rows from highest to lowest revenue. This returns a new frame and
# leaves df itself unsorted.
df.sort_values(by=['revenue'], ascending=[False])

Unnamed: 0,customer_id,country,product,quantity,revenue
0,101,Korea,Shoes,3,120000.0
4,105,China,Bag,1,80000.0
3,104,Japan,Hat,5,70000.0
1,102,USA,Shoes,1,60000.0
5,106,,Bag,2,50000.0
2,103,Korea,Hat,2,40000.0
