In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import norm

In [2]:
# Korean font setup for matplotlib.
plt.rcParams.update({
    'font.family': 'NanumGothic',  # use the NanumGothic font
    'axes.unicode_minus': False,   # keep the minus sign from rendering as a broken glyph
})

In [3]:
# Read the sheet named "308명" from the Excel workbook into `df`.
df = pd.read_excel(io="./data/SR_ROW.xlsx", sheet_name="308명")

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# Show the column labels of the loaded frame.
df.columns

In [None]:
# Count the missing values in '연령'. Summing the boolean mask yields a single
# number instead of a long per-row True/False listing.
df['연령'].isnull().sum()

In [None]:
# Frequency of each distinct value in '연령', most common first.
age_counts = df['연령'].value_counts()
age_counts

In [None]:
# Collect every column whose label contains the "(WORK_" marker, keeping the
# original column order.
work_columns = []
for column_name in df.columns:
    if "(WORK_" in column_name:
        work_columns.append(column_name)

# Report how many such columns were found, then list them.
print(len(work_columns), work_columns, sep="\n")

In [None]:
# Label every row with a decade band derived from '연령' (e.g. 34 -> "30대");
# rows whose '연령' is missing get the "알수없음" label.
df['연령대'] = df['연령'].apply(lambda x: f"{int(x//10*10)}대" if pd.notnull(x) else "알수없음")

# Mean and standard deviation of every WORK_* column within each band.
grouped_stats = df.groupby('연령대')[work_columns].agg(['mean', 'std'])

# Leave the frame as the cell's last expression so the notebook renders it as
# a table; print() would emit a plain-text dump instead.
grouped_stats

In [None]:
# The first WORK_* column is the variable whose distribution is compared
# across age bands (this name is reused by the normal-PDF cell).
first_work_col = work_columns[0]

fig, ax = plt.subplots(figsize=(10, 6))
for age_group in df['연령대'].unique():
    group_values = df.loc[df['연령대'] == age_group, first_work_col].dropna()
    # A kernel density estimate needs at least two distinct observations.
    # Skipping degenerate bands avoids seaborn's singular-data warning (or an
    # error on older versions) and mirrors the NaN / zero-std guard used when
    # the normal PDFs are drawn.
    if group_values.nunique() < 2:
        continue
    sns.kdeplot(data=group_values, label=f"{age_group}", ax=ax)

ax.set_title(f"{first_work_col} 연령대별 분포(KDE)")
ax.set_xlabel(first_work_col)
ax.set_ylabel("Density")
ax.legend()
plt.show()

In [None]:
# For each age band, draw the normal density implied by that band's mean and
# standard deviation of `first_work_col`, over the column's observed range.
fig, ax = plt.subplots(figsize=(10, 6))
x_vals = np.linspace(df[first_work_col].min(), df[first_work_col].max(), 100)
band_stats = grouped_stats[first_work_col]  # columns: 'mean', 'std'

for age_group in df['연령대'].unique():
    mean_val = band_stats.loc[age_group, 'mean']
    std_val = band_stats.loc[age_group, 'std']
    # A band with a missing statistic or zero spread cannot define a curve.
    if pd.isna(mean_val) or pd.isna(std_val) or std_val == 0:
        continue
    y_vals = norm.pdf(x_vals, loc=mean_val, scale=std_val)
    ax.plot(x_vals, y_vals, label=f"{age_group} (Mean={mean_val:.2f}, Std={std_val:.2f})")

ax.set_title(f"{first_work_col} 연령대별 정규분포 가정 PDF")
ax.set_xlabel(first_work_col)
ax.set_ylabel("Density")
ax.legend()
plt.show()

In [None]:
# Derive the decade band label from '연령' ("알수없음" when the value is
# missing) so this cell does not depend on the column already existing.
df['연령대'] = df['연령'].apply(lambda x: f"{int(x//10*10)}대" if pd.notnull(x) else "알수없음")

# Rows: age band, columns: '본사/현업' category. `count_table` holds raw
# counts; `ratio_table` divides each row by its total so every row sums to 1.
count_table = pd.crosstab(df['연령대'], df['본사/현업'])
ratio_table = count_table.div(count_table.sum(axis=1), axis=0)

# DataFrame.plot opens a brand-new figure unless it is handed an Axes, which
# would leave a separately created figure empty and ignore its figsize.
# Creating the Axes first and passing it in keeps the chart at 10x6.
fig, ax = plt.subplots(figsize=(10, 6))
ratio_table.plot(kind='bar', stacked=True, ax=ax)
ax.set_title("연령대별 본사/현업 비율")
ax.set_xlabel("연령대")
ax.set_ylabel("비율")
# Anchor the legend outside the plotting area, to the right of the bars.
ax.legend(title="본사/현업", bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

In [None]:
# Overall share of each '본사/현업' category, drawn as a pie chart.
count_table = df['본사/현업'].value_counts()

plt.figure(figsize=(6, 6))
ax = count_table.plot.pie(autopct='%.1f%%', startangle=90)
ax.set_title("전체 본사/현업 비율")
ax.set_ylabel("")  # blank out the y-axis label on the pie
plt.show()

In [None]:
# Select the '근속년수' column.
tenure_data = df['근속년수']

# Draw its distribution as a 20-bin histogram with dashed horizontal grid lines.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(tenure_data, bins=20, alpha=0.75, edgecolor='k')
ax.set_title('근속년수 분포', fontsize=14)
ax.set_xlabel('근속년수 (년)', fontsize=12)
ax.set_ylabel('빈도', fontsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()