# Basic Settings

In [1]:
### 한글 폰트 설치
!apt-get install -y fonts-nanum
!fc-cache -fv
!rm ~/.cache/matplotlib -rf
# 설치 후 colab의 경우 Runtime > Restart session 필요

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  fonts-nanum
0 upgraded, 1 newly installed, 0 to remove and 45 not upgraded.
Need to get 10.3 MB of archives.
After this operation, 34.1 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 fonts-nanum all 20200506-1 [10.3 MB]
Fetched 10.3 MB in 1s (10.3 MB/s)
Selecting previously unselected package fonts-nanum.
(Reading database ... 123586 files and directories currently installed.)
Preparing to unpack .../fonts-nanum_20200506-1_all.deb ...
Unpacking fonts-nanum (20200506-1) ...
Setting up fonts-nanum (20200506-1) ...
Processing triggers for fontconfig (2.13.1-4.2ubuntu5) ...
/usr/share/fonts: caching, new cache contents: 0 fonts, 1 dirs
/usr/share/fonts/truetype: caching, new cache contents: 0 fonts, 3 dirs
/usr/share/fonts/truetype/humor-sans: caching, new cache contents: 1 fonts, 0 dirs
/usr/

In [2]:
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rc('font', family='NanumBarunGothic') # 혹은 다른 설치한 Nanum 폰트 사용

In [3]:
import pandas as pd
import seaborn as sns
import numpy as np
import scipy
import scipy.stats as stats

# 전국 지역별 혼인, 출산율 분석

## 분석 목적 : 전국 지역별 혼인, 출산율 상위지역과 하위지역 포지셔닝

### dataset 불러오기

In [15]:
# 구글드라이브 마운트
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
# 출산율 data 불러오기

# CSV 파일 경로 지정
file_path01 = '/content/drive/MyDrive/천안 공공데이터 활용 정책제안 공모전/dataset/4사분면/시군구_합계출산율__모의_연령별_출산율_20240730112405.csv'

# CSV 파일을 DataFrame으로 읽어오기
df_total_birthrate = pd.read_csv(file_path01)

In [17]:
df_total_birthrate

Unnamed: 0.1,Unnamed: 0,시군구별,2018,2019,2020,2021,2022
0,서울특별시,서울특별시,0.761,0.717,0.642,0.626,0.593
1,서울특별시,종로구,0.606,0.648,0.522,0.531,0.477
2,서울특별시,중구,0.737,0.780,0.688,0.634,0.606
3,서울특별시,용산구,0.747,0.713,0.634,0.664,0.596
4,서울특별시,성동구,0.909,0.855,0.783,0.764,0.723
...,...,...,...,...,...,...,...
258,제주특별자치도,제주특별자치도,1.220,1.145,1.021,0.951,0.919
259,제주특별자치도,제주시,1.264,1.150,1.035,0.969,0.926
260,제주특별자치도,서귀포시,1.081,1.130,0.977,0.891,0.898
261,제주특별자치도,북제주군,0.000,0.000,0.000,0.000,0.000


In [18]:
# 혼인율 data 불러오기

# CSV 파일 경로 지정
file_path02 = '/content/drive/MyDrive/천안 공공데이터 활용 정책제안 공모전/dataset/4사분면/시도_시군구_월별_혼인_20240730112413.csv'

# CSV 파일을 DataFrame으로 읽어오기
df_marriage_rate = pd.read_csv(file_path02)

In [19]:
df_marriage_rate

Unnamed: 0.1,Unnamed: 0,시군구별,2018,2019,2020,2021,2022
0,,서울특별시,52005,48261,44746,37012,35752
1,서울특별시,종로구,676,590,565,478,486
2,서울특별시,중구,778,722,653,590,511
3,서울특별시,용산구,1400,1303,1175,995,948
4,서울특별시,성동구,2000,1758,1573,1272,1179
...,...,...,...,...,...,...,...
244,제주특별자치도,제주시,2666,2457,2214,1933,2034
245,제주특별자치도,서귀포시,972,901,767,728,684
246,제주특별자치도,북제주군,0,0,0,0,0
247,제주특별자치도,남제주군,0,0,0,0,0


---

### 전국 지역별 출산율 분석

In [20]:
df_total_birthrate

Unnamed: 0.1,Unnamed: 0,시군구별,2018,2019,2020,2021,2022
0,서울특별시,서울특별시,0.761,0.717,0.642,0.626,0.593
1,서울특별시,종로구,0.606,0.648,0.522,0.531,0.477
2,서울특별시,중구,0.737,0.780,0.688,0.634,0.606
3,서울특별시,용산구,0.747,0.713,0.634,0.664,0.596
4,서울특별시,성동구,0.909,0.855,0.783,0.764,0.723
...,...,...,...,...,...,...,...
258,제주특별자치도,제주특별자치도,1.220,1.145,1.021,0.951,0.919
259,제주특별자치도,제주시,1.264,1.150,1.035,0.969,0.926
260,제주특별자치도,서귀포시,1.081,1.130,0.977,0.891,0.898
261,제주특별자치도,북제주군,0.000,0.000,0.000,0.000,0.000


In [21]:
# 상위 행정구역 리스트
korea_regions = [
    "서울특별시", "부산광역시", "대구광역시", "인천광역시", "광주광역시", "대전광역시", "울산광역시",
    "세종특별자치시", "경기도", "강원특별자치도", "충청북도", "충청남도", "전북특별자치도", "전라남도",
    "경상북도", "경상남도", "제주특별자치도"
]

In [22]:
df_total_birthrate['시군구별']

0        서울특별시
1          종로구
2           중구
3          용산구
4          성동구
        ...   
258    제주특별자치도
259        제주시
260       서귀포시
261       북제주군
262       남제주군
Name: 시군구별, Length: 263, dtype: object

In [23]:
# 행정구역 추출 후 제외

# '시군구별' column에서 대한민국 행정구역만 추출하여 새로운 DataFrame 생성
df_regions_birthrate = df_total_birthrate[df_total_birthrate['시군구별'].isin(korea_regions)]

# 기존 DataFrame에서 행정구역을 제외한 나머지 행들만 남기기
df_others_birthratee = df_total_birthrate[~df_total_birthrate['시군구별'].isin(korea_regions)]

In [24]:
# 인덱스를 재설정
df_others_birthratee = df_others_birthratee.reset_index(drop=True)
df_others_birthratee

Unnamed: 0.1,Unnamed: 0,시군구별,2018,2019,2020,2021,2022
0,서울특별시,종로구,0.606,0.648,0.522,0.531,0.477
1,서울특별시,중구,0.737,0.780,0.688,0.634,0.606
2,서울특별시,용산구,0.747,0.713,0.634,0.664,0.596
3,서울특별시,성동구,0.909,0.855,0.783,0.764,0.723
4,서울특별시,광진구,0.713,0.652,0.527,0.525,0.461
...,...,...,...,...,...,...,...
241,경상남도,합천군,0.935,1.110,0.927,0.805,1.011
242,제주특별자치도,제주시,1.264,1.150,1.035,0.969,0.926
243,제주특별자치도,서귀포시,1.081,1.130,0.977,0.891,0.898
244,제주특별자치도,북제주군,0.000,0.000,0.000,0.000,0.000


In [26]:
# 2022년도 출생률의 평균 계산
mean_2022 = df_others_birthratee['2022'].mean()

In [28]:
# 새로운 컬럼 추가 및 평균보다 낮은 지역은 0, 높은 지역은 1로 할당
df_others_birthratee['2022_평균비교'] = df_others_birthratee['2022'].apply(lambda x: 1 if x > mean_2022 else 0)

# 2022년도 출생률 기준으로 오름차순 정렬
df_others_birthratee = df_others_birthratee.sort_values(by='2022').reset_index(drop=True)

df_others_birthratee

Unnamed: 0.1,Unnamed: 0,시군구별,2018,2019,2020,2021,2022,2022_평균비교
0,제주특별자치도,남제주군,0.000,0.000,0.000,0.000,0.000,0
1,인천광역시,남구,0.000,0.000,0.000,0.000,0.000,0
2,경기도,양주군,0.000,0.000,0.000,0.000,0.000,0
3,경기도,여주군,0.000,0.000,0.000,0.000,0.000,0
4,경기도,화성군,0.000,0.000,0.000,0.000,0.000,0
...,...,...,...,...,...,...,...,...
241,강원특별자치도,양구군,1.358,1.473,1.428,1.518,1.434,1
242,경상북도,의성군,1.630,1.761,1.598,1.375,1.457,1
243,경상북도,군위군,1.179,1.233,1.014,1.060,1.486,1
244,전북특별자치도,임실군,1.397,1.416,1.749,1.803,1.560,1


In [32]:
df_others_birthratee_filtered = df_others_birthratee.loc[:,['Unnamed: 0','시군구별','2022', '2022_평균비교']]
df_others_birthratee_filtered

Unnamed: 0.1,Unnamed: 0,시군구별,2022,2022_평균비교
0,제주특별자치도,남제주군,0.000,0
1,인천광역시,남구,0.000,0
2,경기도,양주군,0.000,0
3,경기도,여주군,0.000,0
4,경기도,화성군,0.000,0
...,...,...,...,...
241,강원특별자치도,양구군,1.434,1
242,경상북도,의성군,1.457,1
243,경상북도,군위군,1.486,1
244,전북특별자치도,임실군,1.560,1


In [33]:
# 2022년도 데이터가 0인 행을 삭제
df_others_birthratee_filtered = df_others_birthratee_filtered[df_others_birthratee_filtered['2022'] != 0]
df_others_birthratee_filtered

Unnamed: 0.1,Unnamed: 0,시군구별,2022,2022_평균비교
15,서울특별시,관악구,0.422,0
16,대구광역시,서구,0.459,0
17,서울특별시,광진구,0.461,0
18,부산광역시,중구,0.462,0
19,서울특별시,종로구,0.477,0
...,...,...,...,...
241,강원특별자치도,양구군,1.434,1
242,경상북도,의성군,1.457,1
243,경상북도,군위군,1.486,1
244,전북특별자치도,임실군,1.560,1


In [45]:
df_others_birthratee_filtered.describe()

Unnamed: 0,2022,2022_평균비교
count,231.0,231.0
mean,0.867251,0.614719
std,0.209211,0.487719
min,0.422,0.0
25%,0.7275,0.0
50%,0.855,1.0
75%,0.962,1.0
max,1.803,1.0


In [47]:
# 2022년도 출생률의 사분위수 계산
q25 = df_others_birthratee_filtered['2022'].quantile(0.25)
q50 = df_others_birthratee_filtered['2022'].quantile(0.50)
q75 = df_others_birthratee_filtered['2022'].quantile(0.75)

# 새로운 컬럼 추가 및 구간별 값 할당
def assign_quartile(value):
    if value <= q25:
        return 0
    elif value <= q50:
        return 1
    elif value <= q75:
        return 3
    else:
        return 4

df_others_birthratee_filtered['2022_구간비교'] = df_others_birthratee_filtered['2022'].apply(assign_quartile)

# 2022년도 출생률 기준으로 오름차순 정렬
df_others_birthratee_sorted = df_others_birthratee_filtered.sort_values(by='2022').reset_index(drop=True)

df_others_birthratee_sorted

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_others_birthratee_filtered['2022_구간비교'] = df_others_birthratee_filtered['2022'].apply(assign_quartile)


Unnamed: 0.1,Unnamed: 0,시군구별,2022,2022_평균비교,2022_구간비교
0,서울특별시,관악구,0.422,0,0
1,대구광역시,서구,0.459,0,0
2,서울특별시,광진구,0.461,0,0
3,부산광역시,중구,0.462,0,0
4,서울특별시,종로구,0.477,0,0
...,...,...,...,...,...
226,강원특별자치도,양구군,1.434,1,4
227,경상북도,의성군,1.457,1,4
228,경상북도,군위군,1.486,1,4
229,전북특별자치도,임실군,1.560,1,4


---

### 전국 지역별 혼인율 분석

In [34]:
df_marriage_rate

Unnamed: 0.1,Unnamed: 0,시군구별,2018,2019,2020,2021,2022
0,,서울특별시,52005,48261,44746,37012,35752
1,서울특별시,종로구,676,590,565,478,486
2,서울특별시,중구,778,722,653,590,511
3,서울특별시,용산구,1400,1303,1175,995,948
4,서울특별시,성동구,2000,1758,1573,1272,1179
...,...,...,...,...,...,...,...
244,제주특별자치도,제주시,2666,2457,2214,1933,2034
245,제주특별자치도,서귀포시,972,901,767,728,684
246,제주특별자치도,북제주군,0,0,0,0,0
247,제주특별자치도,남제주군,0,0,0,0,0


In [38]:
# 행정구역 추출 후 제외

# '시군구별' column에서 대한민국 행정구역만 추출하여 새로운 DataFrame 생성
df_regions_marriagerate = df_marriage_rate[df_marriage_rate['시군구별'].isin(korea_regions)]

# 기존 DataFrame에서 행정구역을 제외한 나머지 행들만 남기기
df_others_marriagerate = df_marriage_rate[~df_marriage_rate['시군구별'].isin(korea_regions)]

In [39]:
df_others_marriagerate

Unnamed: 0.1,Unnamed: 0,시군구별,2018,2019,2020,2021,2022
1,서울특별시,종로구,676,590,565,478,486
2,서울특별시,중구,778,722,653,590,511
3,서울특별시,용산구,1400,1303,1175,995,948
4,서울특별시,성동구,2000,1758,1573,1272,1179
5,서울특별시,광진구,2164,1893,1661,1399,1317
...,...,...,...,...,...,...,...
244,제주특별자치도,제주시,2666,2457,2214,1933,2034
245,제주특별자치도,서귀포시,972,901,767,728,684
246,제주특별자치도,북제주군,0,0,0,0,0
247,제주특별자치도,남제주군,0,0,0,0,0


In [40]:
# 인덱스를 재설정
df_others_marriagerate = df_others_marriagerate.reset_index(drop=True)
df_others_marriagerate

Unnamed: 0.1,Unnamed: 0,시군구별,2018,2019,2020,2021,2022
0,서울특별시,종로구,676,590,565,478,486
1,서울특별시,중구,778,722,653,590,511
2,서울특별시,용산구,1400,1303,1175,995,948
3,서울특별시,성동구,2000,1758,1573,1272,1179
4,서울특별시,광진구,2164,1893,1661,1399,1317
...,...,...,...,...,...,...,...
227,제주특별자치도,제주시,2666,2457,2214,1933,2034
228,제주특별자치도,서귀포시,972,901,767,728,684
229,제주특별자치도,북제주군,0,0,0,0,0
230,제주특별자치도,남제주군,0,0,0,0,0


In [41]:
# 2022년도 출생률 기준으로 오름차순 정렬
df_others_marriagerate = df_others_marriagerate.sort_values(by='2022').reset_index(drop=True)

df_others_marriagerate

Unnamed: 0.1,Unnamed: 0,시군구별,2018,2019,2020,2021,2022
0,충청북도,청주시,0,0,0,0,0
1,경기도,포천군,0,0,0,0,0
2,경기도,여주군,0,0,0,0,0
3,경상남도,창원시,0,0,0,0,0
4,경상남도,마산시,0,0,0,0,0
...,...,...,...,...,...,...,...
227,국외,국외,5283,5248,3385,3163,3804
228,경기도,성남시,5228,4453,4269,3565,3865
229,경기도,고양시,4317,4440,4188,4172,3905
230,경기도,화성시,4933,4788,4547,4331,4124


In [42]:
df_others_marriagerate_filtered = df_others_marriagerate.loc[:,['Unnamed: 0','시군구별','2022']]
df_others_marriagerate_filtered

Unnamed: 0.1,Unnamed: 0,시군구별,2022
0,충청북도,청주시,0
1,경기도,포천군,0
2,경기도,여주군,0
3,경상남도,창원시,0
4,경상남도,마산시,0
...,...,...,...
227,국외,국외,3804
228,경기도,성남시,3865
229,경기도,고양시,3905
230,경기도,화성시,4124


In [43]:
# 2022년도 데이터가 0인 행을 삭제
df_others_marriagerate_filtered = df_others_marriagerate_filtered[df_others_marriagerate_filtered['2022'] != 0]
df_others_marriagerate_filtered

Unnamed: 0.1,Unnamed: 0,시군구별,2022
17,경상북도,울릉군,31
18,경상북도,군위군,33
19,경상북도,영양군,39
20,전북특별자치도,구례군,49
21,전북특별자치도,곡성군,49
...,...,...,...
227,국외,국외,3804
228,경기도,성남시,3865
229,경기도,고양시,3905
230,경기도,화성시,4124


In [46]:
df_others_marriagerate_filtered.describe()

Unnamed: 0,2022
count,215.0
mean,876.581395
std,921.027875
min,31.0
25%,167.0
50%,624.0
75%,1280.0
max,5111.0


In [48]:
# 2022년도 출생률의 사분위수 계산
q25 = df_others_marriagerate_filtered['2022'].quantile(0.25)
q50 = df_others_marriagerate_filtered['2022'].quantile(0.50)
q75 = df_others_marriagerate_filtered['2022'].quantile(0.75)

# 새로운 컬럼 추가 및 구간별 값 할당
def assign_quartile(value):
    if value <= q25:
        return 0
    elif value <= q50:
        return 1
    elif value <= q75:
        return 3
    else:
        return 4

df_others_marriagerate_filtered['2022_구간비교'] = df_others_marriagerate_filtered['2022'].apply(assign_quartile)

# 2022년도 출생률 기준으로 오름차순 정렬
df_others_marriagerate_sorted = df_others_marriagerate_filtered.sort_values(by='2022').reset_index(drop=True)

df_others_marriagerate_sorted

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_others_marriagerate_filtered['2022_구간비교'] = df_others_marriagerate_filtered['2022'].apply(assign_quartile)


Unnamed: 0.1,Unnamed: 0,시군구별,2022,2022_구간비교
0,경상북도,울릉군,31,0
1,경상북도,군위군,33,0
2,경상북도,영양군,39,0
3,전북특별자치도,구례군,49,0
4,전북특별자치도,곡성군,49,0
...,...,...,...,...
210,국외,국외,3804,4
211,경기도,성남시,3865,4
212,경기도,고양시,3905,4
213,경기도,화성시,4124,4
