# 중요 단어 찾기
## 1. 오즈비
오즈비(Odds Ratio, OR)는 두 가지 사건이 발생할 확률을 비교하는 통계적 방법으로, 주로 의학, 사회과학 등의 연구에서 두 변수 간의 관련성을 평가할 때 사용됩니다. 오즈비는 특정 사건이 발생할 확률과 발생하지 않을 확률의 비율을 나타내며, 이를 통해 두 그룹 간의 상대적 중요성을 비교할 수 있습니다.

In [1]:
# 빈도분석

from konlpy.tag import Okt
import pandas as pd

f = open("dialogue_org.txt", 'rt', encoding="utf-8")
org = f.read().split("\n") 
f.close()

f = open("dialogue_loan.txt", 'rt', encoding="utf-8")
loan = f.read().split("\n")
f.close()

okt = Okt()

box = []
for i in org:
    t = i.strip()
    for j in okt.nouns(t):
        box.append(['수사기관사칭', j])

for i in loan:
    t = i.strip()
    for j in okt.nouns(t):
        box.append(['대출사기', j])

df = pd.DataFrame(box, columns = ['Type', 'Word'])
df

Unnamed: 0,Type,Word
0,수사기관사칭,또
1,수사기관사칭,현재
2,수사기관사칭,지금
3,수사기관사칭,원
4,수사기관사칭,불법
...,...,...
195228,대출사기,통화
195229,대출사기,다시
195230,대출사기,한번
195231,대출사기,네


In [2]:
df['N'] = 1

# pivot_table : 내가 원하는 행과 열을 설정해서 집계값 확인
df2 = df.pivot_table(index = ['Type', 'Word'], values = 'N', aggfunc="sum")
df2 = df2.reset_index()
df2

Unnamed: 0,Type,Word,N
0,대출사기,가가,3
1,대출사기,가게,3
2,대출사기,가격,3
3,대출사기,가결,3
4,대출사기,가계,1
...,...,...,...
8888,수사기관사칭,흠집,1
8889,수사기관사칭,흥원,3
8890,수사기관사칭,희망,2
8891,수사기관사칭,흰색,2


In [3]:
df3 = df2.pivot_table(index = "Word", columns = "Type", 
                values = 'N', aggfunc="sum", fill_value=0)

df3['ratio_수사기관사칭'] = (df3['수사기관사칭'] + 1) / (sum(df3['대출사기']) + 1)
df3['ratio_대출사기'] = (df3['대출사기'] + 1) / (sum(df3['수사기관사칭']) + 1)
df3['odds_ratio'] = df3['ratio_수사기관사칭'] / df3['ratio_대출사기']
df3 = df3.sort_values("odds_ratio")

In [4]:
df3.head(10)

Type,대출사기,수사기관사칭,ratio_수사기관사칭,ratio_대출사기,odds_ratio
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
보증,94,0,1.8e-05,0.000673,0.027488
평점,68,0,1.8e-05,0.000489,0.037845
완납,68,0,1.8e-05,0.000489,0.037845
고객,1968,43,0.000814,0.013947,0.058353
부결,41,0,1.8e-05,0.000298,0.062174
승인,123,2,5.5e-05,0.000878,0.063177
중도,38,0,1.8e-05,0.000276,0.066957
연체,74,1,3.7e-05,0.000531,0.069635
채무,36,0,1.8e-05,0.000262,0.070576
계약서,35,0,1.8e-05,0.000255,0.072537


In [5]:
df3.tail(10)

Type,대출사기,수사기관사칭,ratio_수사기관사칭,ratio_대출사기,odds_ratio
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
검찰청,0,144,0.002682,7e-06,378.640912
검찰,0,150,0.002793,7e-06,394.308812
추적,0,212,0.00394,7e-06,556.210444
연루,0,224,0.004162,7e-06,587.546243
압수,0,227,0.004217,7e-06,595.380193
범죄,0,238,0.004421,7e-06,624.104676
동결,0,249,0.004624,7e-06,652.829159
검거,0,258,0.004791,7e-06,676.331009
발견,0,328,0.006086,7e-06,859.123173
수사,0,461,0.008546,7e-06,1206.428286
