# Topic Modeling of Case Law on Unemployment using LDA
조희진, 맹주리, 우정연
* Background: Rise of interest in AI in the legal field
* Objective: To identify unfair dismissal case law through Latent Dirichlet Allocation (LDA) analysis
* Process: data preprocessing (drop unwanted columns, define stop words) -> compare stemming methods -> perform LDA
* Conclusion
    * Divided into 10 topics
    * Provided new insights into understanding the grouped cases
* Takeaways
    * Tokenization needs to better account for the characteristics of legal text
    * May need to perform more sophisticated preprocessing so that unimportant information is not included in the data

In [2]:
import pandas as pd
import numpy as np
import csv

## Data

In [3]:
# Case law data from Korea, preprocessed
# Load the case-law table and drop the metadata columns in one expression;
# whatever columns are not listed here are kept.
data = pd.read_csv("판례본문_해고_수정.csv").drop(
    columns=[
        '판례정보일련번호',  # case-info serial number
        '사건번호',  # case number
        '선고일자',  # decision date
        '법원종류코드',  # court-type code
        '판시사항',  # holdings
        '판결요지',  # summary of the judgment
        '참조조문',  # referenced statutes
        '참조판례',  # referenced precedents
        '사건명',  # case name
    ]
)
# Bare expression: displays the resulting DataFrame as the cell output.
data

Unnamed: 0,판례내용
0,"【원고, 상고인】 원고 1 외 1인 (소송대리인 법무법인 여는 담당변호사 정기호 외..."
1,"【원고, 상고인】 원고 (소송대리인 법무법인 어울림 담당변호사 구은미 외 2인)【피..."
2,"【원고, 상고인】 원고 1 외 1인 (소송대리인 법무법인 여는 담당변호사 탁선호 외..."
3,"【원고, 피상고인 겸 상고인】 원고 (소송대리인 법무법인 세진 담당변호사 권구철 외..."
4,【원고】 원고 (소송대리인 법무법인 서린 담당변호사 조석영 외 1인)【피고】 수원지...
...,...
730,"【원고, 상고인 겸 피상고인】 왕호식 외 31인【피고, 상고인 겸 피상고인】 ..."
731,"【원고, 상고인】 【피고, 피상고인】 유건묵【원심판결】 \n제1심 서울민사지법,..."
732,"【원고, 피상고인】 유건목 외 1인【피고, 상고인】 한국전력주식회사【원심판결..."
733,"【원고, 상고인】 【피고, 피상고인】 【원심판결】 \n제1심 서울지방, 제2심 \n..."


In [4]:
# Save the reduced table without the DataFrame index; main() below reads
# this file as the input of the tokenizing step.
data.to_csv("판례내용.csv", index=False)

We compared several tagger classes (wrappers) from the KoNLPy package for tokenizing the text.\
Only the Twitter tokenizing code is included in this file; the Kkma and Komoran results are loaded from files produced separately (`pos_kkma.txt`, `pos_komoran.txt`).

## Twitter 

In [28]:
import ujson
from konlpy.tag import Twitter

def split_sentences(text):
    """Split *text* into sentences with a simple punctuation heuristic.

    Leading and trailing whitespace of the whole text is removed, then a
    line break replaces the space that directly follows each ``.``, ``?``
    or ``!``. The result is split on line boundaries, so line breaks that
    were already present in *text* also separate sentences.

    Returns a list of strings; an empty text yields an empty list.
    """
    broken = text.strip()
    for mark in (".", "?", "!"):
        broken = broken.replace(mark + " ", mark + "\n")
    return broken.splitlines()

def get_pos(analyzer, text):
    """Return the POS-tagging result of every sentence in *text*.

    *text* is cut into sentences with ``split_sentences`` and each sentence
    is passed to ``analyzer.pos``. The per-sentence results are returned as
    a list, in sentence order.
    """
    return [analyzer.pos(sentence) for sentence in split_sentences(text)]

def read_text(input_file_name):
    """Read the case-law text column from a CSV file.

    The first row is treated as the header. The ``판례내용`` column is
    located by its header name, so a leading extra column (for example a
    saved DataFrame index) does not shift the text into the wrong field.
    If the header does not contain the name, the column at the same
    position as the key is used, which matches the previous positional
    behaviour.

    Args:
        input_file_name: Path of a UTF-8 encoded CSV file.

    Returns:
        A list with one dict per data row, mapping ``"판례내용"`` to the
        cell text. A row that is too short to contain the column (such as
        a blank line) yields an empty dict. An empty file yields ``[]``.
    """
    key_names = ['판례내용']
    data = []

    with open(input_file_name, "r", encoding="utf-8", newline="") as input_file:
        reader = csv.reader(input_file)

        header = next(reader, None)
        if header is None:
            # Completely empty file: nothing to read.
            return data

        # Resolve every key to a column index once, preferring the header name.
        columns = [
            header.index(key_name) if key_name in header else position
            for position, key_name in enumerate(key_names)
        ]

        for row in reader:
            reviews = {
                key_name: row[column]
                for key_name, column in zip(key_names, columns)
                if column < len(row)
            }
            data.append(reviews)

    return data

def pos_review(data):
    """Attach POS-tagging results to every record in *data*.

    For each dict in *data*, the text under ``"판례내용"`` is tagged
    sentence by sentence with ``get_pos`` and the result is stored in the
    same dict under ``"판례내용_pos"``.

    Args:
        data: Iterable of dicts that each contain a ``"판례내용"`` entry.

    Returns:
        A list holding the same dict objects that were passed in (they are
        updated in place, not copied), in input order.

    Raises:
        KeyError: If a record has no ``"판례내용"`` entry.
    """
    # KoNLPy renamed the Twitter tagger to Okt in v0.4.5 and warns when the
    # old name is instantiated. Prefer Okt and fall back to Twitter only on
    # installations that predate the rename.
    try:
        from konlpy.tag import Okt as Tagger
    except ImportError:
        from konlpy.tag import Twitter as Tagger

    # One tagger instance is shared by all records.
    tagger = Tagger()
    data_pos = []

    for reviews in data:
        reviews["판례내용_pos"] = get_pos(tagger, reviews["판례내용"])
        data_pos.append(reviews)

    return data_pos

def write_pos_review(output_file_name, data_pos):
    """Write *data_pos* to *output_file_name* as one JSON object per line.

    The file is opened for writing as UTF-8 (an existing file is
    overwritten). Records are serialized with ``ensure_ascii=False``, so
    non-ASCII text is written as-is rather than as escape sequences.
    """
    with open(output_file_name, "w", encoding="utf-8") as output_file:
        for record in data_pos:
            serialized = ujson.dumps(record, ensure_ascii=False)
            output_file.write(serialized + "\n")
            

def main():
    """Run the tokenizing pipeline: read the CSV, POS-tag it, write JSON lines."""
    source_csv = r"판례내용.csv"
    tagged_output = r"twitter_pos_판례내용.txt"

    # read -> tag -> write, each stage feeding the next.
    write_pos_review(tagged_output, pos_review(read_text(source_csv)))
            
# Run the tokenizing pipeline as soon as this cell is executed.
main()

  warn('"Twitter" has changed to "Okt" since KoNLPy v0.4.5.')


In [44]:
def read_documents(input_file_name):
    """Load POS-tagged documents and return their filtered token lists.

    Each non-blank line of *input_file_name* is parsed as a JSON object.
    Its ``POS_KEY`` entry is iterated as a list of sentences, each sentence
    being a list of ``(word, pos)`` pairs. A word is kept when its tag is in
    ``FEATURE_POS`` and the word is not a stop word.

    ``POS_KEY`` and ``FEATURE_POS`` are module-level names looked up when
    the function is called, so they must be set before calling it.

    Args:
        input_file_name: Path of a UTF-8 file with one JSON object per line.

    Returns:
        A list of word lists, one per non-blank input line, in file order.
    """
    # Built once per call instead of once per input line. A frozenset gives
    # constant-time membership tests, and the repeated entries in the
    # literal simply collapse.
    stop_words = frozenset([
        '징계','원고', '피고', '상고인', '피상고인', '다음', '부터', '참가인', '소외', '따르','등','회사', '상고',
        '위원회',
        '해고','근로자', '이','판결','하','있','것','위','선고','제','대하','원심','없','같','수','의하','관하',
        '항','받','이', '있', '하', '것', '들', '그', '되', '수',
        '이', '보', '않', '없', '나', '사람', '주', '아니',
        '등', '같', '우리', '때', '년', '가', '한', '지', '대하',
        '오', '말', '일', '그렇', '위하', '및',
        '원고', '상고인', '피고', '피상고인', '제', '위', '항', '바', '관', '점',
        '불', '중', '볼', '후', '함', '명', '의', '각', '호', '정', '자', '경', '고', '것이므', '해', '금',
        '부터', '다음', '뿐','조','증인','증', '재심', '호증','제법','단체','보조참가인','항소','위원', '공사','소론',
        '심','임자','소정','아니하','을','주장','경우','사실','당원','고인','피상','서울','대리인','항소인','문',
        '고등','변호사','유','주문','원','위원회',
        '대법원','외','재판장','이유','법관','기각','대법관','김','가집행','부담','비용','판사','소론','패소자',
        '박','철','주심','한편','위원','외인','소',
        '측','해당','참조','카','다','누','부장','정자','동인','화학','합계','지법','급','믿','법','달',
        '동부','구','금','소외','을','심','보조참가','원심',
    ])

    documents = []

    with open(input_file_name, "r", encoding="utf-8") as input_file:
        for line in input_file:
            # A blank line (e.g. at the end of the file) carries no record;
            # skip it rather than let the JSON parser fail on it.
            if not line.strip():
                continue

            text_pos = ujson.loads(line)[POS_KEY]

            words = []
            for sent_pos in text_pos:
                for word, pos in sent_pos:
                    if pos in FEATURE_POS and word not in stop_words:
                        words.append(word)

            documents.append(words)

    return documents

In [30]:
# read_documents looks up these two globals at call time:
# FEATURE_POS lists the POS tags to keep, POS_KEY is the record key that
# pos_review used to store the tagging result.
FEATURE_POS = ["Noun"]
POS_KEY = "판례내용_pos"

# File written by main() above.
input_file_name = r"twitter_pos_판례내용.txt"
documents = read_documents(input_file_name)

# Number of token lists (one per input line) that were loaded.
print(len(documents))

735


In [31]:
from gensim import corpora
from gensim import models

# Build the token <-> integer-id mapping from the token lists.
dictionary = corpora.Dictionary(documents)

# Size of the dictionary (not referenced again in the visible cells).
n_items = len(dictionary)
# Convert every token list to a bag-of-words vector.
doc = [dictionary.doc2bow(text) for text in documents] 
# Fit a TF-IDF model on the bag-of-words vectors and apply it to them.
tfidf = models.TfidfModel(doc)
corpus = tfidf[doc]

In [32]:
from gensim import corpora
from gensim import models

# NOTE(review): this cell repeats the statements of the preceding cell
# verbatim; it rebuilds the same objects from the same `documents`.
dictionary = corpora.Dictionary(documents)

n_items = len(dictionary)
doc = [dictionary.doc2bow(text) for text in documents] 
tfidf = models.TfidfModel(doc)
corpus = tfidf[doc]

In [33]:
# Train a 10-topic LDA model. Note the input is the TF-IDF-weighted corpus
# built above, not the raw bag-of-words counts.
# random_state=0 fixes the seed; eval_every=None turns off the periodic
# perplexity evaluation (see the gensim LdaModel documentation).
NUM_TOPICS = 10
lda_model = models.ldamodel.LdaModel(corpus, num_topics=NUM_TOPICS, id2word=dictionary,
                                     passes=2, iterations=10, chunksize=350, eval_every=None, random_state=0)

In [34]:
import pyLDAvis
import pyLDAvis.gensim_models

# Prepare the pyLDAvis data from the model trained on the Twitter tokens
# and save the visualization as an HTML file.
vis_data = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)

pyLDAvis.save_html(vis_data, "tfidf_twitter_10.html")     

## Kkma

In [35]:
# POS tags to keep for the Kkma output; the tag names differ from the
# single "Noun" tag used for the Twitter output.
FEATURE_POS = ["NNG","NNP","NNB", "VV", "VA", "VXV", "VXA", "VCP", "VCN","MM"]
# Record key used in pos_kkma.txt, which is produced outside this file.
POS_KEY = "판례_pos"

input_file_name = r"pos_kkma.txt"
documents = read_documents(input_file_name)

# NOTE(review): the recorded output here is 9250, versus 735 for the other
# two taggers — confirm whether pos_kkma.txt holds one record per case.
print(len(documents))

9250


In [36]:
from gensim import corpora
from gensim import models

# Rebuild dictionary, bag-of-words vectors and TF-IDF corpus for the Kkma
# tokens; this overwrites the globals produced for the previous tagger.
dictionary = corpora.Dictionary(documents)

n_items = len(dictionary)
doc = [dictionary.doc2bow(text) for text in documents]
tfidf = models.TfidfModel(doc)
corpus = tfidf[doc]

In [37]:
# Same LDA settings as the Twitter run (10 topics, seed 0), now trained on
# the Kkma TF-IDF corpus; replaces the previous `lda_model`.
NUM_TOPICS = 10
lda_model = models.ldamodel.LdaModel(corpus, num_topics=NUM_TOPICS, id2word=dictionary,
                                     passes=2, iterations=10, chunksize=350, eval_every=None, random_state=0)

In [38]:
from pprint import pprint

# Show each topic as a weighted list of its top words.
pprint(lda_model.print_topics())
# Apply the model to the corpus to obtain per-document topic distributions.
# NOTE(review): doc_lda is not used again in the visible cells.
doc_lda = lda_model[corpus]

[(0,
  '0.013*"의원" + 0.009*"고함" + 0.008*"권고" + 0.007*"목" + 0.006*"당부" + 0.006*"가운데" '
  '+ 0.005*"제재" + 0.005*"이름" + 0.005*"끝" + 0.004*"불성실"'),
 (1,
  '0.006*"불참" + 0.004*"공무원" + 0.003*"부인" + 0.002*"공소" + 0.001*"명예퇴직" + '
  '0.001*"울산" + 0.001*"중지" + 0.000*"국가" + 0.000*"정당법" + 0.000*"건설"'),
 (2,
  '0.019*"논지" + 0.014*"사유" + 0.013*"판단" + 0.013*"처분" + 0.011*"위법" + 0.011*"인정" '
  '+ 0.009*"진술" + 0.008*"법리" + 0.008*"오해" + 0.008*"기록"'),
 (3,
  '0.012*"해제" + 0.001*"비리" + 0.000*"감경" + 0.000*"직위" + 0.000*"라" + 0.000*"선우" '
  '+ 0.000*"본부" + 0.000*"평점" + 0.000*"최장" + 0.000*"순응"'),
 (4,
  '0.037*"일치" + 0.036*"관여" + 0.035*"의견" + 0.016*"우동" + 0.015*"윤영" + '
  '0.013*"김용준" + 0.012*"김상원" + 0.010*"이회" + 0.010*"윤" + 0.009*"배석"'),
 (5,
  '0.013*"위의" + 0.008*"임시" + 0.008*"사립학교법" + 0.008*"유예" + 0.008*"상해" + '
  '0.006*"조처" + 0.006*"총회" + 0.006*"가담" + 0.006*"재판" + 0.006*"징역"'),
 (6,
  '0.017*"배포" + 0.008*"약식" + 0.007*"명예" + 0.006*"인격" + 0.006*"위조" + 0.006*"대리" '
  '+ 0.005*"덕" + 0.004*"흔적" + 0.004*"훼손" + 

In [26]:
# Save the pyLDAvis visualization of the Kkma-based model as HTML.
vis_data = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)

pyLDAvis.save_html(vis_data, "tfidf_kkma_10.html")

## Komoran

In [39]:
# POS tags to keep for the Komoran output.
FEATURE_POS = ["NNG", "NNP", "VV", "VA","VX","MM","NP","NR"]
# Record key used in pos_komoran.txt, which is produced outside this file.
POS_KEY = "content_pos"

input_file_name = r"pos_komoran.txt"
documents = read_documents(input_file_name)

# Number of token lists loaded from the Komoran file.
print(len(documents))

735


In [40]:
from gensim import corpora
from gensim import models

# Rebuild dictionary, bag-of-words vectors and TF-IDF corpus for the
# Komoran tokens; this overwrites the globals from the Kkma run.
dictionary = corpora.Dictionary(documents)

n_items = len(dictionary)
doc = [dictionary.doc2bow(text) for text in documents]
tfidf = models.TfidfModel(doc)
corpus = tfidf[doc]

In [41]:
# Same LDA settings as the earlier runs (10 topics, seed 0), trained on the
# Komoran TF-IDF corpus; replaces the previous `lda_model`.
NUM_TOPICS = 10
lda_model = models.ldamodel.LdaModel(corpus, num_topics=NUM_TOPICS, id2word=dictionary,
                                     passes=2, iterations=10, chunksize=350, eval_every=None, random_state=0)

In [42]:
from pprint import pprint

# Show each topic of the Komoran-based model as a weighted word list.
pprint(lda_model.print_topics())
# Per-document topic distributions for the Komoran corpus.
# NOTE(review): doc_lda is not used again in the visible cells.
doc_lda = lda_model[corpus]

[(0,
  '0.001*"동해" + 0.000*"체육" + 0.000*"선원" + 0.000*"교정" + 0.000*"시력" + '
  '0.000*"지도자" + 0.000*"갱신" + 0.000*"좌" + 0.000*"계약" + 0.000*"기간"'),
 (1,
  '0.006*"협회" + 0.005*"보건" + 0.004*"반려" + 0.002*"중학교" + 0.002*"가처분" + '
  '0.002*"병원" + 0.002*"주임" + 0.002*"관광버스" + 0.001*"탈퇴" + 0.001*"배포"'),
 (2,
  '0.001*"교회" + 0.001*"빌딩" + 0.001*"승계" + 0.000*"해산" + 0.000*"삼미" + 0.000*"호텔" '
  '+ 0.000*"조교" + 0.000*"식품위생" + 0.000*"학사" + 0.000*"연구원"'),
 (3,
  '0.000*"운항" + 0.000*"반출" + 0.000*"주민" + 0.000*"자원봉사" + 0.000*"원료" + '
  '0.000*"덤프" + 0.000*"검문" + 0.000*"완성" + 0.000*"이현우" + 0.000*"기후"'),
 (4,
  '0.007*"경력" + 0.005*"이력서" + 0.005*"사칭" + 0.004*"교통사고" + 0.004*"입사" + '
  '0.004*"운전사" + 0.004*"사원" + 0.003*"운전" + 0.003*"사고" + 0.003*"학력"'),
 (5,
  '0.010*"신청인" + 0.001*"승계" + 0.001*"양수" + 0.001*"양도" + 0.001*"귀국" + '
  '0.001*"영업양도" + 0.001*"기숙사" + 0.001*"가처분" + 0.001*"전보발령" + 0.001*"감천"'),
 (6,
  '0.001*"기대권" + 0.000*"갱신" + 0.000*"가공" + 0.000*"재계약" + 0.000*"전적" + '
  '0.000*"산림조합" + 0.000*"단속" + 0.000*"

We selected Komoran because its tokenizing results are more detailed.

In [43]:
# Save the pyLDAvis visualization of the Komoran-based model as HTML.
vis_data = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)

pyLDAvis.save_html(vis_data, "tfidf_komoran_10.html")