# 법원 판결 예측
미국 대법원 판례의 사건 식별자, 당사자, 사건 내용을 담은 데이터셋   
https://dacon.io/competitions/official/236112/overview/description   
참고자료: https://dacon.io/competitions/official/236112/codeshare/8472?page=2&dtype=recent

## 1. 데이터 구조 확인

In [None]:
import pandas as pd
import numpy as np

- ID : 사건 샘플 ID
- first_party : 사건의 첫 번째 당사자
- second_party : 사건의 두 번째 당사자
- facts : 사건 내용
- first_party_winner : 첫 번째 당사자의 승소 여부 (0 : 패배, 1 : 승리)

In [None]:
# Load the train and test splits; both CSV files are read relative to the working directory.
df_train, test = (pd.read_csv(csv_path) for csv_path in ("jud_train.csv", "jud_test.csv"))
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
0,TRAIN_0000,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1
1,TRAIN_0001,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0
2,TRAIN_0002,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1
3,TRAIN_0003,Linkletter,Walker,Victor Linkletter was convicted in state court...,0
4,TRAIN_0004,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1


In [None]:
len(df_train)  # number of rows in the training frame

2478

In [None]:
df_train.ID.value_counts() # ID is unique for every row --> planned to be dropped

TRAIN_0000    1
TRAIN_1654    1
TRAIN_1647    1
TRAIN_1648    1
TRAIN_1649    1
             ..
TRAIN_0827    1
TRAIN_0828    1
TRAIN_0829    1
TRAIN_0830    1
TRAIN_2477    1
Name: ID, Length: 2478, dtype: int64

In [None]:
df_train.first_party.value_counts()  # how often each first-party name occurs

United States       154
Illinois              9
Maryland              8
Florida               8
New York              7
                   ... 
David Carpenter       1
Larry Gene Heath      1
PGA TOUR, Inc.        1
PPL Montana, LLC      1
Markman               1
Name: first_party, Length: 2110, dtype: int64

In [None]:
df_train.second_party.value_counts()  # how often each second-party name occurs

United States                        240
California                            19
United States of America              15
Illinois                              13
Federal Communications Commission     10
                                    ... 
David Boren, Governor of Oklahoma      1
Federal Bureau of Prisons et al.       1
Town of Harrison                       1
Charles Burr et al.                    1
Westview Instruments, Inc.             1
Name: second_party, Length: 1974, dtype: int64

In [None]:
df_train.first_party_winner.value_counts()  # class balance of the target (1 = first party won)

1    1649
0     829
Name: first_party_winner, dtype: int64

In [None]:
df_train.facts[0] # case description (free text)

'On June 27, 1962, Phil St. Amant, a candidate for public office, made a television speech in Baton Rouge, Louisiana.  During this speech, St. Amant accused his political opponent of being a Communist and of being involved in criminal activities with the head of the local Teamsters Union.  Finally, St. Amant implicated Herman Thompson, an East Baton Rouge deputy sheriff, in a scheme to move money between the Teamsters Union and St. Amant’s political opponent. \nThompson successfully sued St. Amant for defamation.  Louisiana’s First Circuit Court of Appeals reversed, holding that Thompson did not show St. Amant acted with “malice.”  Thompson then appealed to the Supreme Court of Louisiana.  That court held that, although public figures forfeit some of their First Amendment protection from defamation, St. Amant accused Thompson of a crime with utter disregard of whether the remarks were true.  Finally, that court held that the First Amendment protects uninhibited, robust debate, rather t

In [None]:
test.facts[0]  # first case description in the test frame

'The 1984 Bail Reform Act allowed the federal courts to detain an arrestee prior to trial if the government could prove that the individual was potentially dangerous to other people in the community. Prosecutors alleged that Salerno and another person in this case were prominent figures in the La Cosa Nostra crime family.\n'

## 2. 판례 요약하기

In [None]:
import nltk
# Fetch the Punkt sentence-tokenizer data used by sent_tokenize.
# NOTE(review): newer NLTK releases look for the "punkt_tab" resource instead -- confirm
# against the installed NLTK version.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize # 문장 단위 토큰화
import nltk

nltk.data.path.append("nltk_data") # add the relative "nltk_data" directory to NLTK's resource search path

def text_summarize(text, top_n=1):
    """Return an extractive summary built from the highest-scoring sentences of *text*.

    The text is split into sentences, each sentence is turned into a TF-IDF
    vector (fitted on the sentences of this text only), and a sentence's score
    is the sum of its TF-IDF weights. The ``top_n`` best-scoring sentences are
    joined with single spaces, in their original order.

    Args:
        text: The document to summarize.
        top_n: Number of sentences to keep (default 1).

    Returns:
        The summary string. When the text has at most ``top_n`` sentences it is
        returned sentence-joined as is; an empty text yields ``""``.

    Raises:
        ValueError: If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError("top_n must be at least 1")

    sentences = sent_tokenize(text)  # split into a list of sentences
    if len(sentences) <= top_n:
        # Nothing to rank; this also avoids fitting a vectorizer on no sentences.
        return " ".join(sentences)

    try:
        sentence_vectors = TfidfVectorizer().fit_transform(sentences)
    except ValueError:
        # Raised when no sentence contains a usable token (empty vocabulary);
        # fall back to the leading sentences instead of failing the whole apply().
        return " ".join(sentences[:top_n])

    # Summing over axis=1 yields a 2-D (n_sentences, 1) result. Flatten it to
    # 1-D so argsort ranks across sentences; sorting the 2-D result would sort
    # within each single-element row and always select index 0.
    scores = np.asarray(sentence_vectors.sum(axis=1)).ravel()
    top_sentence_indices = np.argsort(scores, kind="stable")[-top_n:]
    # Restore document order among the selected sentences.
    top_sentence_indices.sort()

    return " ".join(sentences[i] for i in top_sentence_indices)

# Replace every training case description with its summary.
df_train['facts'] = df_train['facts'].apply(text_summarize)

In [None]:
df_train['facts'][0]  # first training case after summarization

'On June 27, 1962, Phil St. Amant, a candidate for public office, made a television speech in Baton Rouge, Louisiana.'

In [None]:
# Replace every test case description with its summary.
test["facts"] = test["facts"].apply(text_summarize)

In [None]:
test.facts[0]  # first test case after summarization

'The 1984 Bail Reform Act allowed the federal courts to detain an arrestee prior to trial if the government could prove that the individual was potentially dangerous to other people in the community.'

## 3. 텍스트 추가 처리
부호 제거, 불용어 제거, 중복 단어 제거

In [None]:
import re

# 1) Strip punctuation
def alpha_num(text):
    """Remove every character that is not an ASCII letter, a digit, or whitespace.

    Whitespace is kept on purpose: the later steps split the text on whitespace,
    so deleting it would fuse the whole text into a single token and turn
    stopword removal and duplicate removal into no-ops.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string with the original whitespace intact.
    """
    return re.sub(r"[^A-Za-z0-9\s]", "", text)

# 2) Remove stopwords
stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as",
             "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could",
             "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has",
             "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him",
             "himself",
             "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its",
             "itself",
             "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought",
             "our", "ours",
             "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some",
             "such", "than", "that",
             "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they",
             "they'd", "they'll",
             "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we",
             "we'd", "we'll",
             "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who",
             "who's", "whom",
             "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself",
             "yourselves"]

# Hash-based copy of the list so each membership test is O(1) instead of a list scan.
# It is a snapshot taken here: rebuild it if `stopwords` is modified afterwards.
_STOPWORD_SET = frozenset(stopwords)


def remove_stopwords(text):
    """Drop every whitespace-separated word of *text* that is a stopword.

    Matching is case-insensitive, while the surviving words keep their original
    casing. Runs of whitespace collapse to single spaces in the result.

    Args:
        text: The string to filter.

    Returns:
        The remaining words joined by single spaces (``""`` if none remain).
    """
    return " ".join(word for word in text.split() if word.lower() not in _STOPWORD_SET)


# 3) Remove repeated words
def delete_duplication(text):
    """Keep only the first occurrence of each whitespace-separated word in *text*.

    Comparison is exact (case-sensitive) and the order of first occurrences is
    preserved. Runs of whitespace collapse to single spaces in the result.

    Args:
        text: The string to de-duplicate.

    Returns:
        The unique words joined by single spaces.
    """
    # dict.fromkeys keeps insertion order and drops repeats in one linear pass,
    # replacing a membership scan of a growing list for every word.
    return " ".join(dict.fromkeys(text.split()))

In [None]:
# Lower-case the summaries, then strip punctuation, drop stopwords and
# remove repeated words -- same pipeline for the train and test frames.
df_train["facts"] = (
    df_train["facts"]
    .str.lower()
    .apply(alpha_num)
    .apply(remove_stopwords)
    .apply(delete_duplication)
)
test["facts"] = (
    test["facts"]
    .str.lower()
    .apply(alpha_num)
    .apply(remove_stopwords)
    .apply(delete_duplication)
)

In [None]:
df_train.facts[0]  # first training case after text cleaning

'onjune271962philstamantacandidateforpublicofficemadeatelevisionspeechinbatonrougelouisiana'

In [None]:
test.facts[0]  # first test case after text cleaning

'the1984bailreformactallowedthefederalcourtstodetainanarresteepriortotrialifthegovernmentcouldprovethattheindividualwaspotentiallydangeroustootherpeopleinthecommunity'

## 4. 모델 설계 및 학습

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor # Multi-Layer Perceptron

# Single shared vectorizer: get_vector fits it when called in train mode and
# only transforms with it otherwise.
vectorizer = TfidfVectorizer()

def get_vector(vectorizer, df, train_mode):
    """Build the dense feature matrix for the rows of *df*.

    Args:
        vectorizer: Object exposing ``fit_transform`` and ``transform``; both
            must return something with a ``toarray()`` method.
        df: Table providing the "facts", "first_party" and "second_party" columns.
        train_mode: When true the vectorizer is fitted on the "facts" column;
            otherwise the facts are only transformed with the fitted vectorizer.

    Returns:
        A dense array with the first-party, second-party and facts vectors
        concatenated column-wise, in that order.
    """
    # The facts go first: in train mode this call is what fits the vectorizer,
    # and the party columns below are encoded with that same fitted vocabulary.
    encode_facts = vectorizer.fit_transform if train_mode else vectorizer.transform
    facts_matrix = encode_facts(df["facts"])

    party_matrices = [
        vectorizer.transform(df[column]) for column in ("first_party", "second_party")
    ]

    # Column layout of the result: first party | second party | facts.
    dense_blocks = [matrix.toarray() for matrix in party_matrices]
    dense_blocks.append(facts_matrix.toarray())
    return np.concatenate(dense_blocks, axis=1)

# Vectorize the training frame (this call fits the shared vectorizer) and take the labels.
X = get_vector(vectorizer, df_train, True)  # training features
y = df_train["first_party_winner"]          # target

# Hold out 20% for validation; stratify keeps the label ratio equal in both splits.
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# NOTE(review): MLPRegressor is a regressor; its continuous outputs are cut at 0.5
# in later cells to obtain 0/1 labels. No random_state is passed here, so the fit
# is not seeded -- confirm whether reproducibility is needed.
model = MLPRegressor(hidden_layer_sizes=300)  # build the model
model.fit(X_train, y_train)                   # train

In [None]:
from sklearn.metrics import accuracy_score

# Raw (continuous) model outputs for the validation split.
y_pred = model.predict(X_val)

In [None]:
# Turn the continuous outputs into hard labels in place, using a 0.5 cutoff.
above_cutoff = y_pred >= 0.5
below_cutoff = y_pred < 0.5
y_pred[above_cutoff] = 1
y_pred[below_cutoff] = 0
y_pred

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
       1., 1., 1., 1., 1.

In [None]:
y_pred.sum()  # number of validation rows predicted as class 1

494.0

In [None]:
# accuracy_score takes (y_true, y_pred); pass the ground truth first so the call
# matches the documented signature. The value itself is unaffected by the order.
print("Accuracy:", accuracy_score(y_val, y_pred))

Accuracy: 0.6653225806451613


In [None]:
# Encode the test frame with the already-fitted vectorizer and score it.
X_test = get_vector(vectorizer, test, False)
pred = model.predict(X_test)

# Fill the sample submission with hard 0/1 labels, using the same 0.5 cutoff.
submit = pd.read_csv("jud_sample_submission.csv")
submit["first_party_winner"] = [1 if score >= 0.5 else 0 for score in pred]

submit.to_csv("submit.csv", index=False)

In [None]:
submit.first_party_winner.value_counts()  # distribution of predicted labels in the submission

1    1238
0       2
Name: first_party_winner, dtype: int64