### Задание

Попробуйте поработать с датасетом юридических текстов. В датасете всего две важных колонки признаков: заголовок дела и его текст, а целевая переменная - case_outcome (мультиклассовая классификация). 

В базовом варианте можно оставить только текст дела, если хотите поинтереснее - можно попробовать распарсить case_title, добыв оттуда дополнительные признаки. 

https://www.kaggle.com/datasets/amohankumar/legal-text-classification-dataset

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer #помогает одновременно обрабатывать и текствовые и нетекстовые признаки
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn import metrics

from sklearn.metrics import classification_report

from nltk.tokenize import word_tokenize
from nltk import ngrams
from string import punctuation
from nltk.corpus import stopwords

from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt
import seaborn as sns

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
data = pd.read_csv(r'D:\\datasets\\legal_text_classification.csv')
data.head()

Unnamed: 0,case_id,case_outcome,case_title,case_text
0,Case1,cited,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...,Ordinarily that discretion will be exercised s...
1,Case2,cited,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...,The general principles governing the exercise ...
2,Case3,cited,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...,Ordinarily that discretion will be exercised s...
3,Case4,cited,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...,The general principles governing the exercise ...
4,Case5,cited,Dr Martens Australia Pty Ltd v Figgins Holding...,The preceding general principles inform the ex...


In [3]:
data.drop(['case_id'], inplace=True, axis=1)

In [8]:
data.sample(10)

Unnamed: 0,case_outcome,case_title,case_text
4479,referred to,Damir Travica v The Government of Croatia [200...,Court also notes the decision of the High Cour...
21803,discussed,Wati v Minister for Immigration and Multicultu...,The issue was touched on in Wati v Minister fo...
5964,cited,Dart Industries Inc v D&eacute;cor Corporation...,In Dart Industries Inc v D&eacute;cor Corporat...
3392,considered,"The ""Koursk"" [1924] P 140",As with the decision in Credit Lyonnais these ...
23649,referred to,Briginshaw v Briginshaw [1938] HCA 34 ; (1938)...,The Minister's argument conflated the weight w...
24360,discussed,Pavey &amp; Matthews Pty Ltd v Paul [1987] HCA...,In Pavey &amp; Matthews Pty Ltd v Paul [1987] ...
1213,cited,Minister for Immigration &amp; Multicultural A...,"First, in my view Mr Bickford correctly stated..."
20905,discussed,Steele v Deputy Commissioner of Taxation (1999...,There was no issue about s 8-1(1). It should b...
9581,distinguished,McKinnon v Commonwealth of Australia [1998] FC...,The Tribunal went on to say that that view was...
3834,cited,Hussain v Minister for Foreign Affairs [2008] ...,Where confidential but prejudicial information...


In [9]:
data.case_outcome.unique()

array(['cited', 'applied', 'followed', 'referred to', 'related',
       'considered', 'discussed', 'distinguished', 'affirmed', 'approved'],
      dtype=object)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24985 entries, 0 to 24984
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   case_outcome  24985 non-null  object
 1   case_title    24985 non-null  object
 2   case_text     24809 non-null  object
dtypes: object(3)
memory usage: 585.7+ KB


In [5]:
data.drop('case_title', axis=1, inplace=True) #сначала решила попробовать только с текстами

KeyError: "['case_title'] not found in axis"

In [11]:
missing_values = data.isnull().sum().sort_values(ascending = False)
missing_values = missing_values[missing_values > 0] / data.shape[0]
print(f'{missing_values * 100} %')

case_text    0.704423
dtype: float64 %


In [6]:
data.dropna(inplace=True) #удалила строки, где текст отсутствовал, т.к. без него признаков не было вообще

In [11]:
data.head()

Unnamed: 0,case_outcome,case_text
0,cited,Ordinarily that discretion will be exercised s...
1,cited,The general principles governing the exercise ...
2,cited,Ordinarily that discretion will be exercised s...
3,cited,The general principles governing the exercise ...
4,cited,The preceding general principles inform the ex...


In [7]:
y = data['case_outcome']
X = data.drop('case_outcome', axis=1)

In [13]:
X.shape, y.shape

((24809, 1), (24809,))

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [15]:
type(X_train)

pandas.core.frame.DataFrame

In [9]:

noise = stopwords.words('english') + list(punctuation)

In [10]:
vec = CountVectorizer(ngram_range=(2, 3), tokenizer=word_tokenize, stop_words=noise)
bow = vec.fit_transform(X_train['case_text'])



In [28]:
list(vec.vocabulary_.items())[:10]

[('court', 22702),
 ("'s", 2140),
 ('jurisdiction', 33803),
 ('hear', 30653),
 ('determine', 24236),
 ('native', 38742),
 ('title', 51626),
 ('applications', 16638),
 ('specifically', 48682),
 ('provided', 43466)]

In [17]:
#CountVectorizer(ngram_range=(2, 3)
clf = LogisticRegression(solver='liblinear', random_state=42)
clf.fit(bow, y_train)
pred = clf.predict(vec.transform(X_test['case_text']))
print(classification_report(pred, y_test))



               precision    recall  f1-score   support

     affirmed       0.15      0.50      0.23         8
      applied       0.26      0.41      0.32       382
     approved       0.20      0.40      0.27        15
        cited       0.88      0.64      0.74      4234
   considered       0.25      0.48      0.33       218
    discussed       0.22      0.53      0.31       116
distinguished       0.26      0.58      0.36        64
     followed       0.37      0.57      0.45       361
  referred to       0.47      0.63      0.53       791
      related       0.38      0.71      0.50        14

     accuracy                           0.61      6203
    macro avg       0.34      0.54      0.40      6203
 weighted avg       0.71      0.61      0.64      6203



In [18]:
vec = CountVectorizer(ngram_range=(3, 4), tokenizer=word_tokenize, stop_words=noise)
bow = vec.fit_transform(X_train['case_text'])
clf = LogisticRegression(solver='liblinear', random_state=42)
clf.fit(bow, y_train)
pred = clf.predict(vec.transform(X_test['case_text']))
print(classification_report(pred, y_test))



               precision    recall  f1-score   support

     affirmed       0.11      0.43      0.18         7
      applied       0.22      0.46      0.29       281
     approved       0.20      0.38      0.26        16
        cited       0.90      0.61      0.73      4548
   considered       0.20      0.48      0.29       181
    discussed       0.21      0.57      0.31       106
distinguished       0.25      0.59      0.35        61
     followed       0.35      0.60      0.45       323
  referred to       0.42      0.67      0.51       670
      related       0.31      0.80      0.44        10

     accuracy                           0.60      6203
    macro avg       0.32      0.56      0.38      6203
 weighted avg       0.75      0.60      0.64      6203



In [39]:
#CountVectorizer(ngram_range=(1, 2))
pred = clf.predict(vec.transform(X_test['case_text']))
print(classification_report(pred, y_test))

               precision    recall  f1-score   support

     affirmed       0.30      0.57      0.39        14
      applied       0.27      0.37      0.31       447
     approved       0.19      0.38      0.26        13
        cited       0.82      0.68      0.74      3616
   considered       0.30      0.44      0.35       291
    discussed       0.29      0.43      0.34       170
distinguished       0.22      0.51      0.31        68
     followed       0.35      0.51      0.41       381
  referred to       0.59      0.55      0.57      1190
      related       0.27      0.62      0.37        13

     accuracy                           0.60      6203
    macro avg       0.36      0.51      0.41      6203
 weighted avg       0.66      0.60      0.62      6203



In [44]:
vec = TfidfVectorizer(ngram_range=(1, 2)) # при ngram_range=(2, 3) результат был тот же
bow = vec.fit_transform(X_train['case_text'])
clf = LogisticRegression(solver='liblinear', random_state=42)
clf.fit(bow, y_train)
pred = clf.predict(vec.transform(X_test['case_text']))
print(classification_report(pred, y_test))

               precision    recall  f1-score   support

     affirmed       0.07      0.67      0.13         3
      applied       0.05      0.35      0.09        97
     approved       0.00      0.00      0.00         0
        cited       0.98      0.53      0.69      5527
   considered       0.04      0.44      0.08        41
    discussed       0.06      0.56      0.11        27
distinguished       0.01      1.00      0.02         2
     followed       0.10      0.66      0.18        87
  referred to       0.28      0.74      0.40       419
      related       0.00      0.00      0.00         0

     accuracy                           0.54      6203
    macro avg       0.16      0.49      0.17      6203
 weighted avg       0.89      0.54      0.64      6203



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Лучший результат получился при CountVectorizer(ngram_range=(2, 3). У BOW здесь впринципе результаты были лучше, чем у TF-IDF.
Не стала пробовать разные токенизаторы, по идее тексты официальные, необычных знаков препинания там быть не должно.

In [38]:
from gensim.models import Doc2Vec
import gensim
from gensim.models.doc2vec import TaggedDocument
from sklearn import utils
from tqdm import tqdm

In [49]:
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')

def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2: 
                continue
            tokens.append(word.lower())
    return tokens

train, test = train_test_split(data[['case_text', 'case_outcome']], test_size=0.3, random_state=42)


train_tagged = train.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['case_text']), tags=[r.case_outcome]), axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['case_text']), tags=[r.case_outcome]), axis=1)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Екатерина\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [50]:
import multiprocessing 
cores = multiprocessing.cpu_count()

In [51]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample=0, workers=cores)
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 17366/17366 [00:00<00:00, 730999.81it/s]


In [52]:
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 17366/17366 [00:00<00:00, 2361812.04it/s]
100%|██████████| 17366/17366 [00:00<00:00, 1111356.17it/s]
100%|██████████| 17366/17366 [00:00<00:00, 1111254.44it/s]
100%|██████████| 17366/17366 [00:00<00:00, 7260594.42it/s]
100%|██████████| 17366/17366 [00:00<00:00, 1054267.44it/s]
100%|██████████| 17366/17366 [00:00<00:00, 1047731.35it/s]
100%|██████████| 17366/17366 [00:00<00:00, 1102374.36it/s]
100%|██████████| 17366/17366 [00:00<00:00, 1052241.82it/s]
100%|██████████| 17366/17366 [00:00<00:00, 2590174.01it/s]
100%|██████████| 17366/17366 [00:00<00:00, 1913574.07it/s]
100%|██████████| 17366/17366 [00:00<?, ?it/s]
100%|██████████| 17366/17366 [00:00<00:00, 1056346.83it/s]
100%|██████████| 17366/17366 [00:00<00:00, 1957755.23it/s]
100%|██████████| 17366/17366 [00:00<?, ?it/s]
100%|██████████| 17366/17366 [00:00<00:00, 1024477.25it/s]
100%|██████████| 17366/17366 [00:00<00:00, 1051345.73it/s]
100%|██████████| 17366/17366 [00:00<?, ?it/s]
100%|██████████| 17366/17366 [00:00<

In [53]:
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words)) for doc in sents])
    return targets, regressors

In [54]:
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)
logreg = LogisticRegression(solver='liblinear', n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
from sklearn.metrics import accuracy_score, f1_score
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.4852881902458686
Testing F1 score: 0.32138532051644203


In [56]:
rf = RandomForestClassifier(n_jobs=1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted'))) 

Testing accuracy 0.48649738008867394
Testing F1 score: 0.3213500925959548


In [57]:
bagging = BaggingClassifier(n_jobs=1)
bagging.fit(X_train, y_train)
y_pred = bagging.predict(X_test)
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.4397420395002015
Testing F1 score: 0.3329450181681086


In [59]:
svc = SVC()
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted'))) #лучший результат для Doc2Vec, но он ниже, чем у BOW и TF-IDF

Testing accuracy 0.4882439876393927
Testing F1 score: 0.320353642878316


Хотела попробовать обучить с case title, но у меня ничего не получилось с Column Transformer, пробовала и взять функцию из семинара и вытаскивать все шаги из функций, но, видимо, все равно где-то запуталась(

In [30]:
data = pd.read_csv(r'D:\\datasets\\legal_text_classification.csv')
data.drop(['case_id'], inplace=True, axis=1)
data.head()

Unnamed: 0,case_outcome,case_title,case_text
0,cited,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...,Ordinarily that discretion will be exercised s...
1,cited,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...,The general principles governing the exercise ...
2,cited,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...,Ordinarily that discretion will be exercised s...
3,cited,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...,The general principles governing the exercise ...
4,cited,Dr Martens Australia Pty Ltd v Figgins Holding...,The preceding general principles inform the ex...


In [14]:
data = data.fillna('--') #чтобы не удалять строки, где нет текста, но есть заголовок

In [7]:
missing_values = data.isnull().sum().sort_values(ascending = False)
missing_values = missing_values[missing_values > 0] / data.shape[0]
print(f'{missing_values * 100} %')

Series([], dtype: float64) %


In [31]:
y = data['case_outcome']
X = data.drop('case_outcome', axis=1)

In [17]:

text_features = ['case_title', 'case_text'] #это у меня функции раз 10 выдали ошибку, я сдалась и переписала все, но не помогло
text_transformer1 = TfidfVectorizer(ngram_range=(1,1), tokenizer=word_tokenize, stop_words=noise)
text_transformer2 = CountVectorizer(ngram_range=(1,1), tokenizer=word_tokenize, stop_words=noise)
preprocessor = ColumnTransformer([("txt", text_transformer1, text_features)])


In [18]:
processed_data = preprocessor.fit_transform(data)



In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [23]:

clf = LogisticRegression(solver='liblinear', random_state=42)
clf.fit(X_train, y_train)
pred = clf.predict(vec.transform(X_test['case_text']))
print(classification_report(pred, y_test))

ValueError: could not convert string to float: 'Clements v Independent Indigenous Advisory Committee [2003] FCAFC 143 ; (2003) 131 FCR 28'

In [32]:
def feature_engineering(choice_transformer, choice_ngrams):
    text_features = ['case_title', 'case_text']
    if choice_transformer == 'tfidf':
        text_transformer = TfidfVectorizer(ngram_range=choice_ngrams, tokenizer=word_tokenize, stop_words=noise)
    else:
        text_transformer = CountVectorizer(ngram_range=choice_ngrams, tokenizer=word_tokenize, stop_words=noise)

    preprocessor = ColumnTransformer(
        transformers=[("txt", text_transformer, text_features)
        ]
    )
    return preprocessor

In [33]:
def modelfit(model):
    model.fit(X_train, y_train)
    
    ypredtest = model.predict(X_test)
    ypredtrain = model.predict(X_train)
    
    print(accuracy_score(y_test, ypredtest), accuracy_score(y_train, ypredtrain))

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [35]:
preprocessor = feature_engineering('tfidf', (1, 1))

clfLR = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression())]
)

clfSVC = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", SVC())]
)

In [36]:
modelfit(clfLR)
modelfit(clfSVC)



ValueError: Found input variables with inconsistent numbers of samples: [2, 19988]