In [1]:
from text_processing import *
from sklearn.metrics import precision_recall_fscore_support
from constants import *
from functools import reduce

In [2]:
# Predicate applied to each file name in the data directory by the loading cell
# below; only names for which it returns a truthy value are read.
file_selector = is_2017_in_file

In [3]:
# Build a lazy stream of judgements. The directory listing happens right away,
# but the per-file `judgements_raw` calls are deferred until `filtered` is
# actually iterated by a later cell.
judgements_files = (
    name for name in os.listdir(json_data_dir) if file_selector(name))
texts = chain.from_iterable(
    judgements_raw(name, json_data_dir) for name in judgements_files)
# Cap the stream at the first 3000 judgements; the cap is applied *before*
# `filter_judgements`, so fewer than 3000 may remain afterwards.
texts = islice(texts, 3000)
filtered = (text for text in texts if filter_judgements(text))

In [4]:
# Pair every surviving judgement with the label of its mapped category.
with_categories = (
    WithCategory(judgement, map_category(judgement).label)
    for judgement in filtered)
# Materialise the stream: one Line per judgement holding its id, its text
# content passed through `clean_text` with `common_words`, and its category.
cleaned = [
    Line(
        item.judgement['id'],
        clean_text(item.judgement['textContent'], common_words),
        item.category,
    )
    for item in with_categories
]


### Number of words in the raw dataset

In [5]:
# Total number of whitespace-separated words across all cleaned judgements.
# sum() yields 0 when `cleaned` is empty, whereas a reduce() call without an
# initial value would raise TypeError on an empty sequence.
sum(len(line.words.split()) for line in cleaned)

5590549

In [6]:
# Reduce each cleaned line to a (text, category) pair and keep only the four
# categories the classifier is trained on.
tuples = ((line.words, line.category) for line in cleaned)
listed = [
    pair for pair in tuples
    if pair[1] in ['civil', 'criminal', 'economic', 'insurance']
]

# NOTE(review): split ratio, shuffling and seeding are decided inside
# get_train_test_data and are not visible from this notebook.
x_train, x_test, y_train, y_test = get_train_test_data(listed)

In [7]:
# TF-IDF features (with `common_words` as the stop-word list) feeding a
# one-vs-rest linear SVM.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words=common_words)),
    ('clf', OneVsRestClassifier(LinearSVC())),
])

# fit() hands back the fitted pipeline, so training and prediction are chained.
predictions = pipeline.fit(x_train, y_train).predict(x_test)

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_true=y_test, y_pred=predictions))

             precision    recall  f1-score   support

          0       0.92      0.96      0.94       251
          1       1.00      0.99      1.00       119
          2       0.84      0.43      0.57        49
          3       1.00      0.99      1.00       112

avg / total       0.95      0.92      0.93       531



In [8]:
# Micro-averaged (precision, recall, F-score, support): metrics are computed
# from the pooled counts of all classes. The support slot is None in the output
# because an averaging mode was requested.
precision_recall_fscore_support(y_test, predictions, average='micro')

(0.9497098646034816, 0.9246704331450094, 0.9370229007633587, None)

In [9]:
# Macro-averaged (precision, recall, F-score, support): the unweighted mean of
# the per-class metrics, so small classes count as much as large ones.
precision_recall_fscore_support(y_test, predictions, average='macro')

(0.9390874524714828, 0.84284971458703, 0.8741517610034026, None)

In [10]:
# Lookup table: judgement id -> category, built from the cleaned lines.
cat_id = {line.id: line.category for line in cleaned}
# NOTE(review): `tagged` is an opaque helper from the project imports; its
# results are later read through `.words` and `.category` — confirm what it loads.
tagged_categories = list(tagged(cat_id))

In [11]:
# Number of items in the tagged dataset built in the previous cell.
len(tagged_categories)

2440

In [12]:
# Same (text, category) reduction as for the raw data, this time over the
# tagged items; `tuples` and `listed` are rebound to the tagged dataset.
tuples = ((item.words, item.category) for item in tagged_categories)
listed = [
    pair for pair in tuples
    if pair[1] in ['civil', 'criminal', 'economic', 'insurance']
]

In [13]:
# Total number of whitespace-separated words across all tagged judgements.
# sum() yields 0 when `tagged_categories` is empty, whereas a reduce() call
# without an initial value would raise TypeError on an empty sequence.
sum(len(item.words.split()) for item in tagged_categories)

5545100

In [14]:
# Split the tagged (text, category) pairs; this rebinds the x_*/y_* names that
# previously held the split of the raw dataset.
# NOTE(review): split ratio, shuffling and seeding are decided inside
# get_train_test_data and are not visible from this notebook.
x_train, x_test, y_train, y_test = get_train_test_data(listed)

In [15]:
# Same model as for the raw data (TF-IDF with `common_words` as stop words,
# one-vs-rest linear SVM), now trained on the split of the tagged dataset.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words=common_words)),
    ('clf', OneVsRestClassifier(LinearSVC())),
])

# fit() hands back the fitted pipeline, so training and prediction are chained.
predictions = pipeline.fit(x_train, y_train).predict(x_test)

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_true=y_test, y_pred=predictions))

             precision    recall  f1-score   support

          0       0.86      0.98      0.91       251
          1       1.00      0.97      0.98       119
          2       1.00      0.12      0.22        49
          3       0.99      0.98      0.99       112

avg / total       0.93      0.90      0.88       531



In [16]:
# Variant of the tagged-data model: the vectorizer's stop-word list is
# `common_tagged_words` instead of `common_words`; everything else is unchanged.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words=common_tagged_words)),
    ('clf', OneVsRestClassifier(LinearSVC())),
])

# fit() hands back the fitted pipeline, so training and prediction are chained.
predictions = pipeline.fit(x_train, y_train).predict(x_test)

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_true=y_test, y_pred=predictions))

             precision    recall  f1-score   support

          0       0.86      0.98      0.91       251
          1       1.00      0.97      0.98       119
          2       1.00      0.12      0.22        49
          3       0.99      0.98      0.99       112

avg / total       0.93      0.90      0.88       531



In [17]:
# Micro-averaged (precision, recall, F-score, support) for the most recent
# `predictions` (the model using `common_tagged_words`); metrics are computed
# from the pooled counts of all classes, and support is None when averaging.
precision_recall_fscore_support(y_test, predictions, average='micro')

(0.9206963249516441, 0.896421845574388, 0.9083969465648855, None)

In [18]:
# Macro-averaged (precision, recall, F-score, support) for the most recent
# `predictions`: the unweighted mean of the per-class metrics, so the weak
# recall of a small class pulls the average down noticeably.
precision_recall_fscore_support(y_test, predictions, average='macro')

(0.9626600284495022, 0.7617685022216059, 0.7754534976918017, None)