In [8]:
import re, os, glob
import pandas as pd
import numpy as np 
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_predict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion

In [18]:
def test_model_cv(Xtrain, Ytrain, cv=3):
    """Estimate classifier quality with cross-validation on the training data.

    Builds a pipeline of binary character n-gram counts (n = 1..6, case
    preserved, n-grams occurring in more than 70% of documents dropped)
    feeding a ``LinearSVC(C=0.01)``, collects out-of-fold predictions with
    ``cross_val_predict`` and prints the metrics through ``evaluate``.

    Args:
        Xtrain: sequence of raw text documents (strings).
        Ytrain: labels aligned with ``Xtrain``.
        cv: cross-validation strategy forwarded to ``cross_val_predict``;
            defaults to 3 folds, the value that was previously hard-coded.

    Returns:
        None. Results are printed.
    """
    vec3 = ('charvec', CountVectorizer(analyzer='char', ngram_range=(1, 6), binary=True,
                                       max_df=0.7, lowercase=False))
    pipeline = Pipeline([
        vec3,
        ('classifier', LinearSVC(C=0.01)),
    ])
    # No up-front pipeline.fit(): cross_val_predict clones the estimator and
    # fits a fresh copy on each training fold, so a fit on the full training
    # set here would be thrown away. Both progress messages are kept so the
    # printed output is unchanged; the fitting happens inside the call below.
    print('fitting...')
    print('testing using cross-validation...')
    Ypredict = cross_val_predict(pipeline, Xtrain, Ytrain, cv=cv)
    evaluate(Ytrain, Ypredict)

In [19]:
def test_final_model(Xtrain, Ytrain, Xtest, Ytest):
    """Train on the full training set and report metrics on the test set.

    The model is a binary character n-gram (1..6) count vectorizer followed
    by a ``LinearSVC(C=0.01)``. Predictions for ``Xtest`` are compared with
    ``Ytest`` and printed through ``evaluate``. Returns None.
    """
    char_ngrams = CountVectorizer(
        analyzer='char',
        ngram_range=(1, 6),
        binary=True,
        max_df=0.7,
        lowercase=False,
    )
    svm = LinearSVC(C=0.01)
    model = Pipeline(steps=[('charvec', char_ngrams), ('classifier', svm)])

    print('fitting...')
    model.fit(Xtrain, Ytrain)

    print('testing on the test set...')
    evaluate(Ytest, model.predict(Xtest))

In [20]:
def evaluate(Ytest, Ypredict):
    """Print accuracy, the classification report and the confusion matrix.

    Args:
        Ytest: true labels.
        Ypredict: predicted labels, aligned with ``Ytest``.
    """
    # Each metric is computed right before it is printed, so earlier results
    # are still shown if a later metric raises.
    acc = accuracy_score(Ytest, Ypredict)
    print("\nAccuracy: ", acc, "\n")

    report = classification_report(Ytest, Ypredict)
    print("Classification report:\n\n", report)

    matrix = confusion_matrix(Ytest, Ypredict)
    print("Confusion matrix:\n\n", matrix, "\n")

In [21]:
# Load the pipe-separated training and test tables and index them by the
# 'Zaaknummer (LJN/ECLI)' column. read_csv already returns a DataFrame, so no
# extra pd.DataFrame(...) wrapper is needed.
train_set = pd.read_csv('training_set_topic_identification', sep='|').set_index('Zaaknummer (LJN/ECLI)')
test_set = pd.read_csv('test_set_topic_identification', sep='|').set_index('Zaaknummer (LJN/ECLI)')

In [22]:
# Features are the extracted document texts; targets are the 'is_eviction'
# labels. Both are converted to plain Python lists for scikit-learn.
Xtrain, Ytrain = train_set['extracted_text'].tolist(), train_set['is_eviction'].tolist()
Xtest, Ytest = test_set['extracted_text'].tolist(), test_set['is_eviction'].tolist()

In [23]:
# Cross-validated estimate computed on the training data only; the test set is not touched here.
test_model_cv(Xtrain, Ytrain)

fitting...
testing using cross-validation...

Accuracy:  0.8898809523809523 

Classification report:

               precision    recall  f1-score   support

       False       0.90      0.88      0.89       336
        True       0.88      0.90      0.89       336

    accuracy                           0.89       672
   macro avg       0.89      0.89      0.89       672
weighted avg       0.89      0.89      0.89       672

Confusion matrix:

 [[294  42]
 [ 32 304]] 



In [24]:
# Fit on the whole training set and score once on the separate test set.
test_final_model(Xtrain, Ytrain, Xtest, Ytest)

fitting...
testing on the test set...

Accuracy:  0.88 

Classification report:

               precision    recall  f1-score   support

       False       0.94      0.81      0.87       100
        True       0.83      0.95      0.89       100

    accuracy                           0.88       200
   macro avg       0.89      0.88      0.88       200
weighted avg       0.89      0.88      0.88       200

Confusion matrix:

 [[81 19]
 [ 5 95]] 

