# Method comparisson for classifying legal texts

Read more here:
https://github.com/kk7nc/Text_Classification#comparison-text-classification-algorithms

In [1]:
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
import pandas as pd
import numpy as np

## The sample dataset can be downloaded from: 
https://github.com/nlpvienna/Meetup/tree/master/01

In [3]:
df = pd.read_csv('sample_dataset.csv', sep='\t', encoding='utf-8')

In [4]:
df

Unnamed: 0,Class,Text
0,1,Aufgrund dieser Rechtsvorschriften ist von zwe...
1,0,Der Bodenaufbau in der Pflanzwanne setzte sich...
2,1,Maximal die Hälfte der Fertigstellungskosten z...
3,1,Alles was zum Thema Bauwerksbegrünung angebote...
4,1,Retentionsdächer werden in Deutschland immer h...
5,1,Dabei steht ihnen eine große Bandbreite an Mög...
6,0,BILDQUELLE: OPTIGRÜN \t Der gestalterischen Vi...
7,1,Abb. \t 3: Verbindliche Festlegung von Dachbeg...
8,0,Wolfgang Dickhaut und Dipl.-Geoökol. \t Michae...
9,1,Eine völlig neue Regelung zugunsten der Bauunt...


In [5]:
# One way to random sample the data
df.sample(frac=0.8, replace=True, random_state=1)

Unnamed: 0,Class,Text
37,0,Jahrhundert zum positiven verändern. \t Aufgru...
43,1,4.3 Absturzsicherung während der Ausführung Si...
12,0,Mit dem Neubau des Bundesministerium für Bildu...
8,0,Wolfgang Dickhaut und Dipl.-Geoökol. \t Michae...
9,1,Eine völlig neue Regelung zugunsten der Bauunt...
11,1,"Ziel ist es, den ursprünglichen Zustand des Na..."
5,1,Dabei steht ihnen eine große Bandbreite an Mög...
15,0,Soll der Pflegeaufwand weiterhin gering gehalt...
0,1,Aufgrund dieser Rechtsvorschriften ist von zwe...
16,1,Gebäudedaten Als zweiter Datensatz für die Inv...


In [6]:
# But we use this way:

msk = np.random.rand(len(df)) < 0.8
train = df[msk]
test = df[~msk]

In [7]:
X_train = train['Text']
X_test = test['Text']
y_train = train['Class']
y_test = test['Class']

In [8]:
def fit(text_clf):
    text_clf.fit(X_train, y_train)
    predicted = text_clf.predict(X_test)
    print(metrics.classification_report(y_test, predicted))

# Using Rocchio

In [9]:
rocchio = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', NearestCentroid()),
                     ])


In [10]:
fit(rocchio)

              precision    recall  f1-score   support

           0       0.67      0.33      0.44         6
           1       0.43      0.75      0.55         4

   micro avg       0.50      0.50      0.50        10
   macro avg       0.55      0.54      0.49        10
weighted avg       0.57      0.50      0.48        10



# Using Boosting

In [11]:
from sklearn.ensemble import GradientBoostingClassifier

In [12]:
boosting = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', GradientBoostingClassifier(n_estimators=100)),
                     ])

fit(boosting)

              precision    recall  f1-score   support

           0       0.67      0.67      0.67         6
           1       0.50      0.50      0.50         4

   micro avg       0.60      0.60      0.60        10
   macro avg       0.58      0.58      0.58        10
weighted avg       0.60      0.60      0.60        10



# Using Bagging

In [13]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier

In [14]:
bagging = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', BaggingClassifier(KNeighborsClassifier())),
                     ])
fit(bagging)

              precision    recall  f1-score   support

           0       0.60      0.50      0.55         6
           1       0.40      0.50      0.44         4

   micro avg       0.50      0.50      0.50        10
   macro avg       0.50      0.50      0.49        10
weighted avg       0.52      0.50      0.51        10



# Naive Bayes

In [15]:
from sklearn.naive_bayes import MultinomialNB

In [16]:
naive_bayes = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
                     ])
fit(naive_bayes)


              precision    recall  f1-score   support

           0       0.50      0.17      0.25         6
           1       0.38      0.75      0.50         4

   micro avg       0.40      0.40      0.40        10
   macro avg       0.44      0.46      0.38        10
weighted avg       0.45      0.40      0.35        10



# KNN

In [17]:
from sklearn.neighbors import KNeighborsClassifier

In [18]:
knn = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', KNeighborsClassifier()),
                     ])
fit(knn)

              precision    recall  f1-score   support

           0       0.60      0.50      0.55         6
           1       0.40      0.50      0.44         4

   micro avg       0.50      0.50      0.50        10
   macro avg       0.50      0.50      0.49        10
weighted avg       0.52      0.50      0.51        10



# SVM

In [19]:
from sklearn.svm import LinearSVC

In [20]:
svm = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LinearSVC()),
                     ])
fit(svm)

              precision    recall  f1-score   support

           0       0.67      0.33      0.44         6
           1       0.43      0.75      0.55         4

   micro avg       0.50      0.50      0.50        10
   macro avg       0.55      0.54      0.49        10
weighted avg       0.57      0.50      0.48        10



# Random Forest

In [21]:
from sklearn.ensemble import RandomForestClassifier

In [22]:
rand_forest = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', RandomForestClassifier(n_estimators=100)),
                     ])
fit(rand_forest)

              precision    recall  f1-score   support

           0       0.80      0.67      0.73         6
           1       0.60      0.75      0.67         4

   micro avg       0.70      0.70      0.70        10
   macro avg       0.70      0.71      0.70        10
weighted avg       0.72      0.70      0.70        10



# Deep Learning (Keras)

In [23]:
# %conda install keras

In [24]:
from keras.layers import  Dropout, Dense
from keras.models import Sequential
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn import metrics

Using TensorFlow backend.


In [25]:
def TFIDF(X_train, X_test,MAX_NB_WORDS=75000):
    vectorizer_x = TfidfVectorizer(max_features=MAX_NB_WORDS)
    X_train = vectorizer_x.fit_transform(X_train).toarray()
    X_test = vectorizer_x.transform(X_test).toarray()
    print("tf-idf with",str(np.array(X_train).shape[1]),"features")
    return (X_train,X_test)

In [26]:
def Build_Model_DNN_Text(shape, nClasses, dropout=0.5):
    """
    buildModel_DNN_Tex(shape, nClasses,dropout)
    Build Deep neural networks Model for text classification
    Shape is input feature space
    nClasses is number of classes
    """
    model = Sequential()
    node = 512 # number of nodes
    nLayers = 4 # number of  hidden layer

    model.add(Dense(node,input_dim=shape,activation='relu'))
    model.add(Dropout(dropout))
    for i in range(0,nLayers):
        model.add(Dense(node,input_dim=node,activation='relu'))
        model.add(Dropout(dropout))
    model.add(Dense(nClasses, activation='softmax'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [27]:
X_train_tfidf,X_test_tfidf = TFIDF(X_train,X_test)
model_DNN = Build_Model_DNN_Text(X_train_tfidf.shape[1], 1)

model_DNN.fit(X_train_tfidf, list(y_train),
                              validation_data=(X_test_tfidf, list(y_test)),
                              epochs=10,
                              batch_size=128,
                              verbose=2)

predicted = model_DNN.predict(X_test_tfidf)

tf-idf with 920 features
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Train on 40 samples, validate on 10 samples
Epoch 1/10
 - 1s - loss: 7.5726 - acc: 0.5250 - val_loss: 9.5654 - val_acc: 0.4000
Epoch 2/10
 - 0s - loss: 7.5726 - acc: 0.5250 - val_loss: 9.5654 - val_acc: 0.4000
Epoch 3/10
 - 0s - loss: 7.5726 - acc: 0.5250 - val_loss: 9.5654 - val_acc: 0.4000
Epoch 4/10
 - 0s - loss: 7.5726 - acc: 0.5250 - val_loss: 9.5654 - val_acc: 0.4000
Epoch 5/10
 - 0s - loss: 7.5726 - acc: 0.5250 - val_loss: 9.5654 - val_acc: 0.4000
Epoch 6/10
 - 0s - loss: 7.5726 - acc: 0.5250 - val_loss: 9.5654 - val_acc: 0.4000
Epoch 7/10
 - 0s - loss: 7.5726 - acc: 0.5250 - val_loss: 9.5654 - val_acc: 0.4000
Epoch 8/10
 - 0s - loss: 7.5726 - acc: 0.5250 - val_loss: 9.5654 - val_acc: 0.4000
Epoch 9/10
 - 0s - loss: 7.5726 

In [28]:
print(metrics.classification_report(np.array(y_test), predicted))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         6
           1       0.40      1.00      0.57         4

   micro avg       0.40      0.40      0.40        10
   macro avg       0.20      0.50      0.29        10
weighted avg       0.16      0.40      0.23        10



  'precision', 'predicted', average, warn_for)


## LSTM using FastText Vectors

In [29]:
from keras.layers import Dropout, Dense, GRU, Embedding
from keras.models import Sequential
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn import metrics
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [30]:
from pyfasttext import FastText

In [31]:
ft = FastText()

Word vectors can be downloaded here: https://fasttext.cc/docs/en/crawl-vectors.html

In [32]:
ft.load_model('../models/cc.de.300.bin')

In [36]:
def loadData_Tokenizer(X_train, X_test,MAX_NB_WORDS=75000,MAX_SEQUENCE_LENGTH=500):
    np.random.seed(7)
    text = np.concatenate((X_train, X_test), axis=0)
    text = np.array(text)
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    word_index = tokenizer.word_index
    text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Found %s unique tokens.' % len(word_index))
    indices = np.arange(text.shape[0])
    # np.random.shuffle(indices)
    text = text[indices]
    print(text.shape)
    X_train = text[0:len(X_train), ]
    X_test = text[len(X_train):, ]
    embeddings_index = {}
#     f = open("glove.6B.50d.txt", encoding="utf8")
#     for line in f:

#         values = line.split()
#         word = values[0]
#         try:
#             coefs = np.asarray(values[1:], dtype='float32')
#         except:
#             pass
#         embeddings_index[word] = coefs
#     f.close()
    print('Total %s word vectors.' % len(embeddings_index))
    return (X_train, X_test, word_index,embeddings_index)

In [34]:
def Build_Model_RNN_Text(word_index, embeddings_index, nclasses,  MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=50, dropout=0.5):
    """
    def buildModel_RNN(word_index, embeddings_index, nclasses,  MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=50, dropout=0.5):
    word_index in word index ,
    embeddings_index is embeddings index, look at data_helper.py
    nClasses is number of classes,
    MAX_SEQUENCE_LENGTH is maximum lenght of text sequences
    """

    model = Sequential()
    hidden_layer = 3
    gru_node = 32

    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            if len(embedding_matrix[i]) != len(embedding_vector):
                print("could not broadcast input array from shape", str(len(embedding_matrix[i])),
                      "into shape", str(len(embedding_vector)), " Please make sure your"
                                                                " EMBEDDING_DIM is equal to embedding_vector file ,GloVe,")
                exit(1)
            embedding_matrix[i] = embedding_vector
    model.add(Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True))


    print(gru_node)
    for i in range(0,hidden_layer):
        model.add(GRU(gru_node,return_sequences=True, recurrent_dropout=0.2))
        model.add(Dropout(dropout))
    model.add(GRU(gru_node, recurrent_dropout=0.2))
    model.add(Dropout(dropout))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(nclasses, activation='softmax'))


    model.compile(loss='sparse_categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
    return model