# A simple textual classifier for Dutch citizen reports
Example of a simple textual classification using TF-IDF and SGD. Optimal hyperparameters for the dataset are found using a gridsearch.

An example dataset of dutch citizen reports is added for demonstration purposes.

In [7]:
import pandas as pd

df = pd.read_csv('voorbeeld_meldingen.csv')
print(len(df),'meldingen ingeladen')

texts = df['Tekst']
labels = df['Label']

split = 0.5
splitpoint = int(split*len(texts))


# train data
train_texts = texts[:splitpoint]
train_labels = labels[:splitpoint]

# test data
test_texts = texts[splitpoint:]
test_labels = labels[splitpoint:]

97 meldingen ingeladen


In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier

# pipeline of classifier
pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', SGDClassifier()),
])

# possible parameters to do gridsearch on
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 5000,),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__use_idf': (True, False),
    'vect__norm': ('l1', 'l2'),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    'clf__max_iter': (10, 50, 80,100,150),
    'clf__loss':('log',)
}

grid_search = GridSearchCV(pipeline, parameters)
grid_search.fit(train_texts, train_labels)

print('Best parameters: ')
print(grid_search.best_params_)
print('')

print('Best score: ')
print(grid_search.best_score_)
print('')

Best parameters: 
{'clf__alpha': 1e-05, 'clf__loss': 'log', 'clf__max_iter': 10, 'clf__penalty': 'elasticnet', 'vect__max_df': 0.5, 'vect__max_features': None, 'vect__ngram_range': (1, 1), 'vect__norm': 'l2', 'vect__use_idf': True}

Best score: 
0.75



# Model persistence
http://scikit-learn.org/stable/modules/model_persistence.html

In [10]:
from sklearn.externals import joblib
joblib.dump(grid_search, 'model.pkl') 

model = joblib.load('model.pkl') 

# Evaluation

In [11]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score

test_predict = model.predict(test_texts)

precision = str(round(precision_score(test_labels, test_predict, average='macro'),2))
recall = str(round(recall_score(test_labels, test_predict, average='macro'),2))
accuracy = str(round(accuracy_score(test_labels, test_predict),2))

print('Precision',precision )
print('Recall',recall )
print('Accuracy',accuracy )

Precision 0.18
Recall 0.24
Accuracy 0.71


  'precision', 'predicted', average, warn_for)
