In [1]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
import pickle
import re

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [2]:
x, y = np.load("data_x.npy"), np.load("data_y.npy")
y[np.where(y == 10)[0]] = 7
y[np.where(y == 8)[0]] = 7
x = x[np.where(y != 7)]
y = y[np.where(y != 7)]
y[np.where(y == 2)[0]] = 90
y[np.where(y == 3)[0]] = 2
y[np.where(y == 4)[0]] = 2
y[np.where(y == 5)[0]] = 2
y[np.where(y == 9)[0]] = 3
y[np.where(y == 90)[0]] = 3
y[np.where(y == 6)[0]] = 1

str_contain = np.vectorize(lambda x: "fahrrad" in x.lower())
idxs = np.intersect1d(np.where(str_contain(x))[0], np.where(y == 2)[0])
x = np.delete(x, idxs)
y = np.delete(y, idxs)

In [3]:
# automate Data cleaning
str_contain = np.vectorize(lambda x: "verkehrskontroll" in x.lower())
y[np.intersect1d(np.where(str_contain(x))[0], np.where(y == 3)[0])] = 1

str_contain = np.vectorize(lambda x: "eingebroch" in x.lower())
y[np.intersect1d(np.where(str_contain(x))[0], np.where(y == 1)[0])] = 2

str_contain = np.vectorize(lambda x: "alkohol" in x.lower())
# y[np.where(str_contain(x))[0]] = 4
idx = np.where(str_contain(x))
y = np.delete(y, idx)
x = np.delete(x, idx)

In [4]:
# remove contact data
remove_emails = np.vectorize(lambda x: re.sub("\S*@\S*\s?", "", re.sub("email:", "", re.sub("e-mail:", "", x.lower()))))
remove_telephones = np.vectorize(lambda x: re.sub("(\(?([\d \-\)\–\+\/\(]+)\)?([ .-–\/]?)([\d]+))", "", re.sub("tel.:", "", re.sub("telefon:", "", x.lower()))))
remove_links = np.vectorize(lambda x: re.sub("http://", "", re.sub("https://", "", re.sub(r'www.[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', x.lower()))))
str_contain = np.vectorize(lambda x: "email:" in x.lower())
x = remove_links(remove_telephones(remove_emails(x)))

In [5]:
def balance_data_helper(x, y, i, n):
    """
        n - int: number to reduce to
        i - int: number which should be reduced
    """
    idxs = np.where(y == i)[0]
    np.random.shuffle(idxs)
    idxs = idxs[:(len(idxs)-n)]
    x = np.delete(x, idxs)
    y = np.delete(y, idxs)
    return x, y

x, y = balance_data_helper(x, y, 1, 1400)
x, y = balance_data_helper(x, y, 2, 1400)

for i in range(1, 5):
    print(i, ": ", len(np.where(y == i)[0]))

1 :  1400
2 :  1400
3 :  835
4 :  0


In [6]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)

In [7]:
print(f"N_traing: {len(x_train)} \nN_testing: {len(x_test)}")

N_traing: 2544 
N_testing: 1091


In [8]:
stemmer = SnowballStemmer("german", ignore_stopwords=True)

analyzer = CountVectorizer().build_analyzer()

def stemmed_words(doc):
    return (stemmer.stem(w) for w in analyzer(doc))

In [9]:
clf_pipeline = Pipeline([
    ("vect", CountVectorizer(stop_words=set(stopwords.words("german")))),
    ("tfidf", TfidfTransformer()),
 #   ("scale", StandardScaler(with_mean=False)),
    ("clf", SGDClassifier()),
])

In [20]:
parameters = {
    'clf__alpha': [1e-5], # learning rate
    'clf__max_iter': [100], # number of epochs
    'clf__loss': ["log"], 
    'clf__penalty': ['l2'],
    "vect__ngram_range": [(1, 4)],
    "vect__analyzer": [stemmed_words],
}

In [21]:
gs_clf = GridSearchCV(clf_pipeline, parameters, cv=3, n_jobs=1, iid=False)
clf = gs_clf.fit(x_train, y_train)
print("Train acc:", accuracy_score(y_train, clf.predict(x_train)))
print("Test acc:", accuracy_score(y_test, clf.predict(x_test)))



Train acc: 0.9937106918238994
Test acc: 0.8606782768102658


In [22]:
confusion_matrix(y_test, clf.predict(x_test))

array([[348,  53,   7],
       [ 45, 373,   9],
       [  5,  33, 218]])

In [16]:
pickle.dump(clf, open("simple-classifier-website/model-9.sav", "wb"))

In [14]:
def print_top10(vectorizer, clf, class_labels):
    """Prints features with the highest coefficient values, per class"""
    feature_names = vectorizer.get_feature_names()
    for i, class_label in enumerate(class_labels):
        top10 = np.argsort(clf.coef_[i])[-30:]
        print("%s: %s" % (class_label,
              " ".join(feature_names[j] for j in top10)))

In [23]:
print_top10(clf.best_estimator_.named_steps["vect"], clf.best_estimator_.named_steps["clf"], ["1", "2", "3"])

1: zeitpunkt erfasst contain war anhang unberechtigt rauchentwickl montagvormittag losch bodenfeld brilon bargeld ist brennend geloscht ein unfall vermut schad erganz gegen ursach brandursach feu brandstift flamm brannt feuerwehr brand verkehrskontroll
2: zwei kennzeich wohnhaft zeit ruckt korperverletz link gesichert einkauf andere begab sie aufgebroch supermarkt tat baustell mark erbeutet geldbors zwischen donnerstag diebstahl guterbahnhof wollte soll entwendet wert tur eingebroch unbekannt
3: entdeckt gegenstand mutwill granat hausfriedensbruch polizeibeamt bech schuss gramm drogenschnellt einfluss reagiert joint auch sichergestellt beschlagnahmt mordkommission staatsanwaltschaft salzlandkreis positiv lautereck schonebeck marihuana vandalismus kontroll verstoss aussenspiegel beamt totungsdelikt kontrolliert
