In [14]:
#import pandas for DataFrame processing and nltk for text normalization 
import pandas as pd 
import nltk

#load csv into DataFrame 
train = pd.read_csv('data/data.csv', header=None, names=['purpose', 'code'])

In [15]:
#check first records to see how data is shaped
train.head()

Unnamed: 0,purpose,code
0,Gegenstand des Unternehmens dienlich sein können.,1
1,Geschäftsgegenstand ist die Produktion pflanzl...,1
2,Gegenstand ist die Erbringung von Dienstleistu...,1
3,"Dienstleistungen aller Art, für die keine beso...",1
4,"Anbau von Obst und Gemüse, Vertrieb von selbst...",1


In [16]:
#display statistics of words count to check later progress after text normalization
train['word_count'] = train['purpose'].apply(lambda x: len(str(x).split(" ")))
train[['word_count']].describe()

Unnamed: 0,word_count
count,1117.0
mean,22.925694
std,21.508289
min,1.0
25%,11.0
50%,17.0
75%,28.0
max,382.0


In [17]:
#remove punctuation
train['purpose'] = train['purpose'].str.replace('[^\w\s]', '')

In [18]:
#lowercase
train['purpose'] = train['purpose'].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [19]:
#download german language specifics stop words (common words that only complicate computation without bringing important information)
nltk.download('stopwords')
stop = nltk.corpus.stopwords.words('german')
stop

[nltk_data] Downloading package stopwords to /home/rzar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['aber',
 'alle',
 'allem',
 'allen',
 'aller',
 'alles',
 'als',
 'also',
 'am',
 'an',
 'ander',
 'andere',
 'anderem',
 'anderen',
 'anderer',
 'anderes',
 'anderm',
 'andern',
 'anderr',
 'anders',
 'auch',
 'auf',
 'aus',
 'bei',
 'bin',
 'bis',
 'bist',
 'da',
 'damit',
 'dann',
 'der',
 'den',
 'des',
 'dem',
 'die',
 'das',
 'daß',
 'derselbe',
 'derselben',
 'denselben',
 'desselben',
 'demselben',
 'dieselbe',
 'dieselben',
 'dasselbe',
 'dazu',
 'dein',
 'deine',
 'deinem',
 'deinen',
 'deiner',
 'deines',
 'denn',
 'derer',
 'dessen',
 'dich',
 'dir',
 'du',
 'dies',
 'diese',
 'diesem',
 'diesen',
 'dieser',
 'dieses',
 'doch',
 'dort',
 'durch',
 'ein',
 'eine',
 'einem',
 'einen',
 'einer',
 'eines',
 'einig',
 'einige',
 'einigem',
 'einigen',
 'einiger',
 'einiges',
 'einmal',
 'er',
 'ihn',
 'ihm',
 'es',
 'etwas',
 'euer',
 'eure',
 'eurem',
 'euren',
 'eurer',
 'eures',
 'für',
 'gegen',
 'gewesen',
 'hab',
 'habe',
 'haben',
 'hat',
 'hatte',
 'hatten',
 'hier',
 '

In [20]:

#remove stop words
train['purpose'] = train['purpose'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

In [21]:
#get frequent words which dont bring important information 
freq = pd.Series(' '.join(train['purpose']).split()).value_counts()[:1000]
freq

sowie                        589
betrieb                      308
handel                       279
vertrieb                     237
kies                         227
gegenstand                   222
sand                         205
gesellschaft                 194
art                          189
unternehmens                 179
landwirtschaftlichen         161
gewinnung                    161
insbesondere                 145
unternehmen                  138
produktion                   116
geschäftsgegenstand          109
erzeugung                    106
landwirtschaftlicher         101
herstellung                  101
dienstleistungen              98
durchführung                  97
geschäfte                     96
deren                         90
verarbeitung                  90
verkauf                       88
aufbereitung                  87
vermarktung                   82
baustoffen                    76
zusammenhang                  74
produkte                      68
          

In [22]:
#manually choose frequent words which could be removed as they are not related to topic classification
freqWords = ['sowie', 'insbesondere', 'unternehmen', 'gegenstand', 'tätigkeiten', 'deren', 'gmbh', 'beteiligen', 'einschließlich', 'firma', 'eigenen', 'zweigniederlassungen', 'sitz', 'berechtigt', 'sonstigen', 'ferner', 'zusammenhängenden', 'zwecke', 'zweck', 'co', 'kg', 'gesellschaftszweck', 'amtsgericht', 'organisation', 'soweit', 'b', 'a', 'geeignet', 'ähnlichen', 'tätigkeit', 'ähnlicher', 'gleicher']
#remove popular words
train['purpose'] = train['purpose'].apply(lambda x: " ".join(x for x in x.split() if x not in freqWords))

In [23]:
#remove numeric - in this case numbers also dont bring much information 
train['purpose'] = train['purpose'].apply(lambda x: " ".join(x for x in x.split() if not x.isnumeric()))

In [24]:
#get also list of rare words that are to specific to use in model
freq = pd.Series(' '.join(train['purpose']).split()).value_counts()
rareWords = freq[freq < 6]
#remove rare words
train['purpose'] = train['purpose'].apply(lambda x: " ".join(x for x in x.split() if x not in rareWords))

In [25]:
#download german lemmatizer to simplify word vectors (normalize words to same form)
nltk.download('punkt')
from textblob_de.lemmatizers import PatternParserLemmatizer
_lemmatizer = PatternParserLemmatizer()
train['purpose'] = train['purpose'].apply(lambda x: " ".join([_lemmatizer.lemmatize(word)[0][0] for word in x.split()]))

[nltk_data] Downloading package punkt to /home/rzar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [26]:
#vestorize TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer  
tfidfconverter = TfidfVectorizer(max_features=1000, min_df=5, max_df=0.7)  
X = tfidfconverter.fit_transform(train['purpose']).toarray()
y = train['code']

In [27]:
#split data to train and test set
from sklearn.model_selection import train_test_split  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)  

In [28]:
#get RandomForestClassifier which is suitable for this kind of classification
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=100, random_state=0)  
classifier.fit(X_train, y_train)  

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [29]:
#evaluate trained model
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred))  

[[ 92  13]
 [  5 114]]
             precision    recall  f1-score   support

          1       0.95      0.88      0.91       105
          8       0.90      0.96      0.93       119

avg / total       0.92      0.92      0.92       224

0.9196428571428571


In [30]:
import pickle
#export model (could be used for real time service)
with open('purposeToIndustryCodeLv1', 'wb') as picklefile:  
    pickle.dump(classifier,picklefile)

In [31]:
#import model
with open('purposeToIndustryCodeLv1', 'rb') as training_model:  
    model = pickle.load(training_model)

In [32]:
#compare predicted classes to real ones
y_pred2 = model.predict(X_test)
list(zip(list(y_test), y_pred2))

[(8, 8),
 (8, 8),
 (1, 1),
 (8, 8),
 (1, 1),
 (1, 1),
 (1, 1),
 (8, 8),
 (8, 8),
 (1, 1),
 (1, 8),
 (8, 8),
 (8, 8),
 (8, 8),
 (1, 1),
 (8, 8),
 (8, 8),
 (8, 8),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (8, 8),
 (8, 8),
 (8, 1),
 (8, 8),
 (1, 1),
 (1, 1),
 (8, 8),
 (8, 8),
 (8, 8),
 (8, 8),
 (1, 8),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 8),
 (8, 8),
 (1, 1),
 (8, 8),
 (8, 8),
 (8, 8),
 (1, 1),
 (1, 8),
 (8, 8),
 (8, 8),
 (1, 1),
 (1, 1),
 (8, 8),
 (8, 8),
 (1, 8),
 (1, 1),
 (8, 8),
 (1, 1),
 (1, 1),
 (8, 8),
 (1, 8),
 (1, 1),
 (8, 8),
 (8, 8),
 (8, 8),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (8, 8),
 (8, 8),
 (8, 8),
 (8, 8),
 (1, 1),
 (1, 1),
 (8, 8),
 (8, 8),
 (8, 8),
 (1, 1),
 (8, 8),
 (8, 8),
 (1, 1),
 (8, 8),
 (1, 1),
 (8, 8),
 (8, 8),
 (1, 1),
 (1, 1),
 (1, 1),
 (8, 8),
 (1, 1),
 (1, 1),
 (8, 8),
 (8, 1),
 (8, 8),
 (8, 8),
 (8, 8),
 (8, 8),
 (8, 8),
 (1, 1),
 (1, 1),
 (1, 8),
 (8, 8),
 (1, 8),
 (1, 1),
 (1, 1),
 (8, 8),
 (8, 8),
 (1, 1),
 (8, 8),
 (8, 8),
 (1, 1),
 