# Industry codes classification for company descriptions with self trained model

In [1]:
#import pandas for DataFrame processing and nltk for text normalization 
import pandas as pd 
import nltk

#load csv into DataFrame 
train = pd.read_csv('data/data.csv', header=None, names=['purpose', 'code'])

In [2]:
#check first records to see how data is shaped
train.head()

Unnamed: 0,purpose,code
0,Gegenstand des Unternehmens dienlich sein können.,1
1,Geschäftsgegenstand ist die Produktion pflanzl...,1
2,Gegenstand ist die Erbringung von Dienstleistu...,1
3,"Dienstleistungen aller Art, für die keine beso...",1
4,"Anbau von Obst und Gemüse, Vertrieb von selbst...",1


In [3]:
#display statistics of words count to check later progress after text normalization
train['word_count'] = train['purpose'].apply(lambda x: len(str(x).split(" ")))
train[['word_count']].describe()

Unnamed: 0,word_count
count,1117.0
mean,22.925694
std,21.508289
min,1.0
25%,11.0
50%,17.0
75%,28.0
max,382.0


In [4]:
#remove punctuation
train['purpose'] = train['purpose'].str.replace('[^\w\s]', '')

  train['purpose'] = train['purpose'].str.replace('[^\w\s]', '')


In [5]:
#lowercase
train['purpose'] = train['purpose'].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [6]:
#download german language specifics stop words (common words that only complicate computation without bringing important information)
nltk.download('stopwords')
stop = nltk.corpus.stopwords.words('german')
stop

[nltk_data] Downloading package stopwords to /home/michal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['aber',
 'alle',
 'allem',
 'allen',
 'aller',
 'alles',
 'als',
 'also',
 'am',
 'an',
 'ander',
 'andere',
 'anderem',
 'anderen',
 'anderer',
 'anderes',
 'anderm',
 'andern',
 'anderr',
 'anders',
 'auch',
 'auf',
 'aus',
 'bei',
 'bin',
 'bis',
 'bist',
 'da',
 'damit',
 'dann',
 'der',
 'den',
 'des',
 'dem',
 'die',
 'das',
 'dass',
 'daß',
 'derselbe',
 'derselben',
 'denselben',
 'desselben',
 'demselben',
 'dieselbe',
 'dieselben',
 'dasselbe',
 'dazu',
 'dein',
 'deine',
 'deinem',
 'deinen',
 'deiner',
 'deines',
 'denn',
 'derer',
 'dessen',
 'dich',
 'dir',
 'du',
 'dies',
 'diese',
 'diesem',
 'diesen',
 'dieser',
 'dieses',
 'doch',
 'dort',
 'durch',
 'ein',
 'eine',
 'einem',
 'einen',
 'einer',
 'eines',
 'einig',
 'einige',
 'einigem',
 'einigen',
 'einiger',
 'einiges',
 'einmal',
 'er',
 'ihn',
 'ihm',
 'es',
 'etwas',
 'euer',
 'eure',
 'eurem',
 'euren',
 'eurer',
 'eures',
 'für',
 'gegen',
 'gewesen',
 'hab',
 'habe',
 'haben',
 'hat',
 'hatte',
 'hatten',
 '

In [7]:

#remove stop words
train['purpose'] = train['purpose'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

In [8]:
#get frequent words which dont bring important information 
freq = pd.Series(' '.join(train['purpose']).split()).value_counts()[:1000]
freq

sowie             588
betrieb           303
handel            268
vertrieb          220
gegenstand        213
                 ... 
bauelementen        2
materialien.        2
erdstoffen,         2
-aufbereitung,      2
möglich.            2
Name: count, Length: 1000, dtype: int64

In [9]:
#manually choose frequent words which could be removed as they are not related to topic classification
freqWords = ['sowie', 'insbesondere', 'unternehmen', 'gegenstand', 'tätigkeiten', 'deren', 'gmbh', 'beteiligen', 'einschließlich', 'firma', 'eigenen', 'zweigniederlassungen', 'sitz', 'berechtigt', 'sonstigen', 'ferner', 'zusammenhängenden', 'zwecke', 'zweck', 'co', 'kg', 'gesellschaftszweck', 'amtsgericht', 'organisation', 'soweit', 'b', 'a', 'geeignet', 'ähnlichen', 'tätigkeit', 'ähnlicher', 'gleicher']
#remove popular words
train['purpose'] = train['purpose'].apply(lambda x: " ".join(x for x in x.split() if x not in freqWords))

In [10]:
#remove numeric - in this case numbers also dont bring much information 
train['purpose'] = train['purpose'].apply(lambda x: " ".join(x for x in x.split() if not x.isnumeric()))

In [11]:
#get also list of rare words that are to specific to use in model
freq = pd.Series(' '.join(train['purpose']).split()).value_counts()
rareWords = freq[freq < 6]
#remove rare words
train['purpose'] = train['purpose'].apply(lambda x: " ".join(x for x in x.split() if x not in rareWords))

In [12]:
#vestorize TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer  
tfidfconverter = TfidfVectorizer(max_features=1000, min_df=5, max_df=0.7)  
X = tfidfconverter.fit_transform(train['purpose']).toarray()
y = train['code']

In [13]:
#split data to train and test set
from sklearn.model_selection import train_test_split  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)  

In [14]:
#get RandomForestClassifier which is suitable for this kind of classification
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=100, random_state=0)  
classifier.fit(X_train, y_train)  

In [15]:
#evaluate trained model
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred))  

[[ 87  18]
 [  7 112]]
              precision    recall  f1-score   support

           1       0.93      0.83      0.87       105
           2       0.86      0.94      0.90       119

    accuracy                           0.89       224
   macro avg       0.89      0.88      0.89       224
weighted avg       0.89      0.89      0.89       224

0.8883928571428571


In [16]:
import pickle
#export model (could be used for real time service)
with open('purposeToIndustryCodeLv1', 'wb') as picklefile:  
    pickle.dump(classifier,picklefile)

In [17]:
#import model
with open('purposeToIndustryCodeLv1', 'rb') as training_model:  
    model = pickle.load(training_model)

In [18]:
#compare predicted classes to real ones
y_pred2 = model.predict(X_test)
list(zip(list(y_test), y_pred2))

[(2, 2),
 (2, 2),
 (1, 1),
 (2, 2),
 (1, 1),
 (1, 1),
 (1, 1),
 (2, 2),
 (2, 2),
 (1, 1),
 (1, 2),
 (2, 2),
 (2, 2),
 (2, 2),
 (1, 1),
 (2, 2),
 (2, 2),
 (2, 2),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (2, 2),
 (2, 2),
 (2, 2),
 (2, 2),
 (1, 1),
 (1, 1),
 (2, 2),
 (2, 2),
 (2, 1),
 (2, 1),
 (1, 2),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 2),
 (2, 2),
 (1, 1),
 (2, 2),
 (2, 2),
 (2, 2),
 (1, 1),
 (1, 1),
 (2, 2),
 (2, 2),
 (1, 1),
 (1, 1),
 (2, 2),
 (2, 2),
 (1, 2),
 (1, 1),
 (2, 2),
 (1, 1),
 (1, 1),
 (2, 1),
 (1, 2),
 (1, 1),
 (2, 2),
 (2, 2),
 (2, 2),
 (1, 2),
 (1, 1),
 (1, 1),
 (1, 1),
 (2, 2),
 (2, 2),
 (2, 2),
 (2, 2),
 (1, 1),
 (1, 1),
 (2, 2),
 (2, 2),
 (2, 2),
 (1, 1),
 (2, 2),
 (2, 2),
 (1, 1),
 (2, 2),
 (1, 2),
 (2, 2),
 (2, 2),
 (1, 1),
 (1, 2),
 (1, 1),
 (2, 2),
 (1, 1),
 (1, 2),
 (2, 2),
 (2, 2),
 (2, 1),
 (2, 2),
 (2, 2),
 (2, 2),
 (2, 2),
 (1, 1),
 (1, 1),
 (1, 2),
 (2, 2),
 (1, 2),
 (1, 1),
 (1, 1),
 (2, 2),
 (2, 2),
 (1, 2),
 (2, 2),
 (2, 2),
 (1, 1),
 