In [13]:
############################################################
# Training Classifier with vectorized data and running tests
############################################################

In [None]:
# Restart the kernel so the cells below start from a clean interpreter state.
# NOTE(review): the script relies on a global `Jupyter` JavaScript object;
# presumably this only takes effect in the classic Notebook UI -- confirm
# before depending on it in other front ends.
# `HTML` is imported from its public location, IPython.display; importing it
# from IPython.core.display is deprecated.
from IPython.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

In [1]:
# optionally run preprocessing again
#%run ./import_and_preprocess.ipynb
# optionally run vectorization again
#%run ./vectorize.ipynb

In [2]:
# imports
from ClassificationInterpreter import ClassificationInterpreter
from Storage import Storage
from SessionConfigReader import SessionConfigReader
from Classifier import Classifier

Using TensorFlow backend.


In [3]:
# Look up the identifiers of the vectorization model and the Keras network
# in the session configuration, and echo both (one per line) for reference.
vec_model_id = SessionConfigReader.read_value('vec_model_id')
nn_model_id = SessionConfigReader.read_value('keras_nn_model_id')
print(vec_model_id, nn_model_id, sep='\n')

# Storage identifiers of the vectorized train / test frames loaded later on.
train_vec_id, test_vec_id = 'articles_train_vectorized', 'articles_test_vectorized'

articles_train_word2vec_001
articles_train_keras_nn_001


In [4]:
# Remove the stored model registered under nn_model_id before training.
# NOTE(review): presumably this makes the training cell below build a fresh
# model instead of reusing an earlier one -- confirm against Storage/Classifier.
Storage.delete_h5_model(nn_model_id)

In [5]:
# Load the vectorized training and test frames from storage (train first).
train_vec, test_vec = (
    Storage.load_pd_frame(train_vec_id),
    Storage.load_pd_frame(test_vec_id),
)

In [6]:
# Add the target ('category vector') column to the training frame, then drop
# the intermediate preprocessing columns that are not needed for training.
train_vec_cats_out = (
    ClassificationInterpreter.create_out_vectors(train_vec)
    .drop(columns=['noise removed', 'stopwords removed', 'preprocessed'])
)
train_vec_cats_out

Unnamed: 0,categories,text,document vector,category vector
9254,[Wirtschaft],Der amerikanische Rohstoffriese Samson Resourc...,"[0.2490101255869533, -0.19364042608668247, -0....","[0.06160935014486313, -0.17021781206130981, -0..."
3584,[Kultur],"Wenn man den Menschen in Glück ersäuft, dann f...","[0.2775443969423779, -0.13158075775977018, -0....","[0.07217168062925339, -0.10056214779615402, -0..."
7766,[Web],"Nächster Fall von Hassposting, der zu Jobverlu...","[0.31568419103045015, -0.13366419196361676, -0...","[0.06477979570627213, -0.042159389704465866, -..."
9632,[Wirtschaft],"Spaniens Energieriesen verfeuern Braunkohle, u...","[0.4154998466249678, -0.13577979533155296, -0....","[0.06160935014486313, -0.17021781206130981, -0..."
8732,[Wirtschaft],Ursache muss noch untersucht werden. Wien – Ku...,"[0.2284945207467335, -0.0821207987663775, -0.0...","[0.06160935014486313, -0.17021781206130981, -0..."
...,...,...,...,...
3029,[International],74-Jährigem drohten 350 Peitschenhiebe. London...,"[0.4176037944311839, -0.10229097418676586, -0....","[0.11131033301353455, -0.03238455578684807, 0...."
711,[Inland],Bernd Saurer war Bridge-Juniorenweltmeister un...,"[0.2277167585450522, 0.06321682860134194, -0.0...","[0.03687043488025665, -0.02509034238755703, -0..."
5772,[Sport],Finalfluch blieb Klopp treu – Sevilla unterstr...,"[0.434561253709641, -0.12695886906052087, -0.1...","[0.15435032546520233, -0.027082398533821106, -..."
7926,[Web],Außenamt: Beurteilung durch UN-Arbeitsgruppe e...,"[0.2580930808132204, -0.09712476860901613, -0....","[0.06477979570627213, -0.042159389704465866, -..."


In [7]:
# Drop the intermediate preprocessing columns from the test frame.
# errors='ignore' keeps this cell re-runnable: because test_vec is reassigned
# in place, a second execution would otherwise raise KeyError for columns that
# were already removed by the first run.
test_vec = test_vec.drop(
    columns=['noise removed', 'stopwords removed', 'preprocessed'],
    errors='ignore',
)
test_vec

Unnamed: 0,categories,text,document vector
1,[Etat],App sei nicht so angenommen worden wie geplant...,"[0.39530070764677866, -0.23964616719500295, -0..."
2,[Etat],"Zum Welttag der Suizidprävention ist es Zeit, ...","[0.3255458307990228, -0.18829256407791814, -0...."
3,[Etat],Mitarbeiter überreichten Eigentümervertretern ...,"[0.30378349925959686, -0.04361086992812293, -0..."
4,[Etat],Service: Jobwechsel in der Kommunikationsbranc...,"[0.38095428341199666, -0.35288267811306584, 0...."
5,[Etat],Was Sie über diese Woche wissen sollten - und ...,"[0.3160305998365705, -0.24576140217201706, -0...."
...,...,...,...
10262,[Wissenschaft],"Archäologin: ""Einige Monumente wie der Torboge...","[0.2687826882873196, -0.14626336341170826, -0...."
10263,[Wissenschaft],800 Wissenschafter zu großer Konferenz in Wien...,"[0.23208048956297422, -0.11530571520039491, -0..."
10264,[Wissenschaft],Vor seinem Untergang befand sich das Schiff de...,"[0.33146269986944565, -0.1507061942444037, -0...."
10270,[Wissenschaft],Die zentrale Frage des Projekts: Siedelten Ägy...,"[0.25973136383626194, -0.19709823316338265, -0..."


In [8]:
# Training...
# Fit the classifier on the prepared training frame: features are read from
# 'document vector', targets from 'category vector'. The call returns the
# identifier of the created model, which is displayed below.
new_model_id = Classifier.create_model(
    train_vec_cats_out,
    fv_col_name='document vector',
    cat_v_col_name='category vector',
)
new_model_id

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


'articles_train_keras_nn_001'

In [9]:
# Testing...
# Run the classifier over the test frame; the displayed result carries an
# additional 'classification output' column next to the input columns.
# NOTE(review): no model id is passed here -- presumably Classifier resolves
# the model from the session configuration; confirm.
classified_df = Classifier.classify(test_vec)
classified_df

Unnamed: 0,categories,text,document vector,classification output
1,[Etat],App sei nicht so angenommen worden wie geplant...,"[0.39530070764677866, -0.23964616719500295, -0...","[0.72825664, -0.5306321, -0.44767562, 0.221063..."
2,[Etat],"Zum Welttag der Suizidprävention ist es Zeit, ...","[0.3255458307990228, -0.18829256407791814, -0....","[0.6071052, -0.35649732, -0.47923097, 0.191998..."
3,[Etat],Mitarbeiter überreichten Eigentümervertretern ...,"[0.30378349925959686, -0.04361086992812293, -0...","[0.5708936, -0.52247375, -0.22104694, 0.161862..."
4,[Etat],Service: Jobwechsel in der Kommunikationsbranc...,"[0.38095428341199666, -0.35288267811306584, 0....","[0.63376707, -0.2259583, -0.305589, 0.14300132..."
5,[Etat],Was Sie über diese Woche wissen sollten - und ...,"[0.3160305998365705, -0.24576140217201706, -0....","[0.60987407, -0.2944886, -0.3737792, 0.1709261..."
...,...,...,...,...
10262,[Wissenschaft],"Archäologin: ""Einige Monumente wie der Torboge...","[0.2687826882873196, -0.14626336341170826, -0....","[0.5850443, -0.32913068, -0.19824201, 0.116589..."
10263,[Wissenschaft],800 Wissenschafter zu großer Konferenz in Wien...,"[0.23208048956297422, -0.11530571520039491, -0...","[0.5196834, -0.46166557, -0.21491116, 0.151489..."
10264,[Wissenschaft],Vor seinem Untergang befand sich das Schiff de...,"[0.33146269986944565, -0.1507061942444037, -0....","[0.5928418, -0.43080676, -0.31899327, 0.161226..."
10270,[Wissenschaft],Die zentrale Frage des Projekts: Siedelten Ägy...,"[0.25973136383626194, -0.19709823316338265, -0...","[0.52527314, -0.46704274, -0.32158172, 0.18057..."


In [10]:
# Inspect the raw network output for the row with index label 1
# (.at is label-based, so this is the row labelled 1, not the second row).
classified_df.at[1,'classification output']

array([ 0.72825664, -0.5306321 , -0.44767562,  0.22106388,  0.6971546 ,
        1.6912975 , -2.143536  ,  1.1951426 ,  2.2588098 , -0.2739616 ,
        2.4158366 ,  0.5045114 ,  0.42251217,  2.7250733 , -0.7161485 ,
       -1.5689476 , -2.2428613 , -0.61777395, -2.3512802 , -1.2071196 ,
       -1.7527621 , -0.18494083,  2.7046876 ,  0.88601404,  0.5829447 ,
       -0.1276708 , -0.23302542,  2.758905  , -0.43409356, -1.259242  ,
       -2.322217  ,  0.42608008,  0.42901388, -0.49570155,  0.14628167,
       -0.5516    , -0.6553709 ,  0.83125997, -0.7764736 ,  0.46063232,
       -1.726793  ,  1.2138308 ,  2.6944199 , -2.5396476 ,  0.46236134,
       -0.6323517 , -0.21634412,  0.11897618,  0.09264544,  0.6715116 ,
        1.2304821 ,  0.83620214,  0.0850469 , -1.8952519 , -1.4553187 ,
        1.3586793 ,  0.47190526,  0.5688002 , -0.03037878,  0.2677452 ,
       -2.00471   ,  0.90965444, -1.2455941 , -1.490379  ,  1.5964059 ,
        0.7610609 ,  1.9451611 , -1.7554919 ,  0.9693461 ,  2.00

In [11]:
# Translate each raw 'classification output' vector into a predicted category;
# the displayed frame gains a 'result' column holding that prediction.
interpreted = ClassificationInterpreter.interpret_output(classified_df)
interpreted

Unnamed: 0,categories,text,document vector,classification output,result
1,[Etat],App sei nicht so angenommen worden wie geplant...,"[0.39530070764677866, -0.23964616719500295, -0...","[0.72825664, -0.5306321, -0.44767562, 0.221063...",[Web]
2,[Etat],"Zum Welttag der Suizidprävention ist es Zeit, ...","[0.3255458307990228, -0.18829256407791814, -0....","[0.6071052, -0.35649732, -0.47923097, 0.191998...",[Panorama]
3,[Etat],Mitarbeiter überreichten Eigentümervertretern ...,"[0.30378349925959686, -0.04361086992812293, -0...","[0.5708936, -0.52247375, -0.22104694, 0.161862...",[Inland]
4,[Etat],Service: Jobwechsel in der Kommunikationsbranc...,"[0.38095428341199666, -0.35288267811306584, 0....","[0.63376707, -0.2259583, -0.305589, 0.14300132...",[Panorama]
5,[Etat],Was Sie über diese Woche wissen sollten - und ...,"[0.3160305998365705, -0.24576140217201706, -0....","[0.60987407, -0.2944886, -0.3737792, 0.1709261...",[Panorama]
...,...,...,...,...,...
10262,[Wissenschaft],"Archäologin: ""Einige Monumente wie der Torboge...","[0.2687826882873196, -0.14626336341170826, -0....","[0.5850443, -0.32913068, -0.19824201, 0.116589...",[International]
10263,[Wissenschaft],800 Wissenschafter zu großer Konferenz in Wien...,"[0.23208048956297422, -0.11530571520039491, -0...","[0.5196834, -0.46166557, -0.21491116, 0.151489...",[Inland]
10264,[Wissenschaft],Vor seinem Untergang befand sich das Schiff de...,"[0.33146269986944565, -0.1507061942444037, -0....","[0.5928418, -0.43080676, -0.31899327, 0.161226...",[Web]
10270,[Wissenschaft],Die zentrale Frage des Projekts: Siedelten Ägy...,"[0.25973136383626194, -0.19709823316338265, -0...","[0.52527314, -0.46704274, -0.32158172, 0.18057...",[Inland]


In [12]:
# Score the interpreted predictions; the call displays a single float.
# NOTE(review): presumably this is the fraction of rows whose 'result' matches
# 'categories' (accuracy) -- confirm in ClassificationInterpreter.
ClassificationInterpreter.evaluate_output(interpreted)

0.4128868989682694