In [17]:
############################################################
# Training Classifier with vectorized data and running tests
############################################################

In [None]:
# Restart the kernel so the notebook starts from a clean state.
# NOTE: running this cell discards every variable in the current kernel, so
# skip it when using "Run All" -- the cells below would lose their inputs.
# NOTE(review): the script relies on the classic Notebook front-end's global
# `Jupyter` JavaScript object; presumably it has no effect in JupyterLab --
# verify in the front-end you use.
# HTML is imported from the public `IPython.display` API rather than the
# internal `IPython.core.display` module.
from IPython.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

In [1]:
# optionally run preprocessing again
#%run ./import_and_preprocess.ipynb
# optionally run vectorization again
#%run ./vectorize.ipynb

In [2]:
# imports
from ClassificationInterpreter import ClassificationInterpreter
from Storage import Storage
from SessionConfigReader import SessionConfigReader
from Classifier import Classifier
from EvaluationHandler import EvaluationHandler

Using TensorFlow backend.


In [3]:
# Storage ids of the vectorized train / test frames loaded further below.
train_vec_id = 'articles_train_vectorized'
test_vec_id = 'articles_test_vectorized'

# Model ids are read from the active session configuration and echoed so the
# run documents which models it used.
vec_model_id = SessionConfigReader.read_value('vec_model_id')
nn_model_id = SessionConfigReader.read_value('keras_nn_model_id')
print(vec_model_id, nn_model_id, sep='\n')

articles_train_word2vec_001
articles_train_keras_nn_001


In [4]:
# Ask Storage to delete the h5 model stored under the configured NN model id.
# NOTE(review): presumably this forces the training cell below to build a new
# model instead of reusing a previously saved one -- confirm in Storage/Classifier.
Storage.delete_h5_model(nn_model_id)

In [5]:
# Load the vectorized training and test frames from Storage, in that order.
train_vec, test_vec = (
    Storage.load_pd_frame(frame_id)
    for frame_id in (train_vec_id, test_vec_id)
)

In [6]:
# Build the training frame in one chain: create the output vectors, then
# discard the intermediate preprocessing columns that training does not use.
# NOTE(review): create_out_vectors presumably adds the 'category vector'
# column visible in the output below -- confirm in ClassificationInterpreter.
train_vec_cats_out = (
    ClassificationInterpreter
    .create_out_vectors(train_vec)
    .drop(columns=['noise removed', 'stopwords removed', 'preprocessed'])
)
train_vec_cats_out

Unnamed: 0,categories,text,document vector,category vector
9254,[Wirtschaft],Der amerikanische Rohstoffriese Samson Resourc...,"[0.34127889108663845, 0.4478745025842981, 0.13...","[0.2768802046775818, 0.18851660192012787, 0.21..."
3584,[Kultur],"Wenn man den Menschen in Glück ersäuft, dann f...","[0.07081660233719352, 0.2716327164024632, 0.10...","[0.06712189316749573, 0.020851198583841324, 0...."
7766,[Web],"Nächster Fall von Hassposting, der zu Jobverlu...","[0.13631616547936573, 0.2503662471100688, 0.16...","[0.06860899925231934, 0.08835186809301376, 0.0..."
9632,[Wirtschaft],"Spaniens Energieriesen verfeuern Braunkohle, u...","[0.1272728861933881, 0.5433052562504679, 0.035...","[0.2768802046775818, 0.18851660192012787, 0.21..."
8732,[Wirtschaft],Ursache muss noch untersucht werden. Wien – Ku...,"[0.2076187159159574, 0.16771606975181522, 0.21...","[0.2768802046775818, 0.18851660192012787, 0.21..."
...,...,...,...,...
3029,[International],74-Jährigem drohten 350 Peitschenhiebe. London...,"[0.2738984087752828, 0.49909413738046277, 0.12...","[0.1570603847503662, 0.13728342950344086, 0.16..."
711,[Inland],Bernd Saurer war Bridge-Juniorenweltmeister un...,"[0.11189016641335602, 0.10600653727677592, 0.3...","[0.03608638793230057, 0.03688330575823784, 0.0..."
5772,[Sport],Finalfluch blieb Klopp treu – Sevilla unterstr...,"[-0.0007492994651935684, 0.4518081661735778, 0...","[0.030196834355592728, 0.09324131160974503, 0...."
7926,[Web],Außenamt: Beurteilung durch UN-Arbeitsgruppe e...,"[0.20784718948804462, 0.36074602253211197, 0.2...","[0.06860899925231934, 0.08835186809301376, 0.0..."


In [7]:
# Discard the intermediate preprocessing columns from the test frame.
# test_vec is reassigned to its own result, so on a second run of this cell
# the columns are already gone and a plain drop() would raise KeyError.
# errors='ignore' skips labels that are not present, which keeps the cell
# safe to re-run. Trade-off: a misspelled column name is skipped silently too.
test_vec = test_vec.drop(
    columns=['noise removed', 'stopwords removed', 'preprocessed'],
    errors='ignore',
)
test_vec

Unnamed: 0,categories,text,document vector
1,[Etat],App sei nicht so angenommen worden wie geplant...,"[0.2884349889521088, 0.3827846588433853, 0.191..."
2,[Etat],"Zum Welttag der Suizidprävention ist es Zeit, ...","[0.059306005182276876, 0.2999477057296428, 0.1..."
3,[Etat],Mitarbeiter überreichten Eigentümervertretern ...,"[0.22735203492144743, 0.22006606158890551, 0.2..."
4,[Etat],Service: Jobwechsel in der Kommunikationsbranc...,"[0.3180735347094667, 0.6894320094038565, 0.077..."
5,[Etat],Was Sie über diese Woche wissen sollten - und ...,"[0.17366859602043405, 0.4706323624510939, 0.12..."
...,...,...,...
10262,[Wissenschaft],"Archäologin: ""Einige Monumente wie der Torboge...","[0.2167395314289024, 0.347467121767113, 0.2140..."
10263,[Wissenschaft],800 Wissenschafter zu großer Konferenz in Wien...,"[0.19033763801208, 0.2725844602324684, 0.18755..."
10264,[Wissenschaft],Vor seinem Untergang befand sich das Schiff de...,"[0.2244192346234817, 0.340721504364432, 0.1929..."
10270,[Wissenschaft],Die zentrale Frage des Projekts: Siedelten Ägy...,"[0.1363486554628859, 0.28363551026801964, 0.18..."


In [8]:
# Train the classifier on the prepared training frame.
# Feature vectors are read from 'document vector' and the target vectors from
# 'category vector'; the call returns the id of the resulting model, which is
# displayed as the cell output.
# NOTE(review): no random seed is set anywhere in this notebook, so scores may
# differ between runs -- confirm whether Classifier seeds internally.
new_model_id = Classifier.create_model(
    train_vec_cats_out,
    fv_col_name='document vector',
    cat_v_col_name='category vector',
)
new_model_id

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


'articles_train_keras_nn_001'

In [9]:
# Run the classifier on the test frame.
# The output below shows the input columns plus a 'classification output'
# column holding one raw output vector per row.
# NOTE(review): no model id is passed here, so classify() presumably picks up
# the configured/just-trained model on its own -- confirm in Classifier.
classified_df = Classifier.classify(test_vec)
classified_df

Unnamed: 0,categories,text,document vector,classification output
1,[Etat],App sei nicht so angenommen worden wie geplant...,"[0.2884349889521088, 0.3827846588433853, 0.191...","[0.84061253, 0.84090406, 0.94367385, -0.107084..."
2,[Etat],"Zum Welttag der Suizidprävention ist es Zeit, ...","[0.059306005182276876, 0.2999477057296428, 0.1...","[0.53088236, 0.5304187, 0.8797483, 0.2492378, ..."
3,[Etat],Mitarbeiter überreichten Eigentümervertretern ...,"[0.22735203492144743, 0.22006606158890551, 0.2...","[0.75466776, 0.6817816, 0.7758203, -0.00133423..."
4,[Etat],Service: Jobwechsel in der Kommunikationsbranc...,"[0.3180735347094667, 0.6894320094038565, 0.077...","[0.62585014, 0.66406876, 0.95136386, 0.3256631..."
5,[Etat],Was Sie über diese Woche wissen sollten - und ...,"[0.17366859602043405, 0.4706323624510939, 0.12...","[0.57008237, 0.5645143, 0.8945138, 0.32616675,..."
...,...,...,...,...
10262,[Wissenschaft],"Archäologin: ""Einige Monumente wie der Torboge...","[0.2167395314289024, 0.347467121767113, 0.2140...","[0.7029113, 0.67211354, 0.8243644, 0.1789607, ..."
10263,[Wissenschaft],800 Wissenschafter zu großer Konferenz in Wien...,"[0.19033763801208, 0.2725844602324684, 0.18755...","[0.6597572, 0.59114, 0.71843415, 0.039793275, ..."
10264,[Wissenschaft],Vor seinem Untergang befand sich das Schiff de...,"[0.2244192346234817, 0.340721504364432, 0.1929...","[0.7081508, 0.68526673, 0.82491165, 0.01176454..."
10270,[Wissenschaft],Die zentrale Frage des Projekts: Siedelten Ägy...,"[0.1363486554628859, 0.28363551026801964, 0.18...","[0.63615185, 0.57807386, 0.76437354, 0.0199115..."


In [10]:
# Show the complete raw output vector for a single test row as a sanity check.
# .at is label-based: this selects the row whose index label is 1, not the
# row at position 1.
classified_df.at[1,'classification output']

array([ 0.84061253,  0.84090406,  0.94367385, -0.10708471,  0.6469066 ,
        0.10927326,  0.74465126,  0.52360886, -1.044111  , -0.3055527 ,
        0.89293814,  0.7883389 , -0.39233035, -0.20120534, -1.039483  ,
       -0.20099604,  0.02337285, -0.70523965,  2.094912  ,  0.08424125,
       -0.13523097,  1.9906608 , -0.10628876, -0.36551887,  0.95211226,
        0.08876705, -0.44548202,  0.78281784, -0.64203805, -1.8766586 ,
        1.7029634 ,  2.2068903 ,  0.0698546 , -0.38563886,  1.19199   ,
       -0.7456083 , -0.44519556, -0.5473114 ,  0.48538974,  0.1876628 ,
        3.6918128 ,  2.2655394 ,  0.25578314, -0.22193083,  0.60012513,
        1.0720999 , -0.87098664,  1.2353083 , -0.61001974, -0.08325816,
       -0.07065292, -1.4331573 ,  0.11759236,  3.2259538 ,  1.3035096 ,
        0.14634882,  2.1039507 ,  0.3755869 ,  0.15571786, -0.560899  ,
       -1.0202532 , -0.79028636,  0.45973206,  1.3772029 ,  0.6954886 ,
       -1.3735331 ,  0.25677803, -2.1887748 ,  1.6327817 ,  0.21

In [11]:
# Interpret the raw classifier outputs.
# The output below gains a 'result' column holding a category label list per
# row, shown next to the true 'categories' column.
# NOTE(review): how an output vector is mapped to a label is not visible
# here -- see ClassificationInterpreter.interpret_output.
interpreted = ClassificationInterpreter.interpret_output(classified_df)
interpreted

Unnamed: 0,categories,text,document vector,classification output,result
1,[Etat],App sei nicht so angenommen worden wie geplant...,"[0.2884349889521088, 0.3827846588433853, 0.191...","[0.84061253, 0.84090406, 0.94367385, -0.107084...",[Web]
2,[Etat],"Zum Welttag der Suizidprävention ist es Zeit, ...","[0.059306005182276876, 0.2999477057296428, 0.1...","[0.53088236, 0.5304187, 0.8797483, 0.2492378, ...",[Panorama]
3,[Etat],Mitarbeiter überreichten Eigentümervertretern ...,"[0.22735203492144743, 0.22006606158890551, 0.2...","[0.75466776, 0.6817816, 0.7758203, -0.00133423...",[Inland]
4,[Etat],Service: Jobwechsel in der Kommunikationsbranc...,"[0.3180735347094667, 0.6894320094038565, 0.077...","[0.62585014, 0.66406876, 0.95136386, 0.3256631...",[Panorama]
5,[Etat],Was Sie über diese Woche wissen sollten - und ...,"[0.17366859602043405, 0.4706323624510939, 0.12...","[0.57008237, 0.5645143, 0.8945138, 0.32616675,...",[Panorama]
...,...,...,...,...,...
10262,[Wissenschaft],"Archäologin: ""Einige Monumente wie der Torboge...","[0.2167395314289024, 0.347467121767113, 0.2140...","[0.7029113, 0.67211354, 0.8243644, 0.1789607, ...",[International]
10263,[Wissenschaft],800 Wissenschafter zu großer Konferenz in Wien...,"[0.19033763801208, 0.2725844602324684, 0.18755...","[0.6597572, 0.59114, 0.71843415, 0.039793275, ...",[Inland]
10264,[Wissenschaft],Vor seinem Untergang befand sich das Schiff de...,"[0.2244192346234817, 0.340721504364432, 0.1929...","[0.7081508, 0.68526673, 0.82491165, 0.01176454...",[Web]
10270,[Wissenschaft],Die zentrale Frage des Projekts: Siedelten Ägy...,"[0.1363486554628859, 0.28363551026801964, 0.18...","[0.63615185, 0.57807386, 0.76437354, 0.0199115...",[Inland]


In [12]:
# Reduce the interpreted predictions to a single numeric score.
# NOTE(review): presumably the fraction of rows whose 'result' matches
# 'categories' -- confirm the metric in ClassificationInterpreter.evaluate_output.
score = ClassificationInterpreter.evaluate_output(interpreted)
score

0.44831613782363244

In [13]:
# Record this run's score via EvaluationHandler.
# The commented-out call below is an optional reset of the stored evaluations;
# it is disabled by default.
# NOTE(review): re-running this cell presumably appends another row each
# time (the log shown further below already has two rows for the same
# session/config) -- confirm whether duplicates are intended.
#EvaluationHandler.clear_evaluations()
EvaluationHandler.add_evaluation(score)

In [14]:
# Load the recorded evaluations and display them.
# Per the output below, each row carries a timestamp, session id, config id
# and score.
evaluations = EvaluationHandler.load_evaluations()
evaluations

Unnamed: 0,timestamp,session id,config id,score
0,2020-02-19 00:46:17,session_001,session_config_0001,0.436052
0,2020-02-19 01:27:46,session_001,session_config_0001,0.448316


In [15]:
# Compare the recorded evaluations and display the result.
# NOTE: despite its name, best_score is displayed below as a one-row table
# (timestamp, session id, config id, score), not a bare number.
# NOTE(review): it appears to be the highest-scoring row of the log --
# confirm the selection rule in EvaluationHandler.compare_evaluations.
best_score = EvaluationHandler.compare_evaluations()
best_score

Unnamed: 0,timestamp,session id,config id,score
0,2020-02-19 01:27:46,session_001,session_config_0001,0.448316
