<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#packages" data-toc-modified-id="packages-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>packages</a></span></li><li><span><a href="#get-clean-data" data-toc-modified-id="get-clean-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>get clean data</a></span><ul class="toc-item"><li><span><a href="#get-raw-data" data-toc-modified-id="get-raw-data-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>get raw data</a></span></li><li><span><a href="#generate-df_liwc_sia" data-toc-modified-id="generate-df_liwc_sia-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>generate df_liwc_sia</a></span></li><li><span><a href="#test" data-toc-modified-id="test-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>test</a></span><ul class="toc-item"><li><span><a href="#impact-of-stopword-removal" data-toc-modified-id="impact-of-stopword-removal-2.3.1"><span class="toc-item-num">2.3.1&nbsp;&nbsp;</span>impact of stopword removal</a></span></li></ul></li><li><span><a href="#note" data-toc-modified-id="note-2.4"><span class="toc-item-num">2.4&nbsp;&nbsp;</span>note</a></span></li></ul></li><li><span><a href="#tokenize-text-data" data-toc-modified-id="tokenize-text-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>tokenize text data</a></span><ul class="toc-item"><li><span><a href="#prepare-data" data-toc-modified-id="prepare-data-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>prepare data</a></span></li><li><span><a href="#tokenize-text-with-gensim" data-toc-modified-id="tokenize-text-with-gensim-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>tokenize text with gensim</a></span></li></ul></li><li><span><a href="#get-bi_weapon_array" data-toc-modified-id="get-bi_weapon_array-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>get <code>bi_weapon_array</code></a></span></li><li><span><a href="#cross-validation-&amp;-grid-search" data-toc-modified-id="cross-validation-&amp;-grid-search-5"><span 
class="toc-item-num">5&nbsp;&nbsp;</span>cross validation &amp; grid search</a></span><ul class="toc-item"><li><span><a href="#grid-search" data-toc-modified-id="grid-search-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>grid search</a></span></li><li><span><a href="#optimal-run" data-toc-modified-id="optimal-run-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>optimal run</a></span></li></ul></li><li><span><a href="#test" data-toc-modified-id="test-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>test</a></span></li></ul></div>

In [1]:
## Pre-setting
# Optional (left disabled): widen the notebook code cells to the full browser width
# from IPython.core.display import display, HTML
# display(HTML("<style>.container { width:100% !important; }</style>"))
# Reload imported modules automatically before each execution, so edits to the
# project's own modules are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook
%matplotlib inline
# Embed fonts as TrueType (Type 42) in saved PDFs so the text stays editable
# (e.g. its font size can be changed) in Acrobat
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42

## packages

In [2]:
## Make the project's `src` directory importable
import os
import sys

# Absolute path of the sibling `src` directory, one level above the working directory
src_dir = os.path.abspath(os.path.join(os.pardir, 'src'))
# Prepend it only when it is missing, so re-running the cell adds no duplicates
if not (src_dir in sys.path):
    sys.path[:0] = [src_dir]

In [3]:
import json
import numpy as np
import gensim as gs
from collections import Counter
import itertools

# from sklearn.cross_validation import train_test_split # for old version sklearn <= 0.17.1
from sklearn.model_selection import train_test_split, KFold # for sklearn >= 0.18

from gensim.models.ldamodel import LdaModel




import pandas as pd
# Show up to 500 columns when a wide DataFrame is displayed
pd.options.display.max_columns = 500

In [4]:
from sklearn.metrics import f1_score

from sklearn.metrics import label_ranking_average_precision_score, label_ranking_loss

from sklearn.ensemble import RandomForestClassifier

In [5]:
from evaluation.evaluation import get_label_via_training

In [6]:
from evaluation.evaluation import get_label_via_training, doc_class_evaluation_fscore, baseline_doc_class_evaluation_fscore

In [7]:
from obtainLDA.obtainLDA import get_ptd_from_lda

In [8]:
from obtainLDA.lda_liwc_sia_learn import run_lda_liwc_sia_rf

## get clean data

### get raw data

In [9]:
# Location of the cleaned data set: <parent of working dir>/data/s2021_lumen_clean_data/<file>
email_6p2_file = 's2021_06_20_01_lumen_clean_doc_sia_liwc_classify.csv'
email_6p2_folder = os.path.join(os.path.abspath(os.pardir), 'data', 's2021_lumen_clean_data')
email_6p2_location = os.path.join(email_6p2_folder, email_6p2_file)

In [10]:
# Load the CSV at `email_6p2_location` into a DataFrame (one row per document)
email_6p2_df = pd.read_csv(email_6p2_location)

In [11]:
# List every column of the loaded table (text, lengths, SIA/LIWC scores, influence labels)
email_6p2_df.columns

Index(['raw_text_id', 'raw_text', 'text_type', 'nostop_stem_doc',
       'nostop_stem_doc_len', 'clean_doc', 'clean_doc_len', 'pos_sia',
       'compound_sia', 'neu_sia', 'neg_sia', 'posemo_liwc', 'negemo_liwc',
       'anx_liwc', 'anger_liwc', 'sad_liwc', 'reward_liwc', 'risk_liwc',
       'time_liwc', 'money_liwc', 'Authority or Expertise/Source Credibility',
       'Blame/guilt', 'Commitment', 'Commitment- Call to Action',
       'Commitment- Indignation', 'Emphasis', 'Gain framing', 'Liking',
       'Loss framing', 'Objectivity', 'Reciprocation',
       'Scarcity/Urgency/Opportunity', 'Social Proof',
       'Social Proof- Admonition', 'Subjectivity'],
      dtype='object')

### generate df_liwc_sia

In [12]:
# Names of all LIWC feature columns available in the table
[col_name for col_name in email_6p2_df.columns if '_liwc' in col_name]

['posemo_liwc',
 'negemo_liwc',
 'anx_liwc',
 'anger_liwc',
 'sad_liwc',
 'reward_liwc',
 'risk_liwc',
 'time_liwc',
 'money_liwc']

In [13]:
# Names of all SIA (sentiment) feature columns available in the table
[col_name for col_name in email_6p2_df.columns if '_sia' in col_name]

['pos_sia', 'compound_sia', 'neu_sia', 'neg_sia']

In [14]:
# LIWC categories kept as features ('posemo_liwc' and 'negemo_liwc' are left out)
liwc_list = [
    'anx_liwc',
    'anger_liwc',
    'sad_liwc',
    'reward_liwc',
    'risk_liwc',
    'time_liwc',
    'money_liwc',
]
len(liwc_list)

7

In [15]:
# SIA scores kept as features ('compound_sia' and 'neu_sia' are left out)
sia_list = [
    'pos_sia',
    'neg_sia',
]
len(sia_list)

2

In [16]:
# Full feature-column list: the LIWC columns first, then the SIA columns
liwc_sia_list = [*liwc_list, *sia_list]
liwc_sia_list, len(liwc_sia_list)

(['anx_liwc',
  'anger_liwc',
  'sad_liwc',
  'reward_liwc',
  'risk_liwc',
  'time_liwc',
  'money_liwc',
  'pos_sia',
  'neg_sia'],
 9)

In [17]:
# Convert the raw LIWC counts into percentages of the stop-word-free, stemmed
# document length (`nostop_stem_doc_len`).
# The raw counts are re-read from the CSV instead of being taken from
# `email_6p2_df`: the previous in-place `col = col / len * 100` divided the
# already-normalised columns again every time the cell was re-run.
# Assumes `email_6p2_df` still has the row order and index it had when it was
# loaded from `email_6p2_location`.
liwc_raw_count_df = pd.read_csv(email_6p2_location, usecols=liwc_list)
for tmp_liwc in liwc_list:
    email_6p2_df[tmp_liwc] = liwc_raw_count_df[tmp_liwc] / email_6p2_df.nostop_stem_doc_len * 100

In [18]:
# Preview the first rows; the columns in `liwc_list` now hold percentages instead of counts
email_6p2_df.head()

Unnamed: 0,raw_text_id,raw_text,text_type,nostop_stem_doc,nostop_stem_doc_len,clean_doc,clean_doc_len,pos_sia,compound_sia,neu_sia,neg_sia,posemo_liwc,negemo_liwc,anx_liwc,anger_liwc,sad_liwc,reward_liwc,risk_liwc,time_liwc,money_liwc,Authority or Expertise/Source Credibility,Blame/guilt,Commitment,Commitment- Call to Action,Commitment- Indignation,Emphasis,Gain framing,Liking,Loss framing,Objectivity,Reciprocation,Scarcity/Urgency/Opportunity,Social Proof,Social Proof- Admonition,Subjectivity
0,0,"""A Baker Swept By,"" by Edward Hirsch Audio: Re...",news left,baker swept edward hirsch audio read author al...,69,a baker swept by by edward hirsch audio read b...,137,0.075,0.7506,0.905,0.02,4,1,0.0,0.0,1.449275,0.0,1.449275,20.289855,0.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
1,1,"""Get Out"" Won’t Have A 100% Rating On Rotten T...",news left,get rate rotten tomato ever fact jordan peel g...,99,get out won t have a rating on rotten tomatoes...,194,0.115,-0.5607,0.772,0.113,6,6,0.0,0.0,0.0,7.070707,1.010101,10.10101,0.0,1,1,1,0,0,0,0,1,0,1,0,0,0,0,1
2,3,"""Know Your Rights or Your Safety Is At Risk In...",russian ad,know right safeti risk interact polic shock vi...,76,know your rights or your safety is at risk in ...,146,0.068,-0.8751,0.79,0.142,4,2,1.315789,0.0,0.0,2.631579,5.263158,3.947368,1.315789,0,1,1,1,1,1,0,0,0,1,0,0,1,1,1
3,4,"""Nancy Pelosi was drunk again today,"" begins a...",fake news,nanci pelosi drunk today begin post recent sha...,90,nancy pelosi was drunk again today begins a po...,172,0.078,-0.1027,0.831,0.092,6,1,0.0,0.0,0.0,5.555556,0.0,12.222222,1.111111,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1
4,5,"""Obama out"": POTUS ends speech with viral mic ...",news left,obama potu end speech viral mic drop presid ba...,58,obama out potus ends speech with viral mic dro...,105,0.047,0.1779,0.913,0.04,2,1,0.0,0.0,0.0,1.724138,0.0,10.344828,0.0,1,0,1,0,0,0,0,1,0,1,0,0,1,0,1


In [19]:
# Keep only the selected LIWC and SIA feature columns (all rows)
df_liwc_sia = email_6p2_df.loc[:, liwc_sia_list]

In [20]:
# Sanity check: (number of documents, number of selected features), then a preview
print(df_liwc_sia.shape)
df_liwc_sia.head()

(2771, 9)


Unnamed: 0,anx_liwc,anger_liwc,sad_liwc,reward_liwc,risk_liwc,time_liwc,money_liwc,pos_sia,neg_sia
0,0.0,0.0,1.449275,0.0,1.449275,20.289855,0.0,0.075,0.02
1,0.0,0.0,0.0,7.070707,1.010101,10.10101,0.0,0.115,0.113
2,1.315789,0.0,0.0,2.631579,5.263158,3.947368,1.315789,0.068,0.142
3,0.0,0.0,0.0,5.555556,0.0,12.222222,1.111111,0.078,0.092
4,0.0,0.0,0.0,1.724138,0.0,10.344828,0.0,0.047,0.04


In [21]:
# Feature matrix as a plain NumPy array: one row per document, one column per LIWC/SIA feature
liwc_sia_array = df_liwc_sia.values

In [22]:
# Should match the shape of `df_liwc_sia`
liwc_sia_array.shape

(2771, 9)

### test

In [23]:
# Shape of the full table (documents, columns), followed by a preview of its first rows
print(email_6p2_df.shape)
email_6p2_df.head()

(2771, 35)


Unnamed: 0,raw_text_id,raw_text,text_type,nostop_stem_doc,nostop_stem_doc_len,clean_doc,clean_doc_len,pos_sia,compound_sia,neu_sia,neg_sia,posemo_liwc,negemo_liwc,anx_liwc,anger_liwc,sad_liwc,reward_liwc,risk_liwc,time_liwc,money_liwc,Authority or Expertise/Source Credibility,Blame/guilt,Commitment,Commitment- Call to Action,Commitment- Indignation,Emphasis,Gain framing,Liking,Loss framing,Objectivity,Reciprocation,Scarcity/Urgency/Opportunity,Social Proof,Social Proof- Admonition,Subjectivity
0,0,"""A Baker Swept By,"" by Edward Hirsch Audio: Re...",news left,baker swept edward hirsch audio read author al...,69,a baker swept by by edward hirsch audio read b...,137,0.075,0.7506,0.905,0.02,4,1,0.0,0.0,1.449275,0.0,1.449275,20.289855,0.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
1,1,"""Get Out"" Won’t Have A 100% Rating On Rotten T...",news left,get rate rotten tomato ever fact jordan peel g...,99,get out won t have a rating on rotten tomatoes...,194,0.115,-0.5607,0.772,0.113,6,6,0.0,0.0,0.0,7.070707,1.010101,10.10101,0.0,1,1,1,0,0,0,0,1,0,1,0,0,0,0,1
2,3,"""Know Your Rights or Your Safety Is At Risk In...",russian ad,know right safeti risk interact polic shock vi...,76,know your rights or your safety is at risk in ...,146,0.068,-0.8751,0.79,0.142,4,2,1.315789,0.0,0.0,2.631579,5.263158,3.947368,1.315789,0,1,1,1,1,1,0,0,0,1,0,0,1,1,1
3,4,"""Nancy Pelosi was drunk again today,"" begins a...",fake news,nanci pelosi drunk today begin post recent sha...,90,nancy pelosi was drunk again today begins a po...,172,0.078,-0.1027,0.831,0.092,6,1,0.0,0.0,0.0,5.555556,0.0,12.222222,1.111111,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1
4,5,"""Obama out"": POTUS ends speech with viral mic ...",news left,obama potu end speech viral mic drop presid ba...,58,obama out potus ends speech with viral mic dro...,105,0.047,0.1779,0.913,0.04,2,1,0.0,0.0,0.0,1.724138,0.0,10.344828,0.0,1,0,1,0,0,0,0,1,0,1,0,0,1,0,1


In [24]:
# Column names of the full table
email_6p2_df.columns

Index(['raw_text_id', 'raw_text', 'text_type', 'nostop_stem_doc',
       'nostop_stem_doc_len', 'clean_doc', 'clean_doc_len', 'pos_sia',
       'compound_sia', 'neu_sia', 'neg_sia', 'posemo_liwc', 'negemo_liwc',
       'anx_liwc', 'anger_liwc', 'sad_liwc', 'reward_liwc', 'risk_liwc',
       'time_liwc', 'money_liwc', 'Authority or Expertise/Source Credibility',
       'Blame/guilt', 'Commitment', 'Commitment- Call to Action',
       'Commitment- Indignation', 'Emphasis', 'Gain framing', 'Liking',
       'Loss framing', 'Objectivity', 'Reciprocation',
       'Scarcity/Urgency/Opportunity', 'Social Proof',
       'Social Proof- Admonition', 'Subjectivity'],
      dtype='object')

In [25]:
# All LIWC feature columns present in the table
[col_name for col_name in email_6p2_df.columns if '_liwc' in col_name]

['posemo_liwc',
 'negemo_liwc',
 'anx_liwc',
 'anger_liwc',
 'sad_liwc',
 'reward_liwc',
 'risk_liwc',
 'time_liwc',
 'money_liwc']

#### impact of stopword removal

In [26]:
# Whitespace-tokenise every stop-word-free, stemmed document
doc_list_nostop_stem = [doc.split() for doc in email_6p2_df['nostop_stem_doc']]

In [27]:
# Flatten the per-document token lists into a single list of tokens
all_word_list_nostop_stem = list(itertools.chain.from_iterable(doc_list_nostop_stem))

In [28]:
# (total number of tokens, vocabulary size) for the stop-word-free, stemmed text
len(all_word_list_nostop_stem), len(set(all_word_list_nostop_stem))

(183442, 14938)

In [29]:
# Whitespace-tokenise every cleaned document (stop words kept, no stemming)
doc_list_stop_nostem = [doc.split() for doc in email_6p2_df['clean_doc']]

In [30]:

# Flatten the per-document token lists into a single list of tokens
all_word_list_stop_nostem = list(itertools.chain.from_iterable(doc_list_stop_nostem))

In [31]:
# (total number of tokens, vocabulary size) for the cleaned text with stop words kept
len(all_word_list_stop_nostem), len(set(all_word_list_stop_nostem))

(316088, 21328)

In [32]:
# Relative size of the cleaned text compared with the stop-word-free, stemmed text,
# as fractions: (token-count ratio - 1, vocabulary-size ratio - 1)
(
    len(all_word_list_stop_nostem) / len(all_word_list_nostop_stem) - 1,
    len(set(all_word_list_stop_nostem)) / len(set(all_word_list_nostop_stem)) - 1,
)

(0.7230950382137133, 0.4277681081804794)

In [33]:
# Number of tokens in each stop-word-free, stemmed document
doc_len_list_nostop_stem = list(map(len, doc_list_nostop_stem))

In [34]:
# Number of tokens in each cleaned document (stop words kept, no stemming)
doc_len_list_stop_nostem = list(map(len, doc_list_stop_nostem))

In [35]:
# Document lengths as a NumPy array so that summary statistics can be computed
doc_len_nostop_stem_array = np.array(doc_len_list_nostop_stem)

In [36]:
# Total token count; should equal len(all_word_list_nostop_stem)
doc_len_nostop_stem_array.sum()

183442

In [37]:
# (mean, median) document length after stop-word removal and stemming
doc_len_nostop_stem_array.mean(), np.median(doc_len_nostop_stem_array)

(66.20064958498737, 66.0)

In [38]:
# Lengths of the cleaned documents (stop words kept) as a NumPy array
doc_len_stop_nostem_array = np.array(doc_len_list_stop_nostem)

In [39]:
# (mean, median) document length with stop words kept and no stemming
doc_len_stop_nostem_array.mean(), np.median(doc_len_stop_nostem_array)

(114.07001082641645, 113.0)

### note

- there is no short pre-processed doc
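- there are 2,771 documents in total (`email_6p2_df.shape` above is `(2771, 35)`; the figure of 1,763 emails previously stated here does not match the loaded data)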

## tokenize text data

### prepare data

In [40]:
# Token lists used for modelling: the stop-word-free, stemmed text split on whitespace
doc_list = [stemmed_doc.split() for stemmed_doc in email_6p2_df['nostop_stem_doc']]

In [41]:
# Number of documents; should equal the number of rows in `email_6p2_df`
len(doc_list)

2771

In [42]:
# Peek at the tokens of the first three documents
doc_list[:3]

[['baker',
  'swept',
  'edward',
  'hirsch',
  'audio',
  'read',
  'author',
  'alreadi',
  'lose',
  'eyesight',
  'last',
  'winter',
  'rome',
  'paus',
  'doorway',
  'nine',
  'clock',
  'saturday',
  'morn',
  'baker',
  'swept',
  'shini',
  'bicycl',
  'wave',
  'cap',
  'sing',
  'breath',
  'know',
  'baker',
  'wore',
  'white',
  'apron',
  'dust',
  'flour',
  'float',
  'around',
  'citi',
  'like',
  'angel',
  'freshli',
  'bake',
  'day',
  'sure',
  'morn',
  'halt',
  'street',
  'stood',
  'doorway',
  'baker',
  'wing',
  'weekend',
  'morn',
  'new',
  'pristin',
  'look',
  'sky',
  'one',
  'undiminish',
  'instant',
  'misplac',
  'time',
  'saw',
  'bright',
  'bright',
  'everywher',
  'shadow',
  'cross',
  'rooftop',
  'blot'],
 ['get',
  'rate',
  'rotten',
  'tomato',
  'ever',
  'fact',
  'jordan',
  'peel',
  'get',
  'one',
  'film',
  'thriller',
  'sit',
  'impress',
  'fresh',
  'rotten',
  'tomato',
  'one',
  'point',
  'film',
  'held',
  'stea

### tokenize text with gensim

In [43]:
# Build the gensim dictionary (token <-> integer id mapping) from all tokenised documents
dictionary = gs.corpora.Dictionary(doc_list)

In [44]:
# Vocabulary size; should equal len(set(all_word_list_nostop_stem))
len(dictionary.keys())

14938

In [45]:
# Bag-of-words representation of every document
corpus = list(map(dictionary.doc2bow, doc_list))

In [46]:
# One bag-of-words entry per document
len(corpus)

2771

## get `bi_weapon_array`

In [47]:
# The last 15 columns of the table are the influence-label columns
email_6p2_df.columns[-15:].tolist()

['Authority or Expertise/Source Credibility',
 'Blame/guilt',
 'Commitment',
 'Commitment- Call to Action',
 'Commitment- Indignation',
 'Emphasis',
 'Gain framing',
 'Liking',
 'Loss framing',
 'Objectivity',
 'Reciprocation',
 'Scarcity/Urgency/Opportunity',
 'Social Proof',
 'Social Proof- Admonition',
 'Subjectivity']

In [48]:
# Scratch list of candidate influence labels.
# NOTE: each line below is its own expression statement (a one-element tuple),
# so the notebook displays only the last one, ('Emphasis',), and not all seven labels.
'Authority or Expertise/Source Credibility',
'Commitment',
'Commitment- Call to Action',
'Subjectivity',
'Gain framing',
'Blame/guilt',
'Emphasis',


('Emphasis',)

In [49]:
# Alternative candidate label set, written on one line so the whole tuple is displayed
'Authority or Expertise/Source Credibility', 'Commitment', 'Liking', 'Social Proof', 'Scarcity/Urgency/Opportunity', 'Gain framing', 'Loss framing',

('Authority or Expertise/Source Credibility',
 'Commitment',
 'Liking',
 'Social Proof',
 'Scarcity/Urgency/Opportunity',
 'Gain framing',
 'Loss framing')

In [50]:
# 2021-06-30
# Influence-label columns selected for this experiment
influence_list = [
    'Authority or Expertise/Source Credibility', 'Commitment', 'Commitment- Call to Action',
    'Subjectivity', 'Gain framing', 'Blame/guilt', 'Emphasis',
]

influence_list

['Authority or Expertise/Source Credibility',
 'Commitment',
 'Commitment- Call to Action',
 'Subjectivity',
 'Gain framing',
 'Blame/guilt',
 'Emphasis']

In [51]:
# Integer label matrix: one row per document, one column per label in `influence_list`
bi_weapon_array = email_6p2_df[influence_list].values.astype(int)

In [52]:
# Sanity check: (number of documents, number of selected labels), then the matrix itself
print(bi_weapon_array.shape)
bi_weapon_array

(2771, 7)


array([[0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 1, 0],
       [0, 1, 1, ..., 0, 1, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [1, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 1]])

In [53]:
# Label vector of the first document, in the order of `influence_list`
bi_weapon_array[0]

array([0, 0, 0, 1, 0, 0, 0])

## cross validation & grid search

### grid search

In [54]:
# Hyper-parameter grid: values tried for `n_estimators` and for `num_topics`
n_estimators_list = [50, 100, 200]
num_topics_list = [10, 50, 100]

In [55]:
%%time

max_f1_score_micro = 0
optimal_parameter_dic = {}

for num_topics, n_estimators in itertools.product(num_topics_list, n_estimators_list):
    print(num_topics, n_estimators)
    
    df_result_f1_score_save = run_lda_liwc_sia_rf(doc_list, liwc_sia_array, bi_weapon_array, num_topics=num_topics, n_estimators=n_estimators)
    
    print(df_result_f1_score_save.mean())
    
    tmp_f1_score_micro = df_result_f1_score_save.mean().f1_score_micro
    
    if tmp_f1_score_micro > max_f1_score_micro * 1.001:
        max_f1_score_micro = tmp_f1_score_micro
        optimal_parameter_dic['num_topics'] = num_topics
        optimal_parameter_dic['n_estimators'] = n_estimators
        


10 50
len of dictionary.keys:  13352


  doc_array = np.array(doc_list)


len of dictionary.keys:  13305
len of dictionary.keys:  13337
len of dictionary.keys:  13484
len of dictionary.keys:  13418
acc_score         0.723153
f1_score_macro    0.580004
f1_score_micro    0.686855
dtype: float64
10 100
len of dictionary.keys:  13419


  doc_array = np.array(doc_list)


len of dictionary.keys:  13453
len of dictionary.keys:  13487
len of dictionary.keys:  13199
len of dictionary.keys:  13403
acc_score         0.720519
f1_score_macro    0.578404
f1_score_micro    0.686856
dtype: float64
10 200


  doc_array = np.array(doc_list)


len of dictionary.keys:  13394
len of dictionary.keys:  13332
len of dictionary.keys:  13363
len of dictionary.keys:  13408
len of dictionary.keys:  13398
acc_score         0.722431
f1_score_macro    0.583675
f1_score_micro    0.690359
dtype: float64
50 50
len of dictionary.keys:  13370


  doc_array = np.array(doc_list)


len of dictionary.keys:  13420
len of dictionary.keys:  13445
len of dictionary.keys:  13352
len of dictionary.keys:  13328
acc_score         0.717484
f1_score_macro    0.567411
f1_score_micro    0.681112
dtype: float64
50 100


  doc_array = np.array(doc_list)


len of dictionary.keys:  13284
len of dictionary.keys:  13539
len of dictionary.keys:  13284
len of dictionary.keys:  13409
len of dictionary.keys:  13426
acc_score         0.721867
f1_score_macro    0.571428
f1_score_micro    0.686420
dtype: float64
50 200
len of dictionary.keys:  13313


  doc_array = np.array(doc_list)


len of dictionary.keys:  13456
len of dictionary.keys:  13336
len of dictionary.keys:  13354
len of dictionary.keys:  13465
acc_score         0.722067
f1_score_macro    0.569449
f1_score_micro    0.686604
dtype: float64
100 50
len of dictionary.keys:  13371


  doc_array = np.array(doc_list)


len of dictionary.keys:  13483
len of dictionary.keys:  13366
len of dictionary.keys:  13286
len of dictionary.keys:  13419
acc_score         0.715006
f1_score_macro    0.561255
f1_score_micro    0.675798
dtype: float64
100 100
len of dictionary.keys:  13435


  doc_array = np.array(doc_list)


len of dictionary.keys:  13269
len of dictionary.keys:  13472
len of dictionary.keys:  13336
len of dictionary.keys:  13388
acc_score         0.719286
f1_score_macro    0.565551
f1_score_micro    0.681581
dtype: float64
100 200
len of dictionary.keys:  13330


  doc_array = np.array(doc_list)


len of dictionary.keys:  13532
len of dictionary.keys:  13258
len of dictionary.keys:  13376
len of dictionary.keys:  13445
acc_score         0.716914
f1_score_macro    0.561294
f1_score_micro    0.680062
dtype: float64
CPU times: user 8min 7s, sys: 41.3 s, total: 8min 49s
Wall time: 6min 6s


### optimal run

In [56]:
# Best mean micro-averaged F1 found by the grid search
max_f1_score_micro

0.6903589752973892

In [57]:
# Hyper-parameter combination that achieved `max_f1_score_micro`
optimal_parameter_dic

{'num_topics': 10, 'n_estimators': 200}

In [58]:
# Pull the winning hyper-parameters out of the grid-search result
num_topics, n_estimators = optimal_parameter_dic['num_topics'], optimal_parameter_dic['n_estimators']

In [59]:
# Re-run the full pipeline with the selected hyper-parameters.
# NOTE(review): this is a fresh run, not the one scored during the grid search;
# the recorded mean micro-F1 differs slightly between the two, so the pipeline
# is presumably not seeded - TODO confirm in `run_lda_liwc_sia_rf`.
df_result_f1_score_save = run_lda_liwc_sia_rf(doc_list, liwc_sia_array, bi_weapon_array, num_topics=num_topics, n_estimators=n_estimators)

  doc_array = np.array(doc_list)


len of dictionary.keys:  13347
len of dictionary.keys:  13323
len of dictionary.keys:  13358
len of dictionary.keys:  13447
len of dictionary.keys:  13453


In [60]:
# Scores of the optimal run (presumably one row per cross-validation fold - TODO confirm)
df_result_f1_score_save

Unnamed: 0,acc_score,f1_score_macro,f1_score_micro
0,0.727156,0.595442,0.697143
1,0.723053,0.5818,0.69244
2,0.723053,0.586928,0.695751
3,0.72228,0.570969,0.686828
4,0.726147,0.580104,0.689836


In [61]:
# Column means of the optimal run: accuracy, macro-F1 and micro-F1
df_result_f1_score_save.mean()

acc_score         0.724338
f1_score_macro    0.583049
f1_score_micro    0.692399
dtype: float64

In [62]:
%%time
# Audible signal that the notebook has finished running.
# `beepy` is a third-party package that is only needed by this cell.
import beepy
beepy.beep(1)

CPU times: user 27.8 ms, sys: 20.6 ms, total: 48.4 ms
Wall time: 718 ms


## test