<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#packages" data-toc-modified-id="packages-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>packages</a></span></li><li><span><a href="#get-clean-data" data-toc-modified-id="get-clean-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>get clean data</a></span><ul class="toc-item"><li><span><a href="#get-raw-data" data-toc-modified-id="get-raw-data-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>get raw data</a></span></li><li><span><a href="#generate-df_liwc_sia" data-toc-modified-id="generate-df_liwc_sia-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>generate df_liwc_sia</a></span></li></ul></li><li><span><a href="#tokenize-text-data" data-toc-modified-id="tokenize-text-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>tokenize text data</a></span><ul class="toc-item"><li><span><a href="#prepare-data" data-toc-modified-id="prepare-data-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>prepare data</a></span></li><li><span><a href="#tokenize-text-with-gensim" data-toc-modified-id="tokenize-text-with-gensim-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>tokenize text with gensim</a></span></li></ul></li><li><span><a href="#get-bi_weapon_array" data-toc-modified-id="get-bi_weapon_array-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>get <code>bi_weapon_array</code></a></span></li><li><span><a href="#cross-validation-&amp;-grid-search" data-toc-modified-id="cross-validation-&amp;-grid-search-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>cross validation &amp; grid search</a></span><ul class="toc-item"><li><span><a href="#optimal-run:-train-the-model-with-all-documents" data-toc-modified-id="optimal-run:-train-the-model-with-all-documents-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>optimal run: train the model with all documents</a></span></li><li><span><a href="#get-feature-importance" data-toc-modified-id="get-feature-importance-5.2"><span 
class="toc-item-num">5.2&nbsp;&nbsp;</span>get feature importance</a></span></li><li><span><a href="#pick-some-topics" data-toc-modified-id="pick-some-topics-5.3"><span class="toc-item-num">5.3&nbsp;&nbsp;</span>pick some topics</a></span></li></ul></li><li><span><a href="#test" data-toc-modified-id="test-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>test</a></span></li></ul></div>

In [1]:
## Pre-setting
# (disabled) stretch the notebook code cells to the full browser width
# from IPython.core.display import display, HTML
# display(HTML("<style>.container { width:100% !important; }</style>"))
# autoreload mode 2: re-import all modules before each cell executes, so that
# edits to imported project modules are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2
# render matplotlib figures inline, below the cell that produces them
%matplotlib inline
# Embed fonts in saved PDFs as TrueType (Type 42) instead of the default Type 3,
# so that text in exported figures stays editable in Acrobat/Illustrator
# (this does not change the font size itself)
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42

## packages

In [2]:
## Add path
import os
import sys

# Put the sibling ``src`` directory (one level above the working directory) at
# the front of the import path, unless it is already registered there.
src_dir = os.path.abspath(os.path.join(os.pardir, 'src'))
if sys.path.count(src_dir) == 0:
    sys.path.insert(0, src_dir)

In [3]:
import json
import numpy as np
import gensim as gs
from collections import Counter
import itertools

# from sklearn.cross_validation import train_test_split # for old version sklearn <= 0.17.1
from sklearn.model_selection import train_test_split, KFold # for sklearn >= 0.18

from gensim.models.ldamodel import LdaModel




import pandas as pd
pd.options.display.max_columns = 500

In [4]:
from sklearn.metrics import f1_score

from sklearn.metrics import label_ranking_average_precision_score, label_ranking_loss

from sklearn.ensemble import RandomForestClassifier

In [5]:
from evaluation.evaluation import get_label_via_training

In [6]:
from evaluation.evaluation import get_label_via_training, doc_class_evaluation_fscore, baseline_doc_class_evaluation_fscore

In [7]:
from obtainLDA.obtainLDA import get_ptd_from_lda

In [8]:
from obtainLDA.lda_liwc_sia_learn import run_lda_liwc_sia_rf

## get clean data

### get raw data

In [9]:
# Absolute location of the cleaned Lumen CSV (documents plus SIA / LIWC scores
# and the influence labels), resolved relative to the parent directory.
email_6p2_file = 's2021_06_20_01_lumen_clean_doc_sia_liwc_classify.csv'
email_6p2_folder = os.path.abspath(
    os.path.join(os.pardir, 'data', 's2021_lumen_clean_data')
)
email_6p2_location = os.path.join(email_6p2_folder, email_6p2_file)

In [10]:
email_6p2_df = pd.read_csv(email_6p2_location)

### generate df_liwc_sia

In [11]:
[i for i in email_6p2_df.columns if '_liwc' in i]

['posemo_liwc',
 'negemo_liwc',
 'anx_liwc',
 'anger_liwc',
 'sad_liwc',
 'reward_liwc',
 'risk_liwc',
 'time_liwc',
 'money_liwc']

In [12]:
[i for i in email_6p2_df.columns if '_sia' in i]

['pos_sia', 'compound_sia', 'neu_sia', 'neg_sia']

In [13]:
# LIWC columns used as model features
# (posemo_liwc and negemo_liwc exist in the frame but are not included here)
liwc_list = [
    'anx_liwc',
    'anger_liwc',
    'sad_liwc',
    'reward_liwc',
    'risk_liwc',
    'time_liwc',
    'money_liwc',
]
len(liwc_list)

7

In [14]:
# Sentiment (SIA) columns used as model features
# (compound_sia and neu_sia exist in the frame but are not included here)
sia_list = [
    'pos_sia',
    'neg_sia',
]
len(sia_list)

2

In [15]:
# All hand-crafted feature columns: LIWC columns first, then the SIA columns
liwc_sia_list = [*liwc_list, *sia_list]
liwc_sia_list, len(liwc_sia_list)

(['anx_liwc',
  'anger_liwc',
  'sad_liwc',
  'reward_liwc',
  'risk_liwc',
  'time_liwc',
  'money_liwc',
  'pos_sia',
  'neg_sia'],
 9)

In [16]:
# Rescale every LIWC column to a percentage of the stemmed, stop-word-free
# document length (value / nostop_stem_doc_len * 100).
#
# The columns are overwritten in place, so the cell must not be applied twice:
# a flag in ``DataFrame.attrs`` records that the rescaling already happened and
# makes re-running this cell a no-op. Re-reading the CSV creates a fresh frame
# without the flag, so the normalisation is applied again as expected.
if not email_6p2_df.attrs.get('liwc_normalized', False):
    email_6p2_df[liwc_list] = (
        email_6p2_df[liwc_list].div(email_6p2_df['nostop_stem_doc_len'], axis=0) * 100
    )
    email_6p2_df.attrs['liwc_normalized'] = True

In [17]:
email_6p2_df.head()

Unnamed: 0,raw_text_id,raw_text,text_type,nostop_stem_doc,nostop_stem_doc_len,clean_doc,clean_doc_len,pos_sia,compound_sia,neu_sia,neg_sia,posemo_liwc,negemo_liwc,anx_liwc,anger_liwc,sad_liwc,reward_liwc,risk_liwc,time_liwc,money_liwc,Authority or Expertise/Source Credibility,Blame/guilt,Commitment,Commitment- Call to Action,Commitment- Indignation,Emphasis,Gain framing,Liking,Loss framing,Objectivity,Reciprocation,Scarcity/Urgency/Opportunity,Social Proof,Social Proof- Admonition,Subjectivity
0,0,"""A Baker Swept By,"" by Edward Hirsch Audio: Re...",news left,baker swept edward hirsch audio read author al...,69,a baker swept by by edward hirsch audio read b...,137,0.075,0.7506,0.905,0.02,4,1,0.0,0.0,1.449275,0.0,1.449275,20.289855,0.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
1,1,"""Get Out"" Won’t Have A 100% Rating On Rotten T...",news left,get rate rotten tomato ever fact jordan peel g...,99,get out won t have a rating on rotten tomatoes...,194,0.115,-0.5607,0.772,0.113,6,6,0.0,0.0,0.0,7.070707,1.010101,10.10101,0.0,1,1,1,0,0,0,0,1,0,1,0,0,0,0,1
2,3,"""Know Your Rights or Your Safety Is At Risk In...",russian ad,know right safeti risk interact polic shock vi...,76,know your rights or your safety is at risk in ...,146,0.068,-0.8751,0.79,0.142,4,2,1.315789,0.0,0.0,2.631579,5.263158,3.947368,1.315789,0,1,1,1,1,1,0,0,0,1,0,0,1,1,1
3,4,"""Nancy Pelosi was drunk again today,"" begins a...",fake news,nanci pelosi drunk today begin post recent sha...,90,nancy pelosi was drunk again today begins a po...,172,0.078,-0.1027,0.831,0.092,6,1,0.0,0.0,0.0,5.555556,0.0,12.222222,1.111111,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1
4,5,"""Obama out"": POTUS ends speech with viral mic ...",news left,obama potu end speech viral mic drop presid ba...,58,obama out potus ends speech with viral mic dro...,105,0.047,0.1779,0.913,0.04,2,1,0.0,0.0,0.0,1.724138,0.0,10.344828,0.0,1,0,1,0,0,0,0,1,0,1,0,0,1,0,1


In [18]:
df_liwc_sia = email_6p2_df[liwc_sia_list]

In [19]:
print(df_liwc_sia.shape)
df_liwc_sia.head()

(2771, 9)


Unnamed: 0,anx_liwc,anger_liwc,sad_liwc,reward_liwc,risk_liwc,time_liwc,money_liwc,pos_sia,neg_sia
0,0.0,0.0,1.449275,0.0,1.449275,20.289855,0.0,0.075,0.02
1,0.0,0.0,0.0,7.070707,1.010101,10.10101,0.0,0.115,0.113
2,1.315789,0.0,0.0,2.631579,5.263158,3.947368,1.315789,0.068,0.142
3,0.0,0.0,0.0,5.555556,0.0,12.222222,1.111111,0.078,0.092
4,0.0,0.0,0.0,1.724138,0.0,10.344828,0.0,0.047,0.04


In [20]:
liwc_sia_array = df_liwc_sia.values

In [21]:
liwc_sia_array.shape

(2771, 9)

In [22]:
liwc_sia_array[0]

array([0.00000000e+00, 0.00000000e+00, 1.44927536e+00, 0.00000000e+00,
       1.44927536e+00, 2.02898551e+01, 0.00000000e+00, 7.50000000e-02,
       2.00000000e-02])

## tokenize text data

### prepare data

In [23]:
# Whitespace-tokenise every pre-cleaned (stop-word-free, stemmed) document
doc_list = [stemmed_doc.split() for stemmed_doc in email_6p2_df['nostop_stem_doc']]
len(doc_list)

2771

In [24]:
# Token count of each document, in corpus order
doc_list_lens = list(map(len, doc_list))
len(doc_list_lens), doc_list_lens[:10]

(2771, [69, 99, 76, 90, 58, 41, 32, 110, 26, 19])

In [25]:
sum(doc_list_lens)

183442

In [26]:
# Flatten the per-document token lists into one corpus-wide token list
all_tokens = list(itertools.chain.from_iterable(doc_list))

In [27]:
len(all_tokens), len(set(all_tokens))

(183442, 14938)

In [28]:
# Sanity check on the first 500 documents: print every document that still
# contains the token 'font'.
separator_line = '*' * 40
for doc_tokens in doc_list[:500]:
    if 'font' not in doc_tokens:
        continue
    print(separator_line)
    print(' '.join(doc_tokens))

### tokenize text with gensim

In [29]:
dictionary = gs.corpora.Dictionary(doc_list)

In [30]:
len(dictionary.keys())

14938

In [31]:
corpus = [dictionary.doc2bow(text) for text in doc_list]

In [32]:
len(corpus)

2771

## get `bi_weapon_array`

In [33]:
# 2021-06-30
influence_list = [
    'Authority or Expertise/Source Credibility',
    'Commitment',
    'Commitment- Call to Action',
    'Subjectivity',
    'Gain framing',
    'Blame/guilt',
    'Emphasis',
]

influence_list

['Authority or Expertise/Source Credibility',
 'Commitment',
 'Commitment- Call to Action',
 'Subjectivity',
 'Gain framing',
 'Blame/guilt',
 'Emphasis']

In [34]:
bi_weapon_array = email_6p2_df[influence_list].values.astype(int)

In [35]:
print(bi_weapon_array.shape)
bi_weapon_array

(2771, 7)


array([[0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 1, 0],
       [0, 1, 1, ..., 0, 1, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [1, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 1]])

In [36]:
bi_weapon_array[0]

array([0, 0, 0, 1, 0, 0, 0])

## cross validation & grid search

In [37]:
len(doc_list)

2771

In [38]:
doc_list[1]

['get',
 'rate',
 'rotten',
 'tomato',
 'ever',
 'fact',
 'jordan',
 'peel',
 'get',
 'one',
 'film',
 'thriller',
 'sit',
 'impress',
 'fresh',
 'rotten',
 'tomato',
 'one',
 'point',
 'film',
 'held',
 'steadi',
 'fresh',
 'score',
 'came',
 'halt',
 'one',
 'review',
 'rip',
 'project',
 'nation',
 'review',
 'armond',
 'white',
 'describ',
 'get',
 'headlin',
 'review',
 'return',
 'movi',
 'get',
 'actor',
 'lakeith',
 'stanfield',
 'definit',
 'hide',
 'thought',
 'white',
 'review',
 'twitter',
 'movi',
 'still',
 'sit',
 'fresh',
 'rotten',
 'tomato',
 'even',
 'though',
 'ton',
 'glow',
 'review',
 'sinc',
 'come',
 'fresh',
 'review',
 'one',
 'rotten',
 'one',
 'never',
 'get',
 'back',
 'round',
 'nearest',
 'whole',
 'number',
 'two',
 'except',
 'repres',
 'rotten',
 'tomato',
 'told',
 'buzzfe',
 'news',
 'round',
 'round',
 'reserv',
 'absolut',
 'movi',
 'everi',
 'singl',
 'review',
 'fresh',
 'everi',
 'singl',
 'review',
 'rotten',
 'armond',
 'white',
 'say']

In [39]:
liwc_sia_array.shape

(2771, 9)

In [40]:
bi_weapon_array.shape

(2771, 7)

### optimal run: train the model with all documents

In [41]:
# Hyper-parameters of the final ("optimal") run
num_topics = 10  # number of LDA topics, i.e. number of topic feature columns T00..T09
n_estimators = 200  # number of trees in the random forest

In [42]:
# Build a 1-D object array whose elements are the per-document token lists.
# The documents have different lengths, so ``np.array(doc_list)`` would have to
# guess at a ragged layout: older NumPy emits a VisibleDeprecationWarning and
# NumPy >= 1.24 raises ValueError. Allocating the object array explicitly and
# filling it element by element always yields shape (n_docs,), even if all
# documents happened to have the same length.
doc_array = np.empty(len(doc_list), dtype=object)
for doc_idx, doc_tokens in enumerate(doc_list):
    doc_array[doc_idx] = doc_tokens
doc_array.shape

  doc_array = np.array(doc_list)


(2771,)

In [43]:
doc_array[1]

['get',
 'rate',
 'rotten',
 'tomato',
 'ever',
 'fact',
 'jordan',
 'peel',
 'get',
 'one',
 'film',
 'thriller',
 'sit',
 'impress',
 'fresh',
 'rotten',
 'tomato',
 'one',
 'point',
 'film',
 'held',
 'steadi',
 'fresh',
 'score',
 'came',
 'halt',
 'one',
 'review',
 'rip',
 'project',
 'nation',
 'review',
 'armond',
 'white',
 'describ',
 'get',
 'headlin',
 'review',
 'return',
 'movi',
 'get',
 'actor',
 'lakeith',
 'stanfield',
 'definit',
 'hide',
 'thought',
 'white',
 'review',
 'twitter',
 'movi',
 'still',
 'sit',
 'fresh',
 'rotten',
 'tomato',
 'even',
 'though',
 'ton',
 'glow',
 'review',
 'sinc',
 'come',
 'fresh',
 'review',
 'one',
 'rotten',
 'one',
 'never',
 'get',
 'back',
 'round',
 'nearest',
 'whole',
 'number',
 'two',
 'except',
 'repres',
 'rotten',
 'tomato',
 'told',
 'buzzfe',
 'news',
 'round',
 'round',
 'reserv',
 'absolut',
 'movi',
 'everi',
 'singl',
 'review',
 'fresh',
 'everi',
 'singl',
 'review',
 'rotten',
 'armond',
 'white',
 'say']

In [44]:
# No hold-out split in this run: the "train" names are plain aliases of the
# full document, LIWC/SIA feature and label arrays.
doc_array_train, liwc_sia_array_train, bi_weapon_array_train = (
    doc_array,
    liwc_sia_array,
    bi_weapon_array,
)

In [45]:
# Build the token -> integer id mapping from the training documents.
# (The former ``dictionary = 0`` placeholder was a dead store that briefly
# rebound the name to an int; it has been dropped.)
dictionary = gs.corpora.Dictionary(doc_array_train)
print('len of dictionary.keys: ', len(dictionary.keys()))

len of dictionary.keys:  14938


In [46]:
%%time
corpus_array_train = [dictionary.doc2bow(text) for text in doc_array_train]
lda = LdaModel(corpus_array_train, num_topics=num_topics, alpha='asymmetric', eval_every=3)

CPU times: user 2.74 s, sys: 13.4 ms, total: 2.75 s
Wall time: 2.75 s


In [47]:
# Get ptd for the training corpus (only a training set exists in this run).
# NOTE(review): ptd is presumably the per-document topic distribution with one
# column per topic -- confirm against obtainLDA.get_ptd_from_lda.
ptd_train = get_ptd_from_lda(lda, corpus_array_train)

In [48]:
ptd_liwc_sia_train = np.concatenate((ptd_train, liwc_sia_array_train), axis=1)

In [49]:
# Seeded random forest trained on the combined topic + LIWC/SIA features;
# the label matrix has one binary column per influence cue.
rfc = RandomForestClassifier(
    n_estimators=n_estimators,
    random_state=2,
)
rfc.fit(X=ptd_liwc_sia_train, y=bi_weapon_array_train)

RandomForestClassifier(n_estimators=200, random_state=2)

### get feature importance

In [50]:
ptd_liwc_sia_train.shape

(2771, 19)

In [51]:
rfc.feature_importances_.shape

(19,)

In [52]:
# Column names of the classifier input, in the same order as the feature
# matrix: zero-padded topic names (T00, T01, ...), then LIWC, then SIA columns.
topic_feature_names = ['T{:02d}'.format(topic_id) for topic_id in range(num_topics)]
input_feature_name_list = [*topic_feature_names, *liwc_list, *sia_list]
input_feature_name_list

['T00',
 'T01',
 'T02',
 'T03',
 'T04',
 'T05',
 'T06',
 'T07',
 'T08',
 'T09',
 'anx_liwc',
 'anger_liwc',
 'sad_liwc',
 'reward_liwc',
 'risk_liwc',
 'time_liwc',
 'money_liwc',
 'pos_sia',
 'neg_sia']

In [53]:
input_feature_importance_list = list(rfc.feature_importances_)
input_feature_importance_list

[0.06148744417306282,
 0.07321419620153667,
 0.062055710565359336,
 0.05416401476236872,
 0.05916832639556227,
 0.0585751691975027,
 0.05592424739567315,
 0.05970556648233258,
 0.06511103825749094,
 0.05777441341494983,
 0.012421629596441629,
 0.028445633007357037,
 0.017950349275008923,
 0.04684855769630377,
 0.03710621085345177,
 0.06127891440356929,
 0.04716704583499552,
 0.07115103307862254,
 0.07045049940841043]

In [54]:
# Column-oriented mapping (feature names alongside their importances)
input_feature_importance_dict = {
    'feature_name': input_feature_name_list,
    'feature_importance': input_feature_importance_list,
}


In [55]:
input_feature_importance_df = pd.DataFrame(input_feature_importance_dict)

In [56]:
input_feature_importance_df.shape

(19, 2)

In [57]:
# Rank the features by importance (largest first) and keep the ten best rows
ranked_feature_df = input_feature_importance_df.sort_values(
    by='feature_importance', ascending=False
)
top_feature_df = ranked_feature_df.head(10).copy()
top_feature_df

Unnamed: 0,feature_name,feature_importance
1,T01,0.073214
17,pos_sia,0.071151
18,neg_sia,0.07045
8,T08,0.065111
2,T02,0.062056
0,T00,0.061487
15,time_liwc,0.061279
7,T07,0.059706
4,T04,0.059168
5,T05,0.058575


In [58]:
top_feature_df.feature_name.values

array(['T01', 'pos_sia', 'neg_sia', 'T08', 'T02', 'T00', 'time_liwc',
       'T07', 'T04', 'T05'], dtype=object)

In [59]:
# Topic ids among the top-ranked features, in ranking order.
# Topic features are recognised by their name pattern ('T' followed only by
# digits) rather than by name length: the length test dropped topics once ids
# reached three digits (e.g. 'T100') and would crash int() on any
# three-character non-topic feature name.
important_topic_id_list = [
    int(feature_name[1:])
    for feature_name in top_feature_df['feature_name']
    if feature_name.startswith('T') and feature_name[1:].isdecimal()
]
important_topic_id_list

[1, 8, 2, 0, 7, 4, 5]

### pick some topics

In [60]:
# For each important topic, in ascending topic-id order, print its 20 most
# probable terms together with their probabilities.
selected_topic_ids = [
    topic_id for topic_id in range(num_topics) if topic_id in important_topic_id_list
]
for topic_id in selected_topic_ids:
    print('*' * 40)
    print('Topic %s' % topic_id)
    for term_id, term_prob in lda.get_topic_terms(topic_id, topn=20):
        print(dictionary[term_id], term_prob)

****************************************
Topic 0
said 0.009896377
report 0.008440824
year 0.004845418
trump 0.004435435
account 0.0041735223
compani 0.0040721823
sale 0.003577772
new 0.0035650395
first 0.0034793408
servic 0.0034790488
news 0.0032338854
one 0.003182733
survey 0.0030126034
show 0.0029507533
presid 0.0028628297
million 0.0028590895
state 0.002791546
journal 0.0026937404
store 0.0026886943
part 0.0026560966
****************************************
Topic 1
account 0.009373335
bank 0.0051509165
secur 0.0049540694
time 0.004919784
said 0.0043909126
onlin 0.004282199
report 0.004274663
new 0.0040234304
email 0.0038183632
year 0.0038080055
click 0.0036952791
cruz 0.0036839554
access 0.0036344698
state 0.003568444
date 0.0031724784
follow 0.0031459776
say 0.003136216
facebook 0.003045704
one 0.0029995223
trump 0.002931519
****************************************
Topic 2
black 0.0075292503
said 0.0073816837
new 0.0070563396
peopl 0.0069066333
one 0.006526005
trump 0.00617128
poli

In [61]:
# Print the token list of every document that still contains the token 'font'
font_docs = (doc_tokens for doc_tokens in doc_array if 'font' in doc_tokens)
for doc_tokens in font_docs:
    print(doc_tokens)

['date', 'octob', 'repli', 'norepli', 'wellsfargo', 'com', 'import', 'messag', 'well', 'fargo', 'bank', 'well', 'fargo', 'constantli', 'protect', 'account', 'old', 'font', 'famili', 'verdana', 'color', 'thank', 'patienc', 'work', 'togeth', 'protect', 'account', 'sincer', 'well', 'fargo', 'bank', 'import', 'new', 'fdic', 'insur', 'rule', 'effect', 'decemb', 'decemb', 'well', 'fargo', 'right', 'reserv', 'nmlsr', 'equal', 'hous', 'lender']
['add', 'text', 'photoshop', 'adjust', 'font', 'color', 'add', 'text', 'photoshop', 'step', 'along', 'adjust', 'color', 'font', 'text', 'may', 'use', 'add', 'text', 'imag', 'photoshop', 'creat', 'poster', 'want', 'label', 'photo', 'visit', 'homepag', 'stori', 'check', 'product', 'mention', 'articl', 'adob', 'photoshop', 'best', 'buy', 'macbook', 'pro', 'best', 'buy', 'add', 'text', 'photoshop', 'relat', 'coverag', 'fromhow', 'everyth', 'tech', 'undo', 'photoshop', 'depend', 'version', 'photoshop', 'havehow', 'rotat', 'imag', 'photoshop', 'simpl', 'steps

## test

In [62]:
1

1