In [1]:
import nltk
from nltk.corpus import stopwords
import gensim as g
import csv
import sys
import string
import os
import numpy as np
import pandas as pd
from collections import Counter, defaultdict

In [8]:
# Prepare stopwords
custom_puncstops = ["''", "``", "--"]
puncstops = [c for c in string.punctuation]
puncstops.extend(custom_puncstops)

#custom_stops = ["'s", 'q', 'israel', 'israeli', 'palestinian', 'palestinians']
custom_stops = ['template', "'s", 'ref', '/ref', 'url=http', 'http', 'name=', 'c', 'harvnb']

stops = stopwords.words('english')[:]
stops.extend(puncstops)
stops.extend(custom_stops)
len(stops)

171

In [35]:
# Import and prepare the full Wikipedia latest edition corpus

csv.field_size_limit(sys.maxsize)

corpus_csvpath = '/home/michael/school/cprose_research/wp/wikipedia/data/ipc_articles_latest.tsv'
wp_docs = []
with open(corpus_csvpath) as corpus_csv:
    reader = csv.reader(corpus_csv, delimiter='\t', quoting=csv.QUOTE_NONE)
    #headings = next(reader)
    for line in reader:
        text = line[2]
        
        # Tokenize
        doctoks = nltk.word_tokenize(text)
        
        # Make lowercase
        lowtoks = [tok.lower() for tok in doctoks]
        
        # Remove punctuation and stopwords
        cleantoks = [tok for tok in lowtoks if not tok in stops]
        
        wp_docs.append(cleantoks)
    
# Create Gensim dictionary (words to IDs)
wp_dict = g.corpora.Dictionary(wp_docs)
len(wp_docs)

2067

In [6]:
# Create vectors for each issue
wp_docs_vecs = [wp_dict.doc2bow(wp_doc) for wp_doc in wp_docs]
len(wp_docs_vecs)

2067

In [8]:
# k = 5, 100 passes
# Create LDA model (batch)

k = 5
lda = g.models.ldamodel.LdaModel(wp_docs_vecs, id2word=wp_dict, num_topics=k, passes=100, update_every=0)
lda.save("lda{:d}.model".format(k))
print("\n\n".join(['{:d}: {:s}'.format(i, wds) for (i, wds) in lda.print_topics(k, num_words=20)]))

0: 0.007*jewish + 0.005*temple + 0.004*hebrew + 0.003*judaism + 0.003*also + 0.003*rabbi + 0.003*one + 0.003*first + 0.003*new + 0.003*god + 0.003*jews + 0.003*jerusalem + 0.002*israel + 0.002*torah + 0.002*religious + 0.002*mount + 0.002*day + 0.002*many + 0.002*orthodox + 0.002*talmud

1: 0.010*palestine + 0.008*village + 0.006*1948 + 0.006*p. + 0.005*cite_book + 0.004*palestinian + 0.004*arab + 0.003*city + 0.003*land + 0.003*villages + 0.003*jerusalem + 0.003*also + 0.003*population + 0.003*area + 0.003*war + 0.003*mosque + 0.003*century + 0.002*site + 0.002*depopulated + 0.002*press

2: 0.009*israel + 0.007*palestinian + 0.007*palestine + 0.007*arab + 0.007*jewish + 0.005*state + 0.004*jews + 0.004*jerusalem + 0.004*united + 0.003*israeli + 0.003*war + 0.003*international + 0.003*east + 0.003*cite_web + 0.003*states + 0.003*would + 0.003*nations + 0.003*british + 0.002*also + 0.002*university

3: 0.010*israel + 0.010*israeli + 0.005*cite_news + 0.005*palestinian + 0.005*gaza + 0.0

In [10]:
# k = 7, 100 passes
# Create LDA model (batch)

k = 7
lda = g.models.ldamodel.LdaModel(wp_docs_vecs, id2word=wp_dictionary, num_topics=k, passes=100, update_every=0)
lda.save("lda{:d}.model".format(k))
print("\n\n".join(['{:d}: {:s}'.format(i, wds) for (i, wds) in lda.print_topics(k, num_words=20)]))

0: 0.010*israeli + 0.009*israel + 0.008*war + 0.005*forces + 0.004*killed + 0.004*arab + 0.004*military + 0.004*operation + 0.004*egyptian + 0.004*attack + 0.003*p. + 0.003*army + 0.003*idf + 0.003*force + 0.003*air + 0.003*egypt + 0.003*1948 + 0.003*two + 0.003*jordan + 0.003*october

1: 0.011*israel + 0.008*israeli + 0.007*palestinian + 0.007*cite_news + 0.006*gaza + 0.005*2011 + 0.005*2010 + 0.005*cite_web + 0.005*2012 + 0.004*june + 0.004*hamas + 0.004*2009 + 0.004*2008 + 0.003*international + 0.003*march + 0.003*january + 0.003*2006 + 0.003*rights + 0.003*said + 0.003*news

2: 0.006*war + 0.004*p. + 0.004*soviet + 0.004*german + 0.003*isbn + 0.003*germany + 0.003*university + 0.003*british + 0.002*new + 0.002*world + 0.002*states + 0.002*– + 0.002*also + 0.002*press + 0.002*united + 0.002*would + 0.002*pp + 0.002*first + 0.002*history + 0.002*one

3: 0.008*lebanon + 0.005*sfn + 0.005*lebanese + 0.005*syria + 0.004*syrian + 0.004*war + 0.004*plo + 0.003*military + 0.003*party + 0.0

In [8]:
# k = 10, 100 passes
# Create LDA model (batch)

k = 10
lda10 = g.models.ldamodel.LdaModel(wp_docs_vecs, id2word=wp_dictionary, num_topics=k, passes=100, update_every=0)
lda10.save("lda10.model")
print("\n\n".join(['{:d}: {:s}'.format(i, wds) for (i, wds) in lda10.print_topics(k, num_words=20)]))

0.019*arab + 0.014*palestine + 0.010*british + 0.008*jewish + 0.006*palestinian + 0.006*arabs + 0.005*water + 0.005*jordan + 0.005*isbn + 0.004*war + 0.004*jews + 0.004*league + 0.004*mandate + 0.003*first + 0.003*year + 0.003*syria + 0.003*zionist + 0.003*1948 + 0.003*cite_book + 0.003*title

0.011*2008 + 0.010*december + 0.006*bush + 0.003*https + 0.003*balfour + 0.003*pakistan + 0.003*ref=harv + 0.003*byzantine + 0.002*cite_news + 0.002*greek + 0.002*police + 0.002*archivedate= + 0.002*empire + 0.002*deadurl= + 0.002*archiveurl= + 0.002*george + 0.002*eastern + 0.002*soviet + 0.002*bangladesh + 0.002*language=greek

0.005*jewish + 0.005*jews + 0.004*german + 0.003*new + 0.003*also + 0.003*war + 0.003*germany + 0.003*one + 0.002*flag + 0.002*poland + 0.002*soviet + 0.002*god + 0.002*many + 0.002*first + 0.002*world + 0.002*day + 0.002*ethnic + 0.002*cite_book + 0.002*– + 0.002*adenauer

0.017*palestine + 0.014*village + 0.013*1948 + 0.010*p. + 0.007*arab + 0.006*palestinian + 0.006*v

In [6]:
# k = 15, 100 passes
# Create LDA model (batch)
lda15 = g.models.ldamodel.LdaModel(docs_vecs, id2word=wp_dictionary, num_topics=15, passes=100, update_every=0)
lda15.save("lda15.model")
print("\n\n".join(["{:d}: {:s}".format(i, wds) for (i, wds) in lda15.print_topics(15, num_words=20)]))

0.019*minister + 0.014*israel + 0.014*party + 0.014*israeli + 0.009*prime + 0.008*knesset + 0.006*election + 0.006*government + 0.005*likud + 0.004*death + 0.004*date + 0.004*birth + 0.004*labor + 0.004*legislative + 0.004*elections + 0.004*served + 0.004*place + 0.004*yitzhak + 0.004*netanyahu + 0.003*tel

0.011*cite_web + 0.011*flag + 0.008*2011 + 0.008*united + 0.007*nations + 0.007*palestine + 0.007*state + 0.006*states + 0.006*arab + 0.005*cite_news + 0.005*relations + 0.005*2012 + 0.005*foreign + 0.005*un + 0.005*2010 + 0.004*international + 0.004*september + 0.004*republic + 0.004*rights + 0.004*flagicon

0.008*air + 0.006*aircraft + 0.005*war + 0.005*used + 0.004*mm + 0.004*force + 0.004*service + 0.004*soviet + 0.004*vehicle + 0.003*also + 0.003*tank + 0.003*two + 0.003*mirage + 0.003*gun + 0.003*army + 0.003*p. + 0.003*– + 0.003*one + 0.003*combat + 0.003*first

0.009*jewish + 0.008*jews + 0.004*sfn + 0.003*also + 0.002*time + 0.002*many + 0.002*cite_book + 0.002*one + 0.002*

In [2]:
# Load LDA model 15
lda15 = g.models.ldamodel.LdaModel.load("lda15.model", mmap='r')
print("\n\n".join(["{:d}: {:s}".format(i, wds) for (i, wds) in lda15.print_topics(15, num_words=20)]))

0: 0.015*lebanon + 0.013*water + 0.009*lebanese + 0.005*obama + 0.005*river + 0.005*jordan + 0.004*syria + 0.004*democracy + 0.004*beirut + 0.004*syrian + 0.004*plo + 0.004*haifa + 0.003*2008 + 0.003*cite_web + 0.003*damascus + 0.003*city + 0.003*2009 + 0.002*1982 + 0.002*south + 0.002*nidal

1: 0.008*jerusalem + 0.008*jewish + 0.006*city + 0.006*temple + 0.004*jews + 0.004*hebrew + 0.004*century + 0.004*also + 0.004*first + 0.003*history + 0.003*new + 0.003*holy + 0.003*mount + 0.003*one + 0.003*israel + 0.003*god + 0.003*christian + 0.003*ancient + 0.003*church + 0.003*roman

2: 0.010*israel + 0.010*bank + 0.010*west + 0.009*israeli + 0.008*cite_web + 0.006*jews + 0.006*barrier + 0.006*palestinian + 0.005*jewish + 0.005*settlements + 0.005*population + 0.004*2012 + 0.004*apartheid + 0.004*settlement + 0.003*land + 0.003*gaza + 0.003*jerusalem + 0.003*palestinians + 0.003*international + 0.003*2009

3: 0.013*israel + 0.011*palestinian + 0.007*arab + 0.007*palestine + 0.006*jewish + 0.

In [3]:
lda15

<gensim.models.ldamodel.LdaModel at 0x7f663529a208>

In [3]:
# Select issue corpus from Bitterlemons

# Extract topics
metadir = '/home/michael/school/cprose_research/bitterlemons_corpus_1.0/meta/'
topics = []
topic_fnames = defaultdict(list)

for f in os.listdir(metadir):
    if not any(s in f for s in ['orig', '~']):
        fpath = os.path.join(metadir, f)
        with open(fpath, 'r') as tsv:
            reader = csv.reader(tsv, delimiter='\t')
            first_line = next(reader)
            topic = first_line[4]
            topics.append(topic)
            topic_fnames[topic].append(f)

len(topics)

594

In [4]:
# Map topics to issues
issuespath = '/home/michael/school/cprose_research/bitterlemons_corpus_1.0/bl_issues.csv'
issues = defaultdict(list) # { issue: [topics]}
issues_fnames = defaultdict(list)

with open(issuespath, 'r') as f:
    reader = csv.reader(f)
    next(reader)
    for row in reader:
        topic = row[0]
        issue = row[1]
        issues[issue].append(topic)
        issues_fnames[issue] += topic_fnames[topic]

print(issues.keys())
print(len(issues_fnames.values()))

dict_keys(['one-state and two-state solutions', 'refugees and the Right of Return', 'peace process', 'separation barrier', 'Israeli Arabs', 'occupation', 'Israeli settlements', 'Palestinian statehood', 'direct conflict', 'internal politics', "citizens' life", 'Gaza', 'international role', 'Jerusalem', 'US role'])
15


In [5]:
# Get texts by issue
docspath = '/home/michael/school/cprose_research/bitterlemons_corpus_1.0/docs/'
issues_texts = defaultdict(list) # {issue: [(fname, text)]}

for issue in list(issues_fnames.keys()):
    for fname in issues_fnames[issue]:
        docpath = os.path.join(docspath, fname)
        with open(docpath, 'r') as f:
            doc = f.read().replace('\n', ' ')
        issues_texts[issue].append((fname, doc))
    
len(issues_texts.keys())

15

In [6]:
# Aggregate texts per issue
issues_textlist = []
issues_titles = []
for issue in issues_texts.keys():
    fulltext = ""
    for fname, text in issues_texts[issue]:
        fulltext += text + ' '
    issues_textlist.append(fulltext)
    issues_titles.append(issue)

print(len(issues_textlist))

15


In [9]:
# Tokenize issues
issues_toks = []

for issuetext in issues_textlist:
    doctoks = nltk.word_tokenize(issuetext)

    # Make lowercase
    lowtoks = [tok.lower() for tok in doctoks]

    # Remove punctuation and stopwords
    cleantoks = [tok for tok in lowtoks if not tok in stops]
    
    issues_toks.append(cleantoks)
    
len(issues_toks)

15

In [39]:
issues_toks.append(['settlement','apartheid'])
len(issues_toks)

16

In [40]:
# Create Gensim dictionary (words to IDs)
bl_dictionary = g.corpora.Dictionary(issues_toks)

# Create vectors for each issue
issues_vecs = [bl_dictionary.doc2bow(issue) for issue in issues_toks]
len(issues_vecs)

16

In [41]:
lda15_issues = lda15[issues_vecs]
type(lda15_issues)

gensim.interfaces.TransformedCorpus

In [12]:
lda15_issues.corpus

[[(0, 1),
  (1, 1),
  (2, 7),
  (3, 1),
  (4, 1),
  (5, 2),
  (6, 14),
  (7, 1),
  (8, 17),
  (9, 1),
  (10, 9),
  (11, 4),
  (12, 2),
  (13, 2),
  (14, 2),
  (15, 1),
  (16, 3),
  (17, 1),
  (18, 10),
  (19, 1),
  (20, 5),
  (21, 1),
  (22, 2),
  (23, 2),
  (24, 8),
  (25, 1),
  (26, 3),
  (27, 2),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 3),
  (33, 3),
  (34, 26),
  (35, 5),
  (36, 1),
  (37, 2),
  (38, 2),
  (39, 136),
  (40, 3),
  (41, 1),
  (42, 1),
  (43, 3),
  (44, 4),
  (45, 4),
  (46, 1),
  (47, 2),
  (48, 1),
  (49, 1),
  (50, 2),
  (51, 3),
  (52, 3),
  (53, 6),
  (54, 1),
  (55, 1),
  (56, 2),
  (57, 1),
  (58, 1),
  (59, 8),
  (60, 1),
  (61, 2),
  (62, 2),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 2),
  (67, 2),
  (68, 1),
  (69, 2),
  (70, 3),
  (71, 2),
  (72, 1),
  (73, 1),
  (74, 3),
  (75, 1),
  (76, 3),
  (77, 2),
  (78, 2),
  (79, 4),
  (80, 2),
  (81, 2),
  (82, 5),
  (83, 1),
  (84, 1),
  (85, 1),
  (86, 1),
  (87, 1),
  (88, 5),
  (89, 1),
  (90, 1),
  (

In [42]:
#testvec = bl_dictionary.doc2bow([['settlement']])
lda15_test = lda15.inference(issues_vecs, collect_sstats=True)
lda15_test[1]

array([[  5.98952016e-17,   1.30236212e-18,   1.12715779e-14, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  2.78275302e-09,   1.18839315e-02,   4.75037105e-07, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  1.77814930e-09,   3.51285436e-11,   2.86591771e-07, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       ..., 
       [  6.36944413e-08,   1.20514307e-01,   6.30373013e-06, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  1.35273859e-07,   1.47944265e+00,   1.19999788e+02, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  1.14905644e-06,   1.56095188e-08,   1.25955880e-04, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00]])

In [43]:
lda15_test[1].shape # The e-step sufficient statistics

(15, 283053)

In [27]:
sstats = lda15_test[1]

# Word probabilites for topic 1, presumably
sstats[0].shape

(283053,)

In [34]:
len(bl_dictionary.keys())

15481

In [36]:
len(wp_dict.keys())

283052

In [37]:
# Indices of maximum values--don't seem to correspond with topics
for i in range(15):
    print(i, wp_dict[np.argmax(sstats[i])])

0 breakfast
1 pages=116
2 gulf
3 hp
4 hatzerim
5 furthermore
6 sole
7 saud
8 china
9 al-bashir
10 hatzerim
11 brazil
12 kfir
13 iiicj
14 public


In [38]:
sstats.ndim

2

In [14]:
# Infer topic distros on Bitterlemons issue texts
#test_issue = issues_textlist[0]
lda15_issues = lda15[issues_vecs]

for i, issue_distro in enumerate(lda15_issues):
    print("Issue", i, issues_titles[i])
    print('\n'.join(["{:d}: {:f}".format(n, distro) for (n, distro) in reversed(sorted(issue_distro, key=lambda x: x[1]))]))
    print('\n')

Issue 0 separation barrier
14: 0.352426
3: 0.172199
11: 0.149791
10: 0.098425
4: 0.095927
12: 0.045832
9: 0.031542
13: 0.027314
8: 0.013963


Issue 1 peace process
14: 0.244802
8: 0.185228
3: 0.178315
10: 0.098794
11: 0.093790
4: 0.079477
9: 0.042592
12: 0.025356
13: 0.020514
1: 0.019445


Issue 2 Israeli Arabs
14: 0.268984
3: 0.161434
11: 0.130503
8: 0.130265
4: 0.106140
10: 0.059754
9: 0.038321
13: 0.032359
1: 0.027866
6: 0.022783
12: 0.019397


Issue 3 one-state and two-state solutions
14: 0.281875
3: 0.219360
11: 0.109051
8: 0.107575
4: 0.085098
10: 0.079848
9: 0.032198
13: 0.024184
6: 0.024174
12: 0.022090
1: 0.012803


Issue 4 US role
14: 0.257196
8: 0.166397
3: 0.157499
11: 0.097127
4: 0.090649
10: 0.087189
9: 0.051172
12: 0.033832
13: 0.025352
6: 0.019357


Issue 5 international role
14: 0.252773
3: 0.165445
8: 0.145954
4: 0.103146
10: 0.091966
11: 0.082816
9: 0.078346
13: 0.029574
12: 0.026144
6: 0.011246
1: 0.010244


Issue 6 Gaza
14: 0.266981
3: 0.145705
8: 0.133976
4: 0.102