# Jupyter Playground

*Jupyter notebook for testing purposes.*

In [1]:
import os
import sys

sys.path.insert(0, '../scripts')
from tokenizer import Tokenizer
from lemmatizer import Lemmatizer
from topicizer import Topicizer
from factorizer import Factorizer

## 1. Tokenizer

In [2]:
# Demo input for the Tokenizer; note the literal keeps the surrounding double
# quotes as part of the sentence text.
s = '"Siemens is building a factory in India for approximately 7.3 Million Euros."'
t = Tokenizer(s)
# Prints the noun chunks found in the sentence, one per line (see cell output).
# NOTE(review): underscore-prefixed helper, i.e. private by Python convention.
t._print_noun_chunks()

Siemens
a factory
India
approximately 7.3 Million Euros


## 2. Lemmatizer

In [3]:
# Demo sentence for the Lemmatizer.
# NOTE(review): 'algorothms' is misspelled in the input; the recorded output
# shows it coming back unchanged as its own lemma.
seq = 'I am a student studying various Natural Language Processing algorothms to obtain a deeper understanding.'
l = Lemmatizer()
# Prints one row per token. Judging by the recorded output, the columns appear
# to be: token text, part-of-speech tag, a numeric ID, and the lemma.
l._print_lemmas(seq)

I            PRON   561228191312463089   -PRON-
am           AUX    10382539506755952630 be
a            DET    11901859001352538922 a
student      NOUN   8980609300697527695  student
studying     VERB   4251533498015236010  study
various      ADJ    15505458595570984623 various
Natural      PROPN  16016219431787577165 Natural
Language     PROPN  2953755589855585080  Language
Processing   PROPN  5477194979658911992  Processing
algorothms   NOUN   3969165751382591598  algorothms
to           PART   3791531372978436496  to
obtain       VERB   8991116519625795538  obtain
a            DET    11901859001352538922 a
deeper       ADJ    12691978708603459222 deep
understanding NOUN   1392792401359955612  understanding
.            PUNCT  12646065887601541794 .


## 3. Topic Modeling
- LDA
- Matrix Factorization

In [4]:
# LDA
# Locate data/articles.csv next to the notebooks directory. The path is built
# from the parent of the working directory rather than by string-replacing
# '/notebooks', which would rewrite every occurrence of that substring in the
# path and silently pass the working directory itself when no match is found.
# Like the '../scripts' entry added to sys.path, this assumes the kernel's
# working directory is the notebooks folder.
articles_csv = os.path.join(os.path.dirname(os.getcwd()), 'data', 'articles.csv')
t = Topicizer(articles_csv, classes=10)
# Per the recorded output, get_topk prints the top words of each topic; its
# return value is kept for the next cell, which shows articles with a topic id.
lda_cluster_data = t.get_topk(top_k=15)

Topic #0
['did', 'election', 'told', 'russia', 'donald', 'new', 'news', 'obama', 'white', 'house', 'campaign', 'clinton', 'president', 'said', 'trump']


Topic #1
['oil', 'million', 'according', 'deal', 'iran', 'countries', 'year', 'new', 'business', 'world', 'trade', 'said', 'companies', 'says', 'company']


Topic #2
['don', 'years', 'life', 'know', 'man', 'day', 'time', 'black', 'city', 'like', 'just', 'said', 'police', 'people', 'says']


Topic #3
['just', 'year', 'university', 'years', 'research', 'disease', 'food', 'like', 'care', 'percent', 'study', 'new', 'people', 'health', 'says']


Topic #4
['lot', 'things', 'going', 'way', 'time', 'don', 'know', 'really', 'students', 'think', 'just', 'school', 'like', 'people', 'says']


Topic #5
['republican', 'voters', 'vote', 'clinton', 'new', 'federal', 'president', 'states', 'people', 'court', 'law', 'percent', 'trump', 'state', 'said']


Topic #6
['won', 'way', 'games', 'life', 'earth', 'game', 'space', 'like', 'just', 'years', 'says',

In [5]:
# Articles with assigned topic: plain-text dump of the value returned by
# get_topk in the previous cell (columns 'Article' and 'Topic' in the output).
print(lda_cluster_data)

Article  Topic
0      In the Washington of 2016, even when the polic...      0
1        Donald Trump has used Twitter  —   his prefe...      0
2        Donald Trump is unabashedly praising Russian...      0
3      Updated at 2:50 p. m. ET, Russian President Vl...      0
4      From photography, illustration and video, to d...      5
...                                                  ...    ...
11987  The number of law enforcement officers shot an...      2
11988    Trump is busy these days with victory tours,...      0
11989  It’s always interesting for the Goats and Soda...      3
11990  The election of Donald Trump was a surprise to...      5
11991  Voters in the English city of Sunderland did s...      1

[11992 rows x 2 columns]


### Non-negative Matrix Factorization
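Non-negative matrix factorization (NMF) obtains topics by approximating the matrix $A$ with the product of two non-negative matrices $W$ and $H$, chosen to minimize the squared *Frobenius norm* of the reconstruction error: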
$\frac{1}{2} \| A - WH \|^2_F = \frac{1}{2} \sum_{i} \sum_{j} \left( A_{ij} - (WH)_{ij} \right)^2$

In [6]:
# Matrix factorization
# Locate data/articles.csv next to the notebooks directory. The path is built
# from the parent of the working directory rather than by string-replacing
# '/notebooks', which would rewrite every occurrence of that substring in the
# path and silently pass the working directory itself when no match is found.
# Like the '../scripts' entry added to sys.path, this assumes the kernel's
# working directory is the notebooks folder.
articles_csv = os.path.join(os.path.dirname(os.getcwd()), 'data', 'articles.csv')
f = Factorizer(articles_csv, classes=10)
# Per the recorded output, get_topk prints the top words of each topic; its
# return value is kept for the next cell, which shows articles with a topic id.
# NOTE(review): the variable says "svd" while the section heading says NMF —
# confirm which decomposition Factorizer actually performs.
svd_cluster_data = f.get_topk(top_k=15)

Topic #0
['year', 'university', 'workers', '000', 'years', 'just', 'company', 'study', 'new', 'percent', 'like', 'water', 'food', 'people', 'says']


Topic #1
['administration', 'cruz', 'election', 'pence', 'gop', 'presidential', 'obama', 'house', 'white', 'republican', 'donald', 'campaign', 'said', 'president', 'trump']


Topic #2
['patients', 'repeal', 'law', 'act', 'republicans', 'tax', 'people', 'plan', 'affordable', 'obamacare', 'coverage', 'medicaid', 'insurance', 'care', 'health']


Topic #3
['assad', 'iran', 'iraq', 'north', 'china', 'aleppo', 'war', 'korea', 'said', 'forces', 'russia', 'military', 'syrian', 'syria', 'isis']


Topic #4
['cruz', 'election', 'primary', 'democrats', 'percent', 'party', 'vote', 'state', 'delegates', 'democratic', 'hillary', 'campaign', 'voters', 'sanders', 'clinton']


Topic #5
['book', 'love', 'women', 'way', 'time', 'life', 'album', 'song', 'people', 'really', 'know', 'think', 'just', 'like', 'music']


Topic #6
['program', 'child', 'teacher', 'h

In [7]:
# Articles with the topic assigned by the Factorizer run above: plain-text dump
# of the value returned by get_topk (columns 'Article' and 'Topic' in the output).
print(svd_cluster_data)

Article  Topic
0      In the Washington of 2016, even when the polic...      1
1        Donald Trump has used Twitter  —   his prefe...      1
2        Donald Trump is unabashedly praising Russian...      1
3      Updated at 2:50 p. m. ET, Russian President Vl...      9
4      From photography, illustration and video, to d...      6
...                                                  ...    ...
11987  The number of law enforcement officers shot an...      8
11988    Trump is busy these days with victory tours,...      1
11989  It’s always interesting for the Goats and Soda...      7
11990  The election of Donald Trump was a surprise to...      4
11991  Voters in the English city of Sunderland did s...      0

[11992 rows x 2 columns]
