# Text 3: Latent Dirichlet allocation
**Internet Analytics - Lab 4**

---

**Group:** *O*

**Names:**

* *Argelaguet Franquelo, Pau*
* *du Bois de Dunilac, Vivien*

---

#### Instructions

*This is a template for part 3 of the lab. Clearly write your answers, comments and interpretations in Markdown cells. Don't forget that you can add $\LaTeX$ equations in these cells. Feel free to add or remove any cell.*

*Please properly comment your code. Code readability will be considered for grading. To avoid long cells of codes in the notebook, you can also embed long python functions and classes in a separate module. Don’t forget to hand in your module if that is the case. In multiple exercises, you are required to come up with your own method to solve various problems. Be creative and clearly motivate and explain your methods. Creativity and clarity will be considered for grading.*

In [19]:
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors

import json
import pickle
import numpy as np
import string
import collections
import operator
import math

from functools import reduce
from scipy.sparse import csr_matrix
from utils import load_json, load_pkl, save_pkl

In [3]:
# Load the course data: the JSON course file, the preprocessed courses (each
# entry carries a 'name' and a bag-of-words 'description') and the vocabulary
# whose order defines the term-vector indices.
courses = load_json('data/courses.txt')
preprocessed = load_pkl('data/preprocess.pckl')
terms = load_pkl('data/terms.pckl')

In [72]:
# Load additional pickled artifacts.
# NOTE(review): no active code in this notebook reads `mat` or `doc`, and
# `stopwords` only appears in commented-out code — confirm they are needed.
# NOTE(review): the stopwords file uses a '.pkl' extension while the other
# pickles use '.pckl' — verify the file name.
mat = load_pkl('data/mat.pckl')
doc = load_pkl('data/documents.pckl')
stopwords = load_pkl('data/stopwords.pkl')

## Exercise 4.8: Topics extraction

In [5]:
# Peek at the bag-of-words description of the first preprocessed course.
first_course = list(preprocessed.values())[0]
print(list(first_course['description'].items()))

[('adapt', 2), ('adapt composit', 2), ('applic', 2), ('biocomposit', 2), ('composit', 12), ('composit applic', 2), ('cost', 2), ('develop', 2), ('nanocomposit', 2), ('perform', 2), ('polym', 2), ('product', 3), ('team', 2)]


In [6]:
def createTermVector(bagOfWord):
    """Build a dense count vector for one bag of words.

    The vector has one entry per element of the global ``terms`` sequence, in
    the same order; a term missing from ``bagOfWord`` gets a count of 0.
    """
    return Vectors.dense([bagOfWord.get(term, 0) for term in terms])

In [7]:
# Pair every course's description vector with a 1-based document id, then
# hand the resulting corpus to Spark.
vectorList = [
    [docId, createTermVector(course['description'])]
    for docId, course in enumerate(preprocessed.values(), start=1)
]
rdd = sc.parallelize(vectorList)

In [8]:
def pTop(ldaModel, nWords=10):
    """Print, for every topic of ``ldaModel``, its ``nWords`` top terms.

    Args:
        ldaModel: trained LDA model exposing ``describeTopics``.
        nWords: number of terms printed per topic.

    Term indices returned by the model are translated through the global
    ``terms`` vocabulary. One list is printed per topic.
    """
    # Ask the model for only the terms that are printed instead of a ranking
    # of the whole vocabulary for every topic; this also avoids an IndexError
    # when a topic exposes fewer than nWords terms.
    for topic in ldaModel.describeTopics(maxTermsPerTopic=nWords):
        print([terms[index] for index in topic[0]])

### Varying ALPHA

In [93]:
# Alpha sweep: docConcentration (alpha) = 1.01, topicConcentration (beta) fixed at 1.01.
ldaModel = LDA.train(rdd, k=10, topicConcentration=1.01, docConcentration=1.01)
pTop(ldaModel)

['heat', 'communic', 'chemic', 'electron', 'applic', 'reaction', 'algorithm', 'linear', 'comput', 'flow']
['energi', 'report', 'skill', 'evalu', 'data', 'plan', 'convers', 'comput', 'week', 'technolog']
['physic', 'equat', 'problem', 'mass', 'properti', 'structur', 'group', 'applic', 'solv', 'mechan']
['energi', 'paper', 'engin', 'discuss', 'industri', 'busi', 'cell', 'assess', 'team', 'technolog']
['problem', 'program', 'comput', 'optim', 'numer', 'research', 'linear', 'algorithm', 'plan', 'signal']
['data', 'network', 'robot', 'control', 'algorithm', 'magnet', 'water', 'assess', 'exam', 'architectur']
['electron', 'stochast', 'financi', 'control', 'architectur', 'applic', 'function', 'comput', 'linear', 'theori']
['simul', 'studi', 'case', 'risk', 'data', 'manag', 'assess', 'case studi', 'mechan', 'activ']
['biolog', 'chemic', 'organ', 'structur', 'chemistri', 'develop', 'reaction', 'scientif', 'evalu', 'molecular']
['optic', 'microscopi', 'electron', 'mechan', 'quantum', 'spectrosco

In [85]:
# Alpha sweep: docConcentration (alpha) = 3.0, topicConcentration (beta) fixed at 1.01.
ldaModel = LDA.train(rdd, k=10, topicConcentration=1.01, docConcentration=3.0)
pTop(ldaModel)

['report', 'flow', 'linear', 'problem', 'skill', 'numer', 'data', 'scientif', 'optim', 'experiment']
['electron', 'quantum', 'theori', 'cell', 'microscopi', 'applic', 'magnet', 'structur', 'introduct', 'risk']
['circuit', 'architectur', 'devic', 'field', 'sensor', 'activ', 'signal', 'robot', 'integr', 'techniqu']
['optic', 'biolog', 'paper', 'laser', 'discuss', 'chemic', 'protein', 'physic', 'chemistri', 'interact']
['case', 'studi', 'manag', 'cell', 'power', 'applic', 'case studi', 'engin', 'group', 'wast']
['energi', 'engin', 'thermodynam', 'numer', 'physic', 'dure', 'convers', 'environment', 'comput', 'technolog']
['control', 'mechan', 'structur', 'properti', 'function', 'fundament', 'fractur', 'measur', 'problem', 'statist']
['organ', 'structur', 'engin', 'machin', 'network', 'properti', 'transport', 'physic', 'program', 'metal']
['data', 'develop', 'week', 'innov', 'research', 'technolog', 'plan', 'assess', 'evalu', 'differ']
['imag', 'stochast', 'theori', 'probabl', 'introduct', 

In [86]:
# Alpha sweep: docConcentration (alpha) = 5.0, topicConcentration (beta) fixed at 1.01.
ldaModel = LDA.train(rdd, k=10, topicConcentration=1.01, docConcentration=5.0)
pTop(ldaModel)

['statist', 'program', 'data', 'linear', 'optim', 'problem', 'algorithm', 'theori', 'comput', 'function']
['control', 'engin', 'circuit', 'robot', 'filter', 'oper', 'week', 'power', 'imag', 'integr']
['paper', 'electron', 'control', 'field', 'data', 'water', 'discuss', 'solid', 'energi', 'devic']
['report', 'skill', 'laser', 'research', 'quantum', 'evalu', 'laboratori', 'data', 'activ', 'organ']
['simul', 'market', 'price', 'flow', 'scientif', 'architectur', 'develop', 'research', 'theori', 'physic']
['optic', 'energi', 'fourier', 'risk', 'solv', 'algebra', 'imag', 'physic', 'problem', 'measur']
['mechan', 'magnet', 'applic', 'structur', 'properti', 'physic', 'organ', 'metal', 'communic', 'electron']
['stochast', 'numer', 'reactor', 'architectur', 'develop', 'visual', 'final', 'imag', 'assess', 'urban']
['biolog', 'cell', 'protein', 'signal', 'structur', 'molecular', 'optic', 'dynam', 'applic', 'note']
['studi', 'case', 'chemic', 'assess', 'group', 'case studi', 'reaction', 'class', 'p

In [87]:
# Alpha sweep: docConcentration (alpha) = 7.5, topicConcentration (beta) fixed at 1.01.
ldaModel = LDA.train(rdd, k=10, topicConcentration=1.01, docConcentration=7.5)
pTop(ldaModel)

['electron', 'sensor', 'techniqu', 'applic', 'product', 'industri', 'communic', 'devic', 'principl', 'technolog']
['linear', 'statist', 'probabl', 'introduct', 'stochast', 'price', 'financi', 'imag', 'measur', 'market']
['paper', 'teach', 'scienc', 'space', 'theori', 'architectur', 'comput', 'research', 'group', 'scale']
['biolog', 'field', 'engin', 'structur', 'problem', 'assess', 'physic', 'research', 'skill', 'energi']
['optic', 'structur', 'mechan', 'quantum', 'laser', 'molecular', 'protein', 'function', 'statist', 'applic']
['energi', 'control', 'optim', 'program', 'data', 'risk', 'network', 'engin', 'communic', 'power']
['case', 'imag', 'studi', 'integr', 'function', 'group', 'activ', 'topic', 'fourier', 'circuit']
['chemic', 'flow', 'reaction', 'equat', 'chemistri', 'magnet', 'heat', 'physic', 'thermodynam', 'transfer']
['report', 'data', 'experiment', 'week', 'skill', 'scientif', 'laboratori', 'robot', 'written', 'stabil']
['develop', 'semest', 'innov', 'architectur', 'numer', 

In [88]:
# Alpha sweep: docConcentration (alpha) = 10.0, topicConcentration (beta) fixed at 1.01.
ldaModel = LDA.train(rdd, k=10, topicConcentration=1.01, docConcentration=10.0)
pTop(ldaModel)

['optic', 'laser', 'reaction', 'studi', 'theori', 'microscopi', 'problem', 'reactor', 'evalu', 'measur']
['data', 'imag', 'program', 'comput', 'statist', 'algorithm', 'signal', 'final', 'practic', 'visual']
['electron', 'communic', 'simul', 'robot', 'energi', 'flow', 'equat', 'introduct', 'circuit', 'price']
['biolog', 'protein', 'comput', 'molecular', 'interact', 'quantum', 'architectur', 'function', 'dynam', 'theori']
['numer', 'engin', 'magnet', 'teach', 'equat', 'mathemat', 'probabl', 'biolog', 'practic', 'properti']
['paper', 'semest', 'discuss', 'product', 'assess', 'environment', 'econom', 'research', 'innov', 'studi']
['report', 'evalu', 'develop', 'research', 'scientif', 'data', 'problem', 'experiment', 'optim', 'skill']
['cell', 'control', 'chemic', 'energi', 'metal', 'thermodynam', 'mechan', 'organ', 'electron', 'convers']
['structur', 'energi', 'mechan', 'manag', 'control', 'group', 'risk', 'inform', 'assess', 'dynam']
['physic', 'field', 'devic', 'differ', 'industri', 'act

### Varying BETA

In [89]:
# Beta sweep: topicConcentration (beta) = 1.01, docConcentration (alpha) fixed at 6.0.
ldaModel = LDA.train(rdd, k=10, topicConcentration=1.01, docConcentration=6.0)
pTop(ldaModel)

['structur', 'mechan', 'comput', 'quantum', 'numer', 'cell', 'dynam', 'simul', 'applic', 'solv']
['control', 'protein', 'biolog', 'imag', 'research', 'network', 'develop', 'activ', 'chemic', 'studi']
['report', 'problem', 'evalu', 'plan', 'data', 'skill', 'optim', 'scientif', 'assess', 'studi']
['flow', 'physic', 'practic', 'teach', 'skill', 'theori', 'test', 'equat', 'product', 'technolog']
['electron', 'devic', 'imag', 'microscopi', 'structur', 'techniqu', 'properti', 'chemistri', 'integr', 'organ']
['risk', 'manag', 'magnet', 'market', 'differ', 'financi', 'case', 'price', 'assess', 'introduct']
['engin', 'paper', 'field', 'biolog', 'discuss', 'reaction', 'communic', 'chemic', 'activ', 'circuit']
['energi', 'mass', 'industri', 'thermodynam', 'heat', 'environment', 'evalu', 'water', 'chemic', 'convers']
['statist', 'signal', 'linear', 'probabl', 'stochast', 'theori', 'data', 'robot', 'time', 'control']
['optic', 'data', 'cell', 'algorithm', 'laser', 'note', 'function', 'program', 'ap

In [90]:
# Beta sweep: topicConcentration (beta) = 3.0, docConcentration (alpha) fixed at 6.0.
ldaModel = LDA.train(rdd, k=10, topicConcentration=3.0, docConcentration=6.0)
pTop(ldaModel)

['cell', 'biolog', 'applic', 'organ', 'chemic', 'physic', 'signal', 'function', 'reaction', 'activ']
['problem', 'numer', 'comput', 'linear', 'theori', 'optim', 'algorithm', 'program', 'statist', 'equat']
['optic', 'microscopi', 'electron', 'applic', 'imag', 'physic', 'principl', 'equat', 'control', 'theori']
['energi', 'heat', 'convers', 'thermodynam', 'architectur', 'magnet', 'transfer', 'physic', 'case', 'technolog']
['network', 'technolog', 'applic', 'theori', 'control', 'activ', 'communic', 'techniqu', 'imag', 'develop']
['data', 'report', 'research', 'evalu', 'scientif', 'skill', 'plan', 'assess', 'experiment', 'problem']
['structur', 'week', 'class', 'engin', 'control', 'assess', 'group', 'activ', 'develop', 'evalu']
['laser', 'quantum', 'physic', 'mechan', 'biolog', 'architectur', 'chemic', 'electron', 'engin', 'principl']
['electron', 'applic', 'theori', 'mechan', 'signal', 'structur', 'techniqu', 'introduct', 'devic', 'properti']
['imag', 'applic', 'physic', 'mechan', 'engin'

In [91]:
# Beta sweep: topicConcentration (beta) = 5.0, docConcentration (alpha) fixed at 6.0.
ldaModel = LDA.train(rdd, k=10, topicConcentration=5.0, docConcentration=6.0)
pTop(ldaModel)

['data', 'problem', 'assess', 'activ', 'control', 'optic', 'evalu', 'comput', 'energi', 'applic']
['electron', 'energi', 'imag', 'applic', 'structur', 'physic', 'biolog', 'control', 'activ', 'mechan']
['problem', 'comput', 'data', 'assess', 'applic', 'evalu', 'activ', 'theori', 'structur', 'optim']
['biolog', 'structur', 'physic', 'mechan', 'applic', 'chemic', 'energi', 'assess', 'cell', 'engin']
['energi', 'applic', 'structur', 'data', 'optic', 'electron', 'theori', 'assess', 'problem', 'physic']
['structur', 'optic', 'problem', 'applic', 'electron', 'data', 'biolog', 'physic', 'theori', 'activ']
['data', 'engin', 'problem', 'structur', 'activ', 'theori', 'assess', 'applic', 'evalu', 'comput']
['electron', 'optic', 'energi', 'applic', 'structur', 'physic', 'mechan', 'control', 'theori', 'properti']
['structur', 'optic', 'problem', 'applic', 'comput', 'theori', 'biolog', 'physic', 'mechan', 'activ']
['report', 'data', 'evalu', 'research', 'assess', 'problem', 'technolog', 'skill', 'act

In [92]:
# Beta sweep: topicConcentration (beta) = 7.0, docConcentration (alpha) fixed at 6.0.
ldaModel = LDA.train(rdd, k=10, topicConcentration=7.0, docConcentration=6.0)
pTop(ldaModel)

['structur', 'electron', 'applic', 'data', 'problem', 'physic', 'mechan', 'energi', 'assess', 'activ']
['energi', 'data', 'problem', 'assess', 'structur', 'applic', 'evalu', 'activ', 'comput', 'theori']
['data', 'energi', 'evalu', 'assess', 'problem', 'activ', 'report', 'applic', 'structur', 'comput']
['optic', 'applic', 'problem', 'energi', 'activ', 'electron', 'assess', 'theori', 'structur', 'comput']
['data', 'optic', 'structur', 'applic', 'problem', 'theori', 'assess', 'activ', 'comput', 'evalu']
['structur', 'applic', 'problem', 'theori', 'assess', 'data', 'activ', 'comput', 'evalu', 'physic']
['problem', 'structur', 'applic', 'comput', 'theori', 'data', 'assess', 'activ', 'physic', 'mechan']
['structur', 'data', 'problem', 'assess', 'applic', 'engin', 'activ', 'evalu', 'theori', 'comput']
['data', 'applic', 'structur', 'problem', 'assess', 'engin', 'energi', 'activ', 'evalu', 'comput']
['assess', 'structur', 'evalu', 'problem', 'applic', 'data', 'activ', 'engin', 'theori', 'resea

### Optimized parameters

In [9]:
# Final model with the retained hyperparameters.
# k = number of topics
# docConcentration = alpha: a higher alpha means more different topics per document
# topicConcentration = beta: a higher beta means more words per topic
# A fixed seed is passed to LDA.train.
ldaModel = LDA.train(rdd, k=20, docConcentration=2.51, topicConcentration=1.61, seed=10)

# Print the top 10 terms of each topic
pTop(ldaModel, 10)

['cell', 'properti', 'mechan', 'applic', 'magnet', 'structur', 'physic', 'function', 'molecular', 'part']
['stabil', 'measur', 'water', 'treatment', 'snow', 'physic', 'wastewat', 'atmospher', 'field', 'control']
['structur', 'manag', 'risk', 'case', 'assess', 'reactor', 'mechan', 'class', 'week', 'case studi']
['chemistri', 'reaction', 'equat', 'organ', 'chemic', 'differenti', 'synthesi', 'engin', 'stochast', 'food']
['data', 'secur', 'network', 'statist', 'seri', 'time', 'hour', 'time seri', 'practic', 'regress']
['report', 'control', 'data', 'skill', 'plan', 'evalu', 'optim', 'scientif', 'experiment', 'research']
['linear', 'signal', 'statist', 'communic', 'probabl', 'code', 'theori', 'signal process', 'filter', 'algorithm']
['busi', 'innov', 'group', 'problem', 'class', 'inform', 'market', 'set', 'theori', 'rate']
['optic', 'numer', 'principl', 'laser', 'physic', 'applic', 'sensor', 'comput', 'transfer', 'heat']
['research', 'network', 'social', 'data', 'understand', 'brain', 'funct

In [22]:
# Hand-written label for each of the 20 topics, keyed by topic index.
topicLabels = dict(enumerate([
    'Cellular biology',
    'Environment',
    'Risk management',
    'Chemistry',
    'Data applications',
    'Lab environment',
    'Signal processing',
    'Business',
    'Physics',
    'Machine learning',
    'Computer language processing',
    'Cancer',
    'Micro/Nano-electronics',
    'Materials',
    'Thermodynamics',
    'Architecture/Design',
    'Biology',
    'Imagery',
    'Electricity/Power',
    'Applied science',
]))

In [10]:
def extractTopics(ldaModel, nWords=10):
    """Return the ``nWords`` top terms of every topic of ``ldaModel``.

    Args:
        ldaModel: trained LDA model exposing ``describeTopics``.
        nWords: number of terms kept per topic.

    Returns:
        A list with one entry per topic; each entry is the list of that
        topic's top terms, looked up in the global ``terms`` vocabulary.
    """
    # Ask the model for only the terms that are kept instead of a ranking of
    # the whole vocabulary for every topic; this also avoids an IndexError
    # when a topic exposes fewer than nWords terms.
    return [
        [terms[index] for index in topic[0]]
        for topic in ldaModel.describeTopics(maxTermsPerTopic=nWords)
    ]

In [11]:
# Keep the 25 top terms of every topic for the keyword-matching step.
courseTopics = extractTopics(ldaModel, 25)
# Use the plain topic indices as labels.
topicLabels = range(len(courseTopics))
# Earlier hand-written labels, currently disabled:
#topicLabels = ['Imagery', 'Scientific methodology', 'Statistics', 'Biology/Chemistry', 'Signal/Image processing', 'Energy', 'Lab environment', 'Maths', 'Stochastic models', 'Programming', 'Business', 'Organization']

In [12]:
# For every course, count how many of its description terms appear among a
# topic's top terms; the topic is assigned when more than 4 terms match.
for _, info in preprocessed.items():
    description = info['description']
    matched = [
        topicLabels[idx]
        for idx, topicTerms in enumerate(courseTopics)
        if sum(1 for word in description if word in topicTerms) > 4
    ]
    print(info['name'], matched)

Composites technology []
Image Processing for Life Science []
Global business environment [2, 7, 19]
Electrochemical nano-bio-sensing and bio/CMOS interfaces []
Structural mechanics (for MT) []
Théorie et critique du projet MA2 (Boltshauser) [5, 19]
Advanced principles and applications of systems biology []
Mass spectrometry [16]
Principles of digital communications [6]
Hardware systems modeling I []
Quantitative systems modeling techniques []
Medical radiation physics [8]
Bio-nano-chip design []
Principles of powder and densification processing [13]
Fundamentals of solid-state materials []
Microeconomics [7]
Polymer chemistry and macromolecular engineering []
Physical chemistry of polymeric materials []
Fundamentals of neuroengineering []
In Silico neuroscience []
Materials selection [2, 7]
Optics laboratories I [5, 8, 9, 11, 19]
Fracture mechanics []
Air pollution and climate change
 []
Philosophy of life sciences I [2, 5, 7, 9, 10, 11, 19]
Lab methods : proteomics [4]
Cementitious m

In [13]:
# Topic matrix of the trained model; the scoring code indexes it as
# [term index][topic index].
tm = ldaModel.topicsMatrix()

In [44]:
# Compute a score for each topic/course pair
# The score is the sum of the bag of words' terms multiplied by their weight in the model's topic matrix
# normalized by the number of words
def computeTopicScores(topicsMatrix, bagOfWords):
    """Score one bag of words against every topic.

    Args:
        topicsMatrix: 2-D array indexed as [term index][topic index].
        bagOfWords: mapping from term to its number of occurrences.

    Returns:
        A 1-D numpy array with one score per topic: the occurrence-weighted
        sum of the term weights, divided by the total number of counted
        occurrences. When none of the vocabulary terms occurs in
        ``bagOfWords`` the scores are all 0 (instead of NaN from a 0/0
        division).
    """
    weights = np.asarray(topicsMatrix)
    nTerms, nTopics = weights.shape
    # Row i of the matrix corresponds to terms[i].
    counts = np.array(
        [bagOfWords.get(terms[i], 0) for i in range(nTerms)], dtype=float
    )
    wordCount = counts.sum()
    if wordCount == 0:
        return np.zeros(nTopics)
    # One matrix-vector product replaces the nested Python loops.
    return counts.dot(weights) / wordCount

def associateTopicScores(topicsMatrix):
    """Compute the topic scores of every preprocessed course.

    Returns a list with one ``(course name, ranking)`` tuple per course, where
    ``ranking`` is a list of ``(topic index, score)`` tuples sorted by
    increasing score.
    """
    nTopics = topicsMatrix.shape[1]
    res = []
    for course in preprocessed.values():
        scores = computeTopicScores(topicsMatrix, course['description'])
        ranking = sorted(zip(range(nTopics), scores), key=lambda pair: pair[1])
        res.append((course['name'], ranking))
    return res

def printCourse(course, thresh = 10):
    """Print a course name followed by the label(s) of its topics.

    ``course`` is a ``(name, ranking)`` pair where ``ranking`` holds
    ``(topic index, score)`` tuples. Every topic scoring above ``thresh`` is
    listed, best score first. When no topic passes the threshold, the label of
    the last entry of ``ranking`` is printed on its own (the best topic when
    the ranking is sorted by increasing score, as associateTopicScores does).
    """
    name, ranking = course[0], course[1]
    above = [pair for pair in ranking if pair[1] > thresh]
    if not above:
        print(name, "->", topicLabels[ranking[-1][0]])
        return
    above.sort(key=lambda pair: pair[1], reverse=True)
    print(name, "->", [topicLabels[pair[0]] for pair in above])

In [15]:
# Score every course against every topic of the model.
courseToScores = associateTopicScores(tm)

In [47]:
# Print the topic label(s) of every course, using the default threshold.
for c in courseToScores:
    printCourse(c)

Composites technology -> ['Cellular biology', 'Applied science']
Image Processing for Life Science -> ['Imagery', 'Lab environment']
Global business environment -> ['Business', 'Risk management']
Electrochemical nano-bio-sensing and bio/CMOS interfaces -> Imagery
Structural mechanics (for MT) -> ['Risk management']
Théorie et critique du projet MA2 (Boltshauser) -> ['Lab environment', 'Applied science']
Advanced principles and applications of systems biology -> ['Biology', 'Physics', 'Machine learning', 'Lab environment', 'Cellular biology']
Mass spectrometry -> ['Biology', 'Risk management']
Principles of digital communications -> ['Signal processing']
Hardware systems modeling I -> Thermodynamics
Quantitative systems modeling techniques -> ['Lab environment', 'Business', 'Architecture/Design', 'Physics', 'Applied science']
Medical radiation physics -> ['Imagery', 'Physics']
Bio-nano-chip design -> ['Imagery']
Principles of powder and densification processing -> Applied science
Fundam

Landmark Papers in Cancer and Infection -> ['Cellular biology', 'Applied science', 'Biology', 'Lab environment']
Advanced analog and RF integrated circuits design I -> ['Micro/Nano-electronics', 'Signal processing']
Advanced control systems -> ['Lab environment']
2D Layered Materials: Synthesis, Properties and Applications -> ['Imagery']
Game Theory -> Business
Linear system theory -> ['Signal processing', 'Lab environment']
Discrete mathematics -> ['Signal processing', 'Risk management', 'Business']
Emerging Nanopatterning Methods -> Imagery
Project in information technologies -> ['Lab environment', 'Imagery']
Graph theory -> ['Signal processing', 'Micro/Nano-electronics', 'Business']
Ceramics, structures and properties   TP -> ['Cellular biology', 'Risk management', 'Lab environment']
Strategic marketing & technology commercialization -> ['Risk management', 'Business', 'Lab environment', 'Applied science']
Lab immersion in industry A -> ['Lab environment']
Topics in theoretical compu

## Exercise 4.9: Dirichlet hyperparameters

## Exercise 4.10: EPFL's taught subjects

## Exercise 4.11: Wikipedia structure

In [20]:
# Read the Wikipedia-for-schools file line by line and parse each line with
# json.loads. Note that this rebinds `rdd`, which previously held the course corpus.
rdd = sc.textFile("/ix/wikipedia-for-schools.txt").map(json.loads)

In [35]:
def seq(a,b):
    """Add every token of document ``b`` to the set ``a`` and return ``a``."""
    a.update(b['tokens'])
    return a
def comb(a,b):
    """Return a new set holding the elements of both ``a`` and ``b``."""
    merged = a.union(b)
    return merged
# Collect the set of all distinct tokens across the documents.
wikiTerms = rdd.aggregate(set(), seq, comb)

In [56]:
# Fix an alphabetical order for the vocabulary and map each term to its index.
wikiTermsList = sorted(wikiTerms)
wikiTermsDict = {term: index for index, term in enumerate(wikiTermsList)}

In [57]:
# Sanity check: look up the vocabulary index of an example term.
print(wikiTermsDict['king'])

227108


In [42]:
# Grab the first document of the RDD for inspection.
elD = rdd.take(1)[0]

In [74]:
# Checks if given word is a number
def is_number(word):
    """Return True when ``word`` is a numeric literal.

    A word counts as a number when ``float`` can parse it and it contains at
    least one digit. The digit requirement keeps ordinary words that ``float``
    also accepts ("nan", "inf", "infinity") from being treated as numbers.
    """
    try:
        float(word)
    except ValueError:
        return False
    return any(char.isdigit() for char in str(word))


# If the word passes the filters and should be in the dataset, returns True, False otherwise
def filter_word(word):
    """Tell whether ``word`` should be kept in the dataset.

    A word is rejected when it is shorter than 2 characters, when it is made
    only of punctuation characters, or when it is a number.
    """
    # Words of len 1
    if len(word) < 2:
        return False
    # Removing words in stopwords
    #if word in stopwords:
        #return False
    # Removing words made only of punctuation signs. A per-character test is
    # needed: `word in string.punctuation` is a substring test and lets
    # tokens such as "..." or "--" through.
    if all(char in string.punctuation for char in word):
        return False
    # Removing numbers
    if is_number(word):
        return False
    return True

In [75]:
def createBOW(termList):
    """Count the tokens of a document that pass ``filter_word``.

    ``termList`` is a record with a 'tokens' sequence; the result maps every
    kept token to its number of occurrences.
    """
    counts = {}
    for token in filter(filter_word, termList['tokens']):
        counts[token] = counts.get(token, 0) + 1
    return counts

def createSparseTermVector(bow):
    """Convert a bag of words into a sparse count vector over the vocabulary.

    Args:
        bow: mapping from term to its number of occurrences; every term must
            be a key of the global ``wikiTermsDict``.

    Returns:
        A sparse vector whose dimension is the vocabulary size and whose
        non-zero entries are the counts, at the vocabulary index of each term.
    """
    # Sort by vocabulary index so the indices are passed in increasing order.
    entries = sorted((wikiTermsDict[term], count) for term, count in bow.items())
    indices = [index for index, _ in entries]
    values = [count for _, count in entries]
    # The first argument is the dimension of the vector (vocabulary size),
    # not the number of non-zero entries.
    return Vectors.sparse(len(wikiTermsDict), indices, values)

def mapFunc(el):
    """Turn one document record into a ``[page_id, sparse term vector]`` pair."""
    bagOfWords = createBOW(el)
    return [el['page_id'], createSparseTermVector(bagOfWords)]

# Vectorize every document; persist() asks Spark to keep the computed vectors
# around for reuse.
vectorsRDD = rdd.map(mapFunc).persist()

In [None]:
# Train a 30-topic LDA model on the document vectors.
wiki_ldaModel = LDA.train(vectorsRDD, k=30, docConcentration=2.01, topicConcentration=1.5)

In [None]:
# Ranked terms of every topic.
# NOTE(review): called without maxTermsPerTopic although only the first 10
# terms per topic are printed afterwards — consider limiting it.
topDesc = wiki_ldaModel.describeTopics()

In [None]:
def topWords(ldaTopicDesc, wordList, nWords=10):
    """Print the first ``nWords`` words of every topic description.

    Args:
        ldaTopicDesc: iterable of topic descriptions whose first element is
            the sequence of term indices of the topic, best term first.
        wordList: sequence mapping a term index to its word.
        nWords: maximum number of words printed per topic (default 10).

    One list is printed per topic. A topic with fewer than ``nWords`` terms
    prints all of its terms instead of raising an IndexError.
    """
    for topic in ldaTopicDesc:
        print([wordList[index] for index in topic[0][:nWords]])

In [None]:
# Print the top words of every topic of the Wikipedia model.
topWords(topDesc, wikiTermsList)