In [1]:
import collections
import fasttext
import gzip
import json
import math
import matplotlib
import nltk
import os
import pickle
import random
import scipy.sparse
import sys
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import zstandard as zstd

from collections import Counter
from gensim.models.coherencemodel import CoherenceModel
from joblib import dump, load
from langdetect import detect
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import LongType, StructField, StructType
from pyspark.ml.clustering import LDA, LDAModel, LocalLDAModel
from pyspark.ml.linalg import Vectors, SparseVector
from scipy.sparse import dok_matrix
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.neural_network import MLPClassifier

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/olam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Get top 5 terms per topic with one intruder

In [2]:
# Spark configuration: master URL first, then the memory-related settings as key/value pairs.
conf = SparkConf().setMaster("local[4]")
conf = conf.setAll([
    ('spark.executor.memory', '2g'),
    ('spark.driver.memory', '8g'),
    ('spark.driver.maxResultSize', '0'),
])

# Create the session (or reuse an already running one) with that configuration.
spark = (
    SparkSession.builder
    .appName("LDA_topicmodelling")
    .config(conf=conf)
    .getOrCreate()
)

### Functions

In [3]:
def get_topic_terms(n_topic, use_bigram):
    '''Return a dictionary of topic_id as keys and a list of terms as values.

    Parameters
    ----------
    n_topic : int
        Number of topics of the model; selects the file 'describe_topics_<n_topic>.json'
    use_bigram : bool
        If True, read the files under 'with_ngram', otherwise those under 'final_res'

    Returns
    ----------
    dict
        Maps the 0-based position of each topic (rows sorted by their 'topic' column)
        to its list of terms, in the order of the row's termIndices (presumably by
        descending weight in the topic -- confirm against the code that wrote the file).
        A topic without any term index maps to an empty list.
    '''

    filename = 'describe_topics_' + str(n_topic) + '.json'

    if use_bigram:
        base_dir = '/dlabdata1/youtube_large/olam/data/with_ngram/'
    else:
        base_dir = '/dlabdata1/youtube_large/olam/data/final_res/'

    path_file = base_dir + 'describe_topics/' + filename
    path_id2word = base_dir + 'id2word_top20.pickle'

    # NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
    # The context manager closes the file, so no explicit close() is needed.
    with open(path_id2word, 'rb') as f:
        id2word = pickle.load(f)

    # `spark` is the notebook-level SparkSession.
    describe_topics = spark.read.json(path_file)
    rows = describe_topics.sort('topic').rdd.collect()

    # Translate the term indices of every topic into the corresponding tokens.
    return {
        i_topic: [id2word[token_id] for token_id in row.termIndices]
        for i_topic, row in enumerate(rows)
    }

In [4]:
def get_top5_terms_per_topic(dict_topic_term):
    '''Return a dictionary of topic_id as keys and the list of the first 5 terms of each topic

    Topics with fewer than 5 terms keep all of their terms.
    '''

    return {
        topic_id: ranked_terms[:5]
        for topic_id, ranked_terms in dict_topic_term.items()
    }

In [5]:
def get_all_top_terms(dict_top5):
    '''Return the set of every term that appears in the top5 list of at least one topic'''

    return {term for top_terms in dict_top5.values() for term in top_terms}

In [6]:
def get_intruders(dict_topic_term, dict_top5, set_all_top_terms):
    '''Return a dictionary of topic_id as keys and the corresponding intruder

    For every topic, the intruder is chosen among the top5 terms of the *other* topics:
    it is the candidate ranked furthest down in the topic's own term list while still
    lying within the first 1% of that list (rank <= len(list) / 100).

    Parameters
    ----------
    dict_topic_term : dict
        topic_id -> list of terms, in ranking order
    dict_top5 : dict
        topic_id -> top5 terms of the topic
    set_all_top_terms : set
        Union of the top5 terms of all topics

    Returns
    ----------
    dict
        topic_id -> intruder term, or '' when no candidate satisfies the rank bound
    '''

    dict_furthest_intruder_per_topic = {}

    for key, ranked_terms in dict_topic_term.items():
        # A topic's own top terms must never be proposed as its intruder.
        candidates = set(set_all_top_terms).difference(dict_top5[key])

        # Rank lookup built once per topic instead of one list.index() scan per candidate.
        # setdefault keeps the first occurrence, which is what list.index() would return.
        rank_of = {}
        for rank, term in enumerate(ranked_terms):
            rank_of.setdefault(term, rank)

        # Only terms within the first 1% of the ranking are eligible.
        max_rank = len(ranked_terms) / 100

        worst_term = ''
        worst_idx = -1

        for term in candidates:
            idx = rank_of.get(term)
            # Candidates missing from this topic's ranking, or ranked too low, are skipped.
            if idx is None or idx > max_rank:
                continue
            if idx > worst_idx:
                worst_idx = idx
                worst_term = term

        dict_furthest_intruder_per_topic[key] = worst_term

    return dict_furthest_intruder_per_topic

In [7]:
def get_data(n_topic, dict_top5, dict_furthest_intruder_per_topic):
    '''Return one row per topic (ids 0 .. n_topic - 1): its top5 terms and its intruder, in random order'''

    rows = []

    for topic_id in range(n_topic):
        # Copy the top terms so that shuffling does not reorder the caller's lists.
        row = [*dict_top5[topic_id], dict_furthest_intruder_per_topic[topic_id]]
        random.shuffle(row)
        rows.append(row)

    return rows

In [8]:
def get_csv_file(data, n_topic, use_bigram):
    '''Write the rows of terms (top5 terms and intruder) to a csv file

    The file is named after the number of topics, with a '_bigram' marker
    when use_bigram is set.
    '''

    suffix = '_bigram.csv' if use_bigram else '.csv'
    path_file = '/home/olam/intruder' + str(n_topic) + suffix

    # One column per displayed term.
    frame = pd.DataFrame(
        data,
        columns=['Term 1', 'Term 2', 'Term 3', 'Term 4', 'Term 5', 'Term 6'],
    )
    frame.to_csv(path_file)

In [9]:
def intruder(n_topic, use_bigram, n_samples=20, seed=None):
    '''Run the intruder pipeline, in order to get a csv file with the top5
    terms and with one intruder and return the dictionary of intruder

    Parameters
    ----------
    n_topic : int
        Number of topics of the model to evaluate
    use_bigram : bool
        Whether to use the model trained with bigrams
    n_samples : int, optional
        Number of topics randomly drawn for the questionnaire (default 20); capped at
        the number of available topics so that small models do not raise
    seed : int, optional
        If given, seeds the `random` module first so that the term shuffling and the
        topic sampling (and hence the saved ground truth) are reproducible

    Returns
    ----------
    dict
        Maps the id of each sampled topic to its intruder, in increasing topic id order
        (the same order as the rows of the csv file)
    '''

    if seed is not None:
        random.seed(seed)

    dict_topic_term = get_topic_terms(n_topic, use_bigram)
    dict_top5 = get_top5_terms_per_topic(dict_topic_term)
    set_all_top_terms = get_all_top_terms(dict_top5)
    dict_furthest_intruder_per_topic = get_intruders(dict_topic_term, dict_top5, set_all_top_terms)
    data = get_data(n_topic, dict_top5, dict_furthest_intruder_per_topic)

    # Sampling from range() yields plain Python ints as dictionary keys.
    n_selected = min(n_samples, len(data))
    random_indices = sorted(random.sample(range(len(data)), n_selected))
    data = [data[i] for i in random_indices]

    get_csv_file(data, n_topic, use_bigram)

    return {idx: dict_furthest_intruder_per_topic[idx] for idx in random_indices}

### Generate the CSV files with intruders

In [10]:
# 55-topic model with bigrams: writes the questionnaire csv and keeps the intruders of the sampled topics.
dict_furthest_intruder_per_topic_55_bigram = intruder(n_topic=55, use_bigram=True)

In [16]:
# 55-topic model without bigrams: writes the questionnaire csv and keeps the intruders of the sampled topics.
dict_furthest_intruder_per_topic_55 = intruder(n_topic=55, use_bigram=False)

In [18]:
# Save the ground truth intruders next to the questionnaire.
# NOTE(review): intruder() draws a new random sample on every call, so re-running this
# cell overwrites the ground truth with one that no longer matches an already filled csv.
with open('/home/olam/intruder/intruder55_abel.pickle', 'wb') as f:
    pickle.dump(dict_furthest_intruder_per_topic_55, f)

In [36]:
# 110-topic model without bigrams: writes the questionnaire csv and keeps the intruders of the sampled topics.
dict_furthest_intruder_per_topic_110 = intruder(n_topic=110, use_bigram=False)

In [37]:
# Save the ground truth intruders next to the questionnaire.
# NOTE(review): intruder() draws a new random sample on every call, so re-running this
# cell overwrites the ground truth with one that no longer matches an already filled csv.
with open('/home/olam/intruder/intruder110_abel.pickle', 'wb') as f:
    pickle.dump(dict_furthest_intruder_per_topic_110, f)

## Analyze the results

In [10]:
def get_accuracy(df, dict_intruder):
    '''
    Compute the fraction of topics for which a user picked the true intruder.

    Answers and ground truth are compared position by position: row i of df is
    matched with the i-th value of dict_intruder (in the dict's iteration order).

    Parameters
    ----------
    df : pandas Dataframe
        Results of the user; must contain an 'Intruder' column
    dict_intruder : dict
        Dictionary that contains the ground truth intruder of each evaluated topic

    Returns
    ----------
    The accuracy of the correct intruder detected by a user

    Raises
    ----------
    ValueError
        If dict_intruder is empty, or if df does not have exactly one row per
        ground truth entry
    '''
    answers = np.array(df['Intruder'])
    expected = np.array(list(dict_intruder.values()))

    if len(expected) == 0:
        raise ValueError('dict_intruder is empty: the accuracy is undefined')

    # Without this check numpy would broadcast a single answer against every expected
    # value (silently wrong accuracy) or fail with an unrelated error message.
    if len(answers) != len(expected):
        raise ValueError(
            'Expected ' + str(len(expected)) + ' answers but got ' + str(len(answers)))

    return (answers == expected).sum() / len(expected)

In [11]:
# Paul

df110_paul = pd.read_csv('/home/olam/intruder_res/intruder110_Paul.csv')

# Ground truth intruders for this questionnaire; the with-block closes the file itself.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('/home/olam/intruder/intruder110_paul.pickle', 'rb') as f:
    dict_intruder110_paul = pickle.load(f)

accuracy_110_paul = get_accuracy(df110_paul, dict_intruder110_paul)
print('Accuracy for Paul with 110 topics: ' + str(accuracy_110_paul))

Accuracy for Paul with 110 topics: 0.3


In [14]:
# Nico

df55_nico = pd.read_csv('/home/olam/intruder_res/intruder55_Nico.csv')

# Ground truth intruders for this questionnaire; the with-block closes the file itself.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('/home/olam/intruder/intruder55_nico.pickle', 'rb') as f:
    dict_intruder55_nico = pickle.load(f)

accuracy_55_nico = get_accuracy(df55_nico, dict_intruder55_nico)
print('Accuracy for Nico with 55 topics: ' + str(accuracy_55_nico))

Accuracy for Nico with 55 topics: 0.25


In [25]:
# Olivier55

df55_olivier = pd.read_csv('/home/olam/intruder_res/intruder55_Olivier.csv')

# Ground truth intruders for this questionnaire; the with-block closes the file itself.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('/home/olam/intruder/intruder55_olivier.pickle', 'rb') as f:
    dict_intruder55_olivier = pickle.load(f)

accuracy_55_olivier = get_accuracy(df55_olivier, dict_intruder55_olivier)
print('Accuracy for Olivier with 55 topics: ' + str(accuracy_55_olivier))

Accuracy for Olivier with 55 topics: 0.55


In [27]:
# Robin

df55_robin = pd.read_csv('/home/olam/intruder_res/intruder55_Robin.csv')

# Ground truth intruders for this questionnaire; the with-block closes the file itself.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('/home/olam/intruder/intruder55_robin.pickle', 'rb') as f:
    dict_intruder55_robin = pickle.load(f)

accuracy_55_robin = get_accuracy(df55_robin, dict_intruder55_robin)
print('Accuracy for Robin with 55 topics: ' + str(accuracy_55_robin))

Accuracy for Robin with 55 topics: 0.6


In [30]:
# Olivier110

df110_olivier = pd.read_csv('/home/olam/intruder_res/intruder110_Olivier.csv')

# Ground truth intruders for this questionnaire; the with-block closes the file itself.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('/home/olam/intruder/intruder110_olivier.pickle', 'rb') as f:
    dict_intruder110_olivier = pickle.load(f)

accuracy_110_olivier = get_accuracy(df110_olivier, dict_intruder110_olivier)
print('Accuracy for Olivier with 110 topics: ' + str(accuracy_110_olivier))

Accuracy for Olivier with 110 topics: 0.35


In [28]:
# Stan55

df55_stan = pd.read_csv('/home/olam/intruder_res/intruder55_Stan.csv')

# Ground truth intruders for this questionnaire; the with-block closes the file itself.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('/home/olam/intruder/intruder55_stan.pickle', 'rb') as f:
    dict_intruder55_stan = pickle.load(f)

accuracy_55_stan = get_accuracy(df55_stan, dict_intruder55_stan)
print('Accuracy for Stan with 55 topics: ' + str(accuracy_55_stan))

Accuracy for Stan with 55 topics: 0.45


In [29]:
# Stan110

df110_stan = pd.read_csv('/home/olam/intruder_res/intruder110_Stan.csv')

# Ground truth intruders for this questionnaire; the with-block closes the file itself.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('/home/olam/intruder/intruder110_stan.pickle', 'rb') as f:
    dict_intruder110_stan = pickle.load(f)

accuracy_110_Stan = get_accuracy(df110_stan, dict_intruder110_stan)
print('Accuracy for Stan with 110 topics: ' + str(accuracy_110_Stan))

Accuracy for Stan with 110 topics: 0.35


In [32]:
# Dani

df55_dani = pd.read_csv('/home/olam/intruder_res/intruder55_Dani.csv')

# Ground truth intruders for this questionnaire; the with-block closes the file itself.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('/home/olam/intruder/intruder55_dani.pickle', 'rb') as f:
    dict_intruder55_dani = pickle.load(f)

accuracy_55_dani = get_accuracy(df55_dani, dict_intruder55_dani)
print('Accuracy for Dani with 55 topics: ' + str(accuracy_55_dani))

Accuracy for Dani with 55 topics: 0.55


In [34]:
# Olivia

df55_olivia = pd.read_csv('/home/olam/intruder_res/intruder55_Olivia.csv')

# Ground truth intruders for this questionnaire; the with-block closes the file itself.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('/home/olam/intruder/intruder55_olivia.pickle', 'rb') as f:
    dict_intruder55_olivia = pickle.load(f)

accuracy_55_olivia = get_accuracy(df55_olivia, dict_intruder55_olivia)
print('Accuracy for Olivia with 55 topics: ' + str(accuracy_55_olivia))

Accuracy for Olivia with 55 topics: 0.4


In [35]:
# Kevin

df55_kevin = pd.read_csv('/home/olam/intruder_res/intruder55_Kevin.csv')

# Ground truth intruders for this questionnaire; the with-block closes the file itself.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('/home/olam/intruder/intruder55_kevin.pickle', 'rb') as f:
    dict_intruder55_kevin = pickle.load(f)

accuracy_55_Kevin = get_accuracy(df55_kevin, dict_intruder55_kevin)
print('Accuracy for Kevin with 55 topics: ' + str(accuracy_55_Kevin))

Accuracy for Kevin with 55 topics: 0.35


In [33]:
# Jalel

df110_jalel = pd.read_csv('/home/olam/intruder_res/intruder110_Jalel.csv')

# Ground truth intruders for this questionnaire; the with-block closes the file itself.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('/home/olam/intruder/intruder110_jalel.pickle', 'rb') as f:
    dict_intruder110_jalel = pickle.load(f)

accuracy_110_jalel = get_accuracy(df110_jalel, dict_intruder110_jalel)
print('Accuracy for Jalel with 110 topics: ' + str(accuracy_110_jalel))

Accuracy for Jalel with 110 topics: 0.3


In [11]:
# Larry

df110_larry = pd.read_csv('/home/olam/intruder_res/intruder110_Larry.csv')

# Ground truth intruders for this questionnaire; the with-block closes the file itself.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('/home/olam/intruder/intruder110_larry.pickle', 'rb') as f:
    dict_intruder110_larry = pickle.load(f)

accuracy_110_larry = get_accuracy(df110_larry, dict_intruder110_larry)
print('Accuracy for Larry with 110 topics: ' + str(accuracy_110_larry))

Accuracy for Larry with 110 topics: 0.45


In [12]:
# Rayane

df110_rayane = pd.read_csv('/home/olam/intruder_res/intruder110_Rayane.csv')

# Ground truth intruders for this questionnaire; the with-block closes the file itself.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('/home/olam/intruder/intruder110_rayane.pickle', 'rb') as f:
    dict_intruder110_rayane = pickle.load(f)

accuracy_110_rayane = get_accuracy(df110_rayane, dict_intruder110_rayane)
print('Accuracy for Rayane with 110 topics: ' + str(accuracy_110_rayane))

Accuracy for Rayane with 110 topics: 0.2


In [11]:
# Abel

df110_abel = pd.read_csv('/home/olam/intruder_res/intruder110_Abel.csv')

# Ground truth intruders for this questionnaire; the with-block closes the file itself.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
# NOTE(review): this same path is written earlier in the notebook from a fresh, randomly
# sampled intruder() run. If that cell was re-run after the csv was filled in, the ground
# truth no longer matches the answers -- a possible cause of a very low accuracy; confirm.
with open('/home/olam/intruder/intruder110_abel.pickle', 'rb') as f:
    dict_intruder110_abel = pickle.load(f)

accuracy_110_abel = get_accuracy(df110_abel, dict_intruder110_abel)
print('Accuracy for Abel with 110 topics: ' + str(accuracy_110_abel))

Accuracy for Abel with 110 topics: 0.05


In [12]:
# Inspect the loaded ground truth (topic id -> intruder term).
dict_intruder110_abel

{9: 'tip',
 12: 'mark',
 13: 'age',
 30: 'famili',
 35: 'old',
 38: 'snow',
 43: 'last',
 45: 'man',
 54: 'custom',
 64: 'class',
 68: 'back',
 72: 'mark',
 76: 'tip',
 79: 'show',
 81: 'last',
 87: 'year',
 95: 'review',
 98: 'fast',
 102: 'reaction',
 107: 'record'}

In [14]:
# Selecting the worst possible intruder among the top5 terms of the other topics
# NOTE(review): the recorded result (0.8909... = 49/55) implies a 55-entry ground truth,
# whereas intruder() as defined in this notebook returns only the sampled topics. This
# cell looks like it was run against an earlier version of the pipeline -- confirm before
# re-running.

df55 = pd.read_csv('/home/olam/intruder55_oli.csv')

accuracy_55_oli = get_accuracy(df55, dict_furthest_intruder_per_topic_55)
print('Accuracy for Olivier with 55 topics: ' + str(accuracy_55_oli))

Accuracy for Olivier with 55 topics: 0.8909090909090909


In [17]:
# Selecting the worst possible intruder among the top5 terms of the other topics
# NOTE(review): this rebinds df55_stan and accuracy_55_stan, which the "Stan55" cell of
# this notebook also assigns; the values seen afterwards depend on which cell ran last.

df55_stan = pd.read_csv('/home/olam/intruder55_stan.csv')

accuracy_55_stan = get_accuracy(df55_stan, dict_furthest_intruder_per_topic_55)
print('Accuracy for Stan with 55 topics: ' + str(accuracy_55_stan))

Accuracy for Stan with 55 topics: 0.6


In [21]:
# Selecting the worst possible intruder among the top5 terms of the other topics
# NOTE(review): the factor 2 is not explained anywhere in this notebook. get_accuracy
# divides by len(dict_intruder), so the doubling presumably compensates for a ground
# truth twice as long as the number of answered questions -- confirm, and prefer
# restricting the ground truth to the answered topics instead of rescaling.

df110 = pd.read_csv('/home/olam/intruder110_oli.csv')

accuracy_110_oli = 2 * get_accuracy(df110, dict_furthest_intruder_per_topic_110)
print('Accuracy for Olivier with 110 topics: ' + str(accuracy_110_oli))

Accuracy for Olivier with 110 topics: 0.7636363636363637
