# Synthetic Data Generation

In [None]:
import numpy as np
import pandas as pd
import tqdm

from topicnet.cooking_machine import Dataset

## Generating Texts

In [None]:
# Number of topic-specific words generated for every topic
NUM_HIGHLY_RELATED_WORDS = 10
# Total size of a topic; for now a topic consists only of its highly related
# words (background words are still a TODO further down)
NUM_WORDS_IN_TOPIC = NUM_HIGHLY_RELATED_WORDS

# Length of every generated document, in words
NUM_WORDS_IN_DOCUMENT = 1000
# Probability given to the main topic of a document
# when the document also has secondary topics
MAIN_TOPIC_MINIMUM_PROBABILITY = 0.8

In [None]:
# The total number of documents is fixed here.
# An alternative would be to fix the number of documents per topic
# and let the total number of documents vary.

NUM_DOCUMENTS = 1000

NUM_TOPICS = 100

# Every topic is the main topic of an equal share of the documents,
# so the documents must split between the topics without a remainder
NUM_DOCUMENTS_IN_TOPIC = NUM_DOCUMENTS // NUM_TOPICS
assert NUM_DOCUMENTS % NUM_TOPICS == 0

# A document mixes three topics, or fewer when there are fewer topics overall
NUM_TOPICS_IN_DOCUMENT = NUM_TOPICS if NUM_TOPICS < 3 else 3

In [None]:
# Every topic of a document except its main one counts as "secondary"
NUM_SECONDARY_TOPICS_IN_DOCUMENT = NUM_TOPICS_IN_DOCUMENT - 1

if NUM_SECONDARY_TOPICS_IN_DOCUMENT <= 0:
    # No secondary topics: every word comes from the main topic
    MAIN_TOPIC_PROBABILITY, SECONDARY_TOPIC_PROBABILITY = 1.0, 0.0
else:
    # The main topic takes its fixed share; the remaining probability mass
    # is split evenly between the secondary topics
    MAIN_TOPIC_PROBABILITY = MAIN_TOPIC_MINIMUM_PROBABILITY
    SECONDARY_TOPIC_PROBABILITY = (
        (1.0 - MAIN_TOPIC_MINIMUM_PROBABILITY) / NUM_SECONDARY_TOPICS_IN_DOCUMENT
    )

In [None]:
# Topic names: 't_' followed by the topic index, zero-padded to width 2
topics = [f't_{i:02}' for i in range(NUM_TOPICS)]

In [None]:
# Peek at the first ten topic names
topics[:10]

['t_00',
 't_01',
 't_02',
 't_03',
 't_04',
 't_05',
 't_06',
 't_07',
 't_08',
 't_09']

Defining each topic's words

In [None]:
# Maps a topic name to the list of words belonging to that topic.
# The topic name is embedded in each word, so no word is shared between topics.
# TODO: add background words, NUM_BACKGROUND_WORDS
topic_words = {
    t: [f'w__{t}__{i}' for i in range(NUM_HIGHLY_RELATED_WORDS)]
    for t in topics
}

In [None]:
# Words generated for the first topic
topic_words[topics[0]]

['w__t_00__0',
 'w__t_00__1',
 'w__t_00__2',
 'w__t_00__3',
 'w__t_00__4',
 'w__t_00__5',
 'w__t_00__6',
 'w__t_00__7',
 'w__t_00__8',
 'w__t_00__9']

In [None]:
# Sanity check: the number of topic names equals NUM_TOPICS
len(topics)

100

Constructing the documents

In [None]:
# A dedicated, seeded generator makes the synthetic corpus reproducible:
# re-running this cell yields the same documents (and therefore the same Theta)
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

# Document ids, in generation order
documents = []

# TODO: memory inefficient
# document id -> list of the document's words
document_texts = {}
# document id -> [main topic, secondary topics...] (the main topic always goes first)
document_topics = {}

document_index = 0


for main_topic_index, main_topic in tqdm.tqdm(
        enumerate(topics), total=len(topics)):

    topics_except_main = topics[:main_topic_index] + topics[main_topic_index+1:]

    for _ in range(NUM_DOCUMENTS_IN_TOPIC):
        # Secondary topics are distinct and never repeat the main topic.
        # .tolist() converts numpy strings to plain Python str
        secondary_topics = rng.choice(
            topics_except_main,
            max(NUM_TOPICS_IN_DOCUMENT - 1, 0),
            replace=False
        ).tolist()

        doc_topics = [main_topic] + secondary_topics
        topic_probabilities = (
            [MAIN_TOPIC_PROBABILITY]
            + [SECONDARY_TOPIC_PROBABILITY] * len(secondary_topics)
        )

        # The topic of every word position in the document
        word_topics = rng.choice(
            doc_topics, NUM_WORDS_IN_DOCUMENT, p=topic_probabilities)

        # Words are sampled uniformly from the topic's word list,
        # with one vectorized call per topic instead of one call per word
        doc = np.empty(NUM_WORDS_IN_DOCUMENT, dtype=object)

        for current_topic in doc_topics:
            # TODO: add probabilities, for background words
            positions = word_topics == current_topic
            doc[positions] = rng.choice(
                topic_words[current_topic], int(positions.sum()))

        document_id = f'd_{document_index:04}'

        documents.append(document_id)
        document_texts[document_id] = doc.tolist()
        document_topics[document_id] = doc_topics

        document_index += 1

100%|██████████| 100/100 [00:22<00:00,  4.42it/s]


In [None]:
# Ids of the first five generated documents
documents[:5]

['d_0000', 'd_0001', 'd_0002', 'd_0003', 'd_0004']

In [None]:
# Topics of the first generated document (its main topic goes first)
document_topics[documents[0]]

['t_00', 't_34', 't_08']

In [None]:
# First ten words of the first generated document
document_texts[documents[0]][:10]

['w__t_08__1',
 'w__t_00__4',
 'w__t_00__9',
 'w__t_00__3',
 'w__t_00__7',
 'w__t_00__5',
 'w__t_00__0',
 'w__t_00__1',
 'w__t_00__2',
 'w__t_00__6']

## Phi & Theta Matrices

All the unique words (of all topics)

In [None]:
# Flatten the per-topic word lists into one list, keeping the topic order
vocabulary = [
    word
    for words_of_topic in topic_words.values()
    for word in words_of_topic
]

In [None]:
# Total number of words over all topics
len(vocabulary)

1000

Filling a word-topic matrix Phi

In [None]:
# Phi starts as an all-zero float matrix: one row per word, one column per topic
phi_shape = (len(vocabulary), len(topics))

phi = pd.DataFrame(
    np.zeros(phi_shape, dtype=float),
    index=vocabulary,
    columns=topics,
)

In [None]:
# Expected: (len(vocabulary), len(topics))
phi.shape

(1000, 100)

In [None]:
# Phi is still all zeros at this point
phi.head()

Unnamed: 0,t_00,t_01,t_02,t_03,t_04,t_05,t_06,t_07,t_08,t_09,...,t_90,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99
w__t_00__0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
w__t_00__1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
w__t_00__2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
w__t_00__3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
w__t_00__4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Words of a topic are equiprobable within that topic;
# topic_words was built from `topics`, so it holds the same topics in the same order
for topic, words in topic_words.items():
    phi.loc[words, topic] = 1.0 / len(words)

# TODO:  + background words

In [None]:
# Fifteen rows are enough to see all the words of the first topic
# and the beginning of the second one
phi.head(15)

Unnamed: 0,t_00,t_01,t_02,t_03,t_04,t_05,t_06,t_07,t_08,t_09,...,t_90,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99
w__t_00__0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
w__t_00__1,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
w__t_00__2,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
w__t_00__3,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
w__t_00__4,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
w__t_00__5,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
w__t_00__6,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
w__t_00__7,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
w__t_00__8,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
w__t_00__9,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


...and a topic-document matrix Theta

In [None]:
# Theta starts as an all-zero matrix: one row per topic, one column per document
theta_shape = (len(topics), len(documents))

theta = pd.DataFrame(
    np.zeros(theta_shape),
    index=topics,
    columns=documents,
)

In [None]:
# Theta is still all zeros at this point
theta.head()

Unnamed: 0,d_0000,d_0001,d_0002,d_0003,d_0004,d_0005,d_0006,d_0007,d_0008,d_0009,...,d_0990,d_0991,d_0992,d_0993,d_0994,d_0995,d_0996,d_0997,d_0998,d_0999
t_00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
t_01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
t_02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
t_03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
t_04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Topics of the first document: the main topic first, then the secondary ones
document_topics['d_0000']

['t_00', 't_34', 't_08']

In [None]:
# The first topic in each list is the main one, the others are secondary,
# so the probabilities are laid out in the same order
for document_id, topics_of_document in document_topics.items():
    num_secondary_topics = len(topics_of_document) - 1
    theta.loc[topics_of_document, document_id] = (
        [MAIN_TOPIC_PROBABILITY]
        + [SECONDARY_TOPIC_PROBABILITY] * num_secondary_topics
    )

In [None]:
# Theta after the topic probabilities have been written in
theta.head()

Unnamed: 0,d_0000,d_0001,d_0002,d_0003,d_0004,d_0005,d_0006,d_0007,d_0008,d_0009,...,d_0990,d_0991,d_0992,d_0993,d_0994,d_0995,d_0996,d_0997,d_0998,d_0999
t_00,0.8,0.8,0.8,0.8,0.8,0.8,0.8,0.8,0.8,0.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
t_01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
t_02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
t_03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
t_04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Rows of the topics that have non-zero probability in document d_0000
theta[theta['d_0000'] > 0]

Unnamed: 0,d_0000,d_0001,d_0002,d_0003,d_0004,d_0005,d_0006,d_0007,d_0008,d_0009,...,d_0990,d_0991,d_0992,d_0993,d_0994,d_0995,d_0996,d_0997,d_0998,d_0999
t_00,0.8,0.8,0.8,0.8,0.8,0.8,0.8,0.8,0.8,0.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
t_08,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
t_34,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Saving the matrices

In [None]:
# Relative path: the file is written to the current working directory;
# the number of topics is part of the file name
phi.to_csv(f'phi_{len(topics)}.csv')

In [None]:
# Relative path: the file is written to the current working directory;
# the number of topics is part of the file name
theta.to_csv(f'theta_{len(topics)}.csv')

## Dataset

Here we build a TopicNet `Dataset` from the generated documents

In [None]:
# Modality label placed after the '|' separator in every 'vw_text' line
DEFAULT_MODALITY = '@text'

In [None]:
# Every document's words joined by spaces; reused for both text columns
joined_texts = [' '.join(document_texts[d]) for d in documents]

# One row per document: its id, the plain text,
# and the same text prefixed with the document id and the modality label
dataset = pd.DataFrame({
    'id': documents,
    'raw_text': joined_texts,
    'vw_text': [
        f'{d} |{DEFAULT_MODALITY} ' + text
        for d, text in zip(documents, joined_texts)
    ],
})

In [None]:
# Expected: (number of documents, 3 columns)
dataset.shape

(1000, 3)

In [None]:
# First rows of the dataset table
dataset.head()

Unnamed: 0,id,raw_text,vw_text
0,d_0000,w__t_08__1 w__t_00__4 w__t_00__9 w__t_00__3 w_...,d_0000 |@text w__t_08__1 w__t_00__4 w__t_00__9...
1,d_0001,w__t_00__8 w__t_00__6 w__t_00__1 w__t_63__8 w_...,d_0001 |@text w__t_00__8 w__t_00__6 w__t_00__1...
2,d_0002,w__t_00__9 w__t_00__5 w__t_00__3 w__t_38__0 w_...,d_0002 |@text w__t_00__9 w__t_00__5 w__t_00__3...
3,d_0003,w__t_00__5 w__t_87__4 w__t_00__6 w__t_36__3 w_...,d_0003 |@text w__t_00__5 w__t_87__4 w__t_00__6...
4,d_0004,w__t_00__0 w__t_00__7 w__t_21__8 w__t_21__6 w_...,d_0004 |@text w__t_00__0 w__t_00__7 w__t_21__8...


In [None]:
# First 250 characters of the first row's column 1, i.e. 'raw_text'
dataset.iloc[0, 1][:250]

'w__t_08__1 w__t_00__4 w__t_00__9 w__t_00__3 w__t_00__7 w__t_00__5 w__t_00__0 w__t_00__1 w__t_00__2 w__t_00__6 w__t_00__9 w__t_00__3 w__t_34__1 w__t_00__3 w__t_08__8 w__t_34__8 w__t_00__1 w__t_00__6 w__t_00__8 w__t_00__3 w__t_08__7 w__t_08__9 w__t_34_'

In [None]:
# NOTE(review): absolute, user-specific location; the target folder presumably
# has to exist already on the machine running the notebook.
# Consider making the folder configurable.
dataset_path = f'/data/datasets/_tmp_alexeev/SyntheticDataset_{len(topics)}.csv'

# The row index is not written: document ids already live in the 'id' column
dataset.to_csv(dataset_path, index=False)

Checking that everything is OK

In [None]:
# Construct TopicNet's Dataset from the CSV file that was just written
saved_dataset = Dataset(dataset_path)

In [None]:
# NOTE(review): `_data` is an underscore-prefixed (non-public) attribute of Dataset;
# presumably it is the underlying table — confirm a public accessor exists
saved_dataset._data.head()

Unnamed: 0_level_0,id,raw_text,vw_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
d_0000,d_0000,w__t_08__1 w__t_00__4 w__t_00__9 w__t_00__3 w_...,d_0000 |@text w__t_08__1 w__t_00__4 w__t_00__9...
d_0001,d_0001,w__t_00__8 w__t_00__6 w__t_00__1 w__t_63__8 w_...,d_0001 |@text w__t_00__8 w__t_00__6 w__t_00__1...
d_0002,d_0002,w__t_00__9 w__t_00__5 w__t_00__3 w__t_38__0 w_...,d_0002 |@text w__t_00__9 w__t_00__5 w__t_00__3...
d_0003,d_0003,w__t_00__5 w__t_87__4 w__t_00__6 w__t_36__3 w_...,d_0003 |@text w__t_00__5 w__t_87__4 w__t_00__6...
d_0004,d_0004,w__t_00__0 w__t_00__7 w__t_21__8 w__t_21__6 w_...,d_0004 |@text w__t_00__0 w__t_00__7 w__t_21__8...


In [None]:
# Drop the names bound to the in-memory table and to the loaded Dataset
del dataset
del saved_dataset