# SIMULATING DATA

In [None]:
import numpy as np 
import pandas as pd 
import os
import matplotlib.pyplot as plt
np.random.seed(0)

## DICTIONARY

In [None]:
# Simulated vocabulary: 50 sport terms followed by 50 music terms (100 entries).
# Every entry must be a single whitespace-free token: documents are built by
# joining sampled entries with spaces and are later re-tokenised, so an entry
# containing a space would be split into two different words.
dictionary = [
    # --- sport terms (positions 0-49) ---
    'Athletics', 'Archery', 'Badminton', 'Baseball', 'Basketball',
    'Bobsleigh', 'Bowling', 'Boxing', 'Canoeing', 'Cycling',
    'Climbing', 'Cricket', 'Diving', 'Equestrian', 'Fencing',
    'Fishing', 'Football', 'Golf', 'Gymnastics', 'Handball',
    'Hiking', 'Hockey', 'Hunting', 'Judo', 'Karate',
    'Kayaking', 'Lacrosse', 'Motorsports', 'Paddleboarding', 'Paintball',
    'Paragliding', 'Powerlifting', 'Rafting', 'Rockclimbing', 'Rowing',
    'Rugby', 'Running', 'Sailing', 'Shooting', 'Skateboarding',
    'Skating', 'Skeetshooting', 'Skiing', 'Skydiving', 'Snowboarding',
    'Soccer', 'Squash', 'Surfcasting', 'Surfing', 'Swimming',
    # --- music terms (positions 50-99) ---
    'Note', 'Chord', 'Harmony', 'Melody', 'Rhythm',
    'Beat', 'Tempo', 'Pitch', 'Key', 'Scale',
    'Interval', 'Tune', 'Composition', 'Song', 'Lyrics',
    'Instrument', 'Piano', 'Guitar', 'Bass', 'Drums',
    'Violin', 'Cello', 'Trumpet', 'Saxophone', 'Clarinet',
    'Flute', 'Harp', 'Accordion', 'Banjo', 'Mandolin',
    'Ukulele', 'Harmonica', 'Synthesizer', 'Sampler', 'Mixer',
    'DJ', 'Producer', 'Conductor', 'Singer', 'Vocalist',
    # Was 'Backing Singer'; written as one word (like 'Rockclimbing') so it
    # stays a single token and is not confused with 'Singer'.
    'Backingsinger', 'Choir', 'Ensemble', 'Band', 'Soloist',
    'Improvisation', 'Jazz', 'Blues', 'Rock', 'Pop',
]

In [None]:
len(dictionary)

In [None]:
# Normalise every vocabulary entry to lower case.
dictionary = list(map(str.lower, dictionary))

In [None]:
# Store the vocabulary as a NumPy array; it is the population that
# np.random.choice samples from when the documents are generated.
dictionary = np.array(dictionary)

In [None]:
dictionary

## TOPICS - DISTRIBUTION OVER WORDS

A topic is a distribution over words, so we now build two vectors that represent two such distributions.

### SPORT DISTRIBUTION

90% of the probability mass is on the first 10 words of the dictionary.

The scale parameter of the exponential distribution is $\beta = 1 / \lambda$ (here $\beta = 1$).

In [None]:
# Head of the sport topic: 10 exponential draws (scale 1), rescaled so that
# together they carry a probability mass of 0.9.
head_draws = np.random.exponential(1, size=10)
ten = head_draws / head_draws.sum() * 0.9

# Tail of the sport topic: 90 exponential draws (scale 1), rescaled so that
# together they carry the remaining mass of 0.1.
tail_draws = np.random.exponential(1, size=90)
ninety = tail_draws / tail_draws.sum() * 0.1


In [None]:
ten.sum(), ninety.sum()

##### Check that it sums to 1

In [None]:
# Sport topic over the whole dictionary: the 10 heavy entries first,
# followed by the 90 light ones.
b_sport = np.concatenate((ten, ninety), axis=0)
# Cell output: the total mass (0.9 + 0.1, up to floating-point rounding).
np.sum(b_sport)

In [None]:
b_sport 

In [None]:
# Line plot of the sport topic: probability of each word against its position
# in the dictionary. numpy and pyplot are already imported in the first cell,
# so they are not re-imported here.
fig, ax = plt.subplots()
ax.plot(range(len(b_sport)), b_sport, marker='o')

# Title and axis labels
ax.set_title('sport_distribution')
ax.set_xlabel('Position in the words vector')
ax.set_ylabel('Probability that the word will occur')

# Render the figure
plt.show()

### MUSIC DISTRIBUTION

90% of the probability mass is on the last 10 words of the dictionary.

In [None]:
# Light part of the music topic: 90 exponential draws (scale 1), rescaled to a
# total mass of 0.1. Note that `eighty` holds 90 values despite its name.
light_draws = np.random.exponential(1, size=90)
eighty = light_draws / light_draws.sum() * 0.1

# Heavy part of the music topic: 10 exponential draws (scale 1), rescaled to a
# total mass of 0.9. Note that `twenty` holds 10 values despite its name.
heavy_draws = np.random.exponential(1, size=10)
twenty = heavy_draws / heavy_draws.sum() * 0.9

In [None]:
eighty.sum(),twenty.sum()

##### Check that it sums to 1

In [None]:
# Music topic over the whole dictionary: the 90 light entries first,
# followed by the 10 heavy ones.
b_music = np.concatenate((eighty, twenty), axis=0)
# Cell output: the total mass (0.1 + 0.9, up to floating-point rounding).
np.sum(b_music)

In [None]:
b_music

In [None]:
# Line plot of the music topic: probability of each word against its position
# in the dictionary.
fig, ax = plt.subplots()
ax.plot(range(len(b_music)), b_music, marker='o')

# Title and axis labels
ax.set_title('music_distribution')
ax.set_xlabel('Position in the words vector')
ax.set_ylabel('Probability that the word will occur')

# Render the figure
plt.show()

In [None]:
# Show the container types of the two topic vectors and of the vocabulary.
# The bare tuple is displayed as the cell output; wrapping each item in
# print() additionally echoed a meaningless (None, None, None).
type(b_music), type(b_sport), type(dictionary)

In [None]:
# Overlay both topic distributions on one set of axes
# (pyplot is already imported in the first cell).
fig, ax = plt.subplots()

# Sport topic in red, music topic in blue
ax.plot(range(len(b_sport)), b_sport, 'r', label='Sport')
ax.plot(range(len(b_music)), b_music, 'b', label='Music')

# Axis labels and title
ax.set_xlabel('Index in Dictionary')
ax.set_ylabel('Probability')
ax.set_title('Probability Distributions')

# Legend built from the `label` of each line
ax.legend()

# Render the figure
plt.show()


##### MOST PROBABLE WORDS UNDER BOTH DISTRIBUTIONS

In [None]:
import numpy as np
import pandas as pd

def most_probable_words(words, probabilities, top_n=10):
    """Return the ``top_n`` most probable words, most probable first.

    Parameters
    ----------
    words : sequence of str
        Vocabulary entries.
    probabilities : sequence of float
        Probability of each entry of ``words``, in the same order.
    top_n : int, default 10
        How many words to return. If fewer words are supplied, all of them
        are returned.

    Returns
    -------
    list
        The selected words, ordered by decreasing probability.
    """
    ranked = pd.DataFrame({'word': words, 'probability': probabilities})

    # A stable sort keeps the ordering of equally probable words deterministic.
    ranked = ranked.sort_values(by='probability', ascending=False, kind='mergesort')

    return ranked['word'].head(top_n).tolist()


In [None]:
most_probable_words(dictionary, b_sport )

In [None]:
most_probable_words(dictionary, b_music )

# GENERATING DOCUMENTS

150 documents

In [None]:
K = ['Sport', 'Music']  # topic labels
phi = [0.3, 0.7]        # probability of picking each topic, in the order of K
n = 100                 # number of words per document

# Generated documents, one space-separated string per document
documents = []

for doc_idx in range(150):
    # Pick the single topic this document is written about
    chosen_topic = np.random.choice(K, p=phi)
    chosen_probs = b_sport if chosen_topic == 'Sport' else b_music

    # Draw all n words of the document in one vectorised call instead of
    # n separate np.random.choice calls.
    # NOTE(review): this should consume the seeded random stream in the same
    # order as the per-word loop; confirm if bit-for-bit reproducibility with
    # previously generated corpora matters.
    doc_words = np.random.choice(dictionary, size=n, p=chosen_probs)
    documents.append(' '.join(doc_words))

In [None]:
documents[:10]

1000 documents

In [None]:
K = ['Sport', 'Music']  # topic labels
phi = [0.3, 0.7]        # probability of picking each topic, in the order of K
n = 100                 # number of words per document

# Generated documents, one space-separated string per document
long_documents = []

for doc_idx in range(1000):
    # Pick the single topic this document is written about
    chosen_topic = np.random.choice(K, p=phi)
    chosen_probs = b_sport if chosen_topic == 'Sport' else b_music

    # Draw all n words of the document in one vectorised call instead of
    # n separate np.random.choice calls.
    # NOTE(review): this should consume the seeded random stream in the same
    # order as the per-word loop; confirm if bit-for-bit reproducibility with
    # previously generated corpora matters.
    doc_words = np.random.choice(dictionary, size=n, p=chosen_probs)
    long_documents.append(' '.join(doc_words))

# DOCUMENT TERM MATRIX

In [None]:
import pandas as pd

In [None]:
# Create DataFrame
short = pd.DataFrame(documents, columns=['documents'])
# Print DataFrame
# print(short)

In [None]:
# One-column frame holding the 1000-document corpus, one document per row.
long = pd.DataFrame({'documents': long_documents})

short DTM:

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts for the short corpus: one row per document, one column
# per token.
vectorizer = CountVectorizer()

# Learn the vocabulary and build the (sparse) document-term matrix in one pass.
X = vectorizer.fit_transform(short['documents'])

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement.
# NOTE(review): the vocabulary is learned from the corpus, so the columns follow
# CountVectorizer's own ordering and only cover tokens that actually occur.
# They are not guaranteed to line up with the order of `dictionary` (i.e. the
# rows of beta_original.csv) -- confirm the downstream EM code accounts for it.
short_dtm = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Rich (truncated) display of the matrix as the cell output
short_dtm

long DTM:

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts for the long corpus: one row per document, one column
# per token.
vectorizer = CountVectorizer()

# Learn the vocabulary and build the (sparse) document-term matrix in one pass.
X = vectorizer.fit_transform(long['documents'])

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement.
# NOTE(review): the vocabulary is learned from the corpus, so the columns follow
# CountVectorizer's own ordering and only cover tokens that actually occur.
# They are not guaranteed to line up with the order of `dictionary` (i.e. the
# rows of beta_original.csv) -- confirm the downstream EM code accounts for it.
long_dtm = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Rich (truncated) display of the matrix as the cell output
long_dtm

Now we want to save both matrices and the topics for further use in the EM algorithm.

In [None]:
# Write the short-corpus document-term matrix to CSV without the row index.
# short_dtm is already a DataFrame, so it is saved directly.
short_dtm.to_csv('short_dtm.csv', index=False)


In [None]:
# Write the long-corpus document-term matrix to CSV without the row index.
# long_dtm is already a DataFrame, so it is saved directly.
long_dtm.to_csv('long_dtm.csv', index=False)

In [None]:
# Place the two topic distributions side by side: one row per dictionary
# position, column 0 = sport, column 1 = music.
combined_array = np.column_stack((b_sport, b_music))

# Save to a comma-separated file
np.savetxt("beta_original.csv", combined_array, delimiter=",")
