In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
import ast
import pickle
from tangles.separations import SetSeparationSystem
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.patches import Patch
from movie_genre_tangles.convenience import *
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from tangles.search import TangleSearchTree

In [None]:
# Load the separations together with the mapping from separation id to keyword.
seps, sepidtokeyword = csv_to_setseperationsystem('../data/data_F24.csv')

# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this project.
with open('../orders/orders_O12_K24', 'rb') as order_file:
    order_O12 = pickle.load(order_file)
with open('../results/interesting_umbrella_tm_K24.pkl', 'rb') as tm_file:
    interesting_umbrella_tm = pickle.load(tm_file)

# Position k of `level_order` holds the separation id with the k-th smallest
# order, so indexing with it re-arranges keywords and columns by level.
level_order = np.argsort(order_O12)
lvltokeyword = [sepidtokeyword[sep_id] for sep_id in level_order]
oriented_seps = seps[:, level_order]

In [None]:
# Display the keyword entry of every level, in order of increasing separation order.
lvltokeyword

Checks if any level is associated with more than one keyword:

In [None]:
# Flag every level whose entry has length greater than one, then report
# whether at least one such level exists.
# NOTE(review): assumes each entry is a collection of keywords, not a string — confirm.
has_multiple_keywords = [len(entry) > 1 for entry in lvltokeyword]
np.any(has_multiple_keywords)

In [None]:
# Print the keywords of every tangle (one call per row of the tangle matrix).
n_tangles = interesting_umbrella_tm.shape[0]
for tangle_idx in range(n_tangles):
    printkeywordsfromtm(interesting_umbrella_tm, tangle_idx, lvltokeyword)

Calculates the sizes of the corpora of the tangles:

In [None]:
# corpora[m, t] is 1.0 when row m of `oriented_seps` (a movie) has the value 1
# in at least `min_shared_levels` columns where tangle t also has the value 1,
# and 0.0 otherwise.
min_shared_levels = 3

# 0/1 indicator matrices; their product counts, for every (movie, tangle)
# pair, the columns in which both carry a 1.
row_is_positive = (oriented_seps == 1).astype(int)
tangle_is_positive = (interesting_umbrella_tm == 1).astype(int)
shared_level_counts = row_is_positive @ tangle_is_positive.T

corpora = (shared_level_counts >= min_shared_levels).astype(float)

# Number of movies in the corpus of each tangle.
corpora_sizes = (corpora == 1).sum(axis=0)

How many movies are in the corpus of at least one interesting umbrella tangle?

In [None]:
# A movie counts once if it belongs to the corpus of at least one tangle,
# i.e. if the maximum of its row in `corpora` is 1.
in_any_corpus = corpora.max(axis=1) == 1
in_any_corpus.sum()

Plots the matrix containing the interesting umbrella tangles:

In [None]:
# Plot the tangle matrix: one row per tangle, one column per separation level.
# NOTE(review): the legend assumes the entries are 1 (positive side),
# -1 (negative side) and 0 (neither) — confirm against how the matrix was built.
cmap = plt.get_cmap('viridis')
legend_elements = [
    Patch(facecolor=cmap(1.0), label='positive side'),
    Patch(facecolor=cmap(0.0), label='negative side'),
    Patch(facecolor=cmap(0.5), label='neither'),
]

# Take the number of levels from the matrix itself rather than hard-coding it,
# so the ticks always span exactly the plotted columns.
n_levels = interesting_umbrella_tm.shape[1]

# Pin the colormap and the value range so that -1 / 0 / 1 are always drawn in
# the colours used by the legend, independent of rcParams and of whether all
# three values actually occur in the matrix.
plt.matshow(interesting_umbrella_tm, cmap=cmap, vmin=-1, vmax=1)
ax = plt.gca()
ax.set_aspect('auto')
ax.xaxis.set_label_position('top')
plt.xlabel('level of seperation')
plt.ylabel('tangle')
plt.xticks(ticks=range(0, n_levels, 25), rotation=45)
plt.legend(handles=legend_elements, bbox_to_anchor=(1, 0.98), title='tangle contains')
# NOTE(review): absolute, machine-specific output path; consider a path
# relative to the repository so the notebook runs on other machines.
plt.savefig('/home/ocke/Documents/BA-Tangles/tex/images/tm_O12_F24.png', bbox_inches='tight')

Counts the keywords that no tangle contains on its positive side:

In [None]:
# For every level, count the tangles whose entry at that level is 1; levels
# with a count of zero are not contained positively in any tangle.
positive_counts = (interesting_umbrella_tm == 1).sum(axis=0)
unused_levels = np.flatnonzero(positive_counts == 0)
not_contained_keywords = [lvltokeyword[level] for level in unused_levels]
len(not_contained_keywords)

Calculates the core of each tangle and counts the tangles whose core is non-empty but contains fewer than 3 movies (tangles with an empty core are not counted):

In [None]:
# Core of a tangle: the rows of `oriented_seps` whose minimum over all levels
# the tangle orients positively (entry 1) is 1.
# Only non-empty cores are stored, so tangles with an empty core do not
# contribute to the count at the end of the cell.
cores = {}
for tangle_idx, tangle_row in enumerate(interesting_umbrella_tm):
    positive_levels = np.flatnonzero(tangle_row == 1)
    in_core = oriented_seps[:, positive_levels].min(axis=1) == 1
    core_members = np.flatnonzero(in_core)
    if core_members.size:
        cores[tangle_idx] = core_members

# Number of stored cores with fewer than three members.
small_cores = [members for members in cores.values() if members.size < 3]
len(small_cores)

In [None]:
# Read the movie table and turn the stringified Python literals stored in the
# 'keywords' and 'genres' columns back into Python objects.
data = pd.read_csv('../data/data_F24.csv')
for literal_column in ('keywords', 'genres'):
    data[literal_column] = data[literal_column].apply(ast.literal_eval)

In [None]:
# Count the movies whose genre entry is an empty list.
has_no_genre = [genre_list == [] for genre_list in data['genres']]
np.sum(has_no_genre)

In [None]:
# Print the number of movies, then show for every genre the fraction of all
# movies tagged with it (a movie with several genres counts for each of them).
n_movies = len(data)
print(n_movies)
genre_counts = data['genres'].explode().value_counts()
genre_counts / n_movies

In [None]:
# Draw the genre matrix and attach a horizontal colour bar with percent labels.
# NOTE(review): the return value is handed to plt.colorbar, so it is presumably
# a mappable rather than a Figure; the meaning of range(19) depends on
# plot_genre_mat — confirm.
genre_image = plot_genre_mat(data, corpora, range(19))
plt.title(r"tangle $\tau$")

colorbar = plt.colorbar(
    genre_image,
    orientation="horizontal",
    pad=0.03,
    shrink=0.8,
)
colorbar.set_label("percentage of corpus tagged as genre")
tick_positions = [0.2, 0.4, 0.6, 0.8]
tick_labels = ["20%", "40%", "60%", "80%"]
colorbar.set_ticks(ticks=tick_positions, labels=tick_labels)

# NOTE(review): absolute, machine-specific output path.
plt.savefig('/home/ocke/Documents/BA-Tangles/tex/images/genre_mat_F24.png', bbox_inches='tight')