In [None]:
# Enable IPython's autoreload extension so that edits to imported modules
# (e.g. the local helper package) are picked up live without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
import ast
import pickle
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import matplotlib.cm as cm
from movie_genre_tangles.convenience import *
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

# Lift pandas' display limits (rows, columns, line width) by setting each of
# the three options to None.
for display_option in ('display.max_rows', 'display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

In [None]:
# Load the separation system and the mapping from separation id to keyword(s).
# Rows of `seps` are presumably movies and columns keyword separations
# (a later cell compares seps.shape[0] with the number of movies).
seps, sepidtokeyword = csv_to_setseperationsystem('../data/data_F3.csv')

# NOTE(security): pickle.load can execute arbitrary code; only open pickle
# files that were produced by this project.
with open('../orders/orders_O12_K3', 'rb') as f:
    order_O12 = pickle.load(f)

# Keep only the separations with the lowest order. The selection is computed
# once so that the keyword list and the column slice can never get out of sync.
n_levels = 140
level_to_sep = np.argsort(order_O12)[:n_levels]
lvltokeyword = [sepidtokeyword[x] for x in level_to_sep]

with open('../results/interesting_umbrella_tm_K3.pkl', 'rb') as f:
    interesting_umbrella_tm = pickle.load(f)

# Columns of `seps` reordered by increasing order: column k belongs to level k.
oriented_seps = seps[:, level_to_sep]

Checks if any level is associated with more than one keyword:

In [None]:
# True as soon as one entry of `lvltokeyword` has length greater than 1.
entries_per_level = [len(level_entry) for level_entry in lvltokeyword]
np.any([entry_count > 1 for entry_count in entries_per_level])

Plots the matrix containing the interesting umbrella tangles:

In [None]:
# The tangle matrix is drawn in two figures because it has too many rows for
# one readable image. The legend patches sample the same colormap that is
# handed to matshow, so legend and image cannot drift apart.
# NOTE(review): the legend assumes the colour scale puts "positive" at the top,
# "negative" at the bottom and "neither" in the middle (presumably 1 / -1 / 0)
# -- confirm against the encoding of the tangle matrix.
cmap = plt.get_cmap('viridis')
legend_elements = [
    Patch(facecolor=cmap(1.0), label='positive side'),
    Patch(facecolor=cmap(0.0), label='negative side'),
    Patch(facecolor=cmap(0.5), label='neither'),
]

split_row = 600   # number of tangles shown in the first figure
tick_step = 100   # spacing of the y ticks in the second figure

# First figure: tangles 0 .. split_row - 1.
# (The original bare plt.legend() call was dropped: with no labelled artists it
# only emitted a warning and was immediately replaced by the legend below.)
plt.matshow(interesting_umbrella_tm[:split_row], cmap=cmap)
plt.gca().set_aspect('auto')
plt.gca().xaxis.set_label_position('top')
plt.xlabel('order of potential feature')
plt.ylabel('tangle')
plt.legend(handles=legend_elements, bbox_to_anchor=(1, 0.98), title = 'tangle contains')
#plt.savefig('/home/ocke/Documents/BA-Tangles/tex/images/tm_entropy_K_1.png', bbox_inches='tight')

# Second figure: the remaining tangles. The y tick labels are shifted by
# split_row so they show the tangle's index in the full matrix; they are
# derived from the actual number of rows instead of hard-coded bounds.
remaining_tangles = interesting_umbrella_tm[split_row:]
plt.matshow(remaining_tangles, cmap=cmap)
plt.gca().set_aspect('auto')
plt.gca().xaxis.set_label_position('top')
plt.xlabel('order of potential feature')
plt.ylabel('tangle')
row_ticks = np.arange(0, remaining_tangles.shape[0], tick_step)
plt.yticks(ticks = row_ticks, labels = row_ticks + split_row)
#plt.savefig('/home/ocke/Documents/BA-Tangles/tex/images/tm_entropy_K_2.png', bbox_inches='tight')

Plots, for each level, the proportion of tangles containing the associated keyword:

In [None]:
# For every column (level) of the tangle matrix: number of rows holding a 1,
# divided by the total number of rows (tangles).
tangle_count = interesting_umbrella_tm.shape[0]
level_count = interesting_umbrella_tm.shape[1]
contains_level = (interesting_umbrella_tm == 1)
proportion_per_level = contains_level.sum(axis = 0) / tangle_count

plt.scatter(range(level_count), proportion_per_level)
plt.xlabel(r'order of potential feature $k$')
plt.ylabel(r'proportion of tangles containing $k$')
#plt.savefig('/home/ocke/Documents/BA-Tangles/tex/images/proportions_separations.png', bbox_inches='tight')

Counts the number of interesting umbrella tangles containing at least 1 and at least 2 keywords, respectively, associated with one of the first 10 levels:

In [None]:
# Per tangle: how many of the first 10 columns hold a 1.
hits_in_first_levels = (interesting_umbrella_tm[:, :10] == 1).sum(axis = 1)
# Print the number of tangles reaching at least 1 hit, then at least 2 hits.
for required_hits in (1, 2):
    print((hits_in_first_levels >= required_hits).sum())

Prints all keywords that are not contained in any tangle:

In [None]:
# Levels whose column never holds a 1, mapped back to their keyword entry.
times_contained = (interesting_umbrella_tm == 1).sum(axis = 0)
never_contained_levels = np.where(times_contained == 0)[0]
[lvltokeyword[level] for level in never_contained_levels]

Calculates the core of each tangle and counts the tangles with fewer than 3 movies in their core:

In [None]:
# Core of a tangle: the rows of `oriented_seps` that equal 1 in every level the
# tangle contains (i.e. every column where the tangle's row holds a 1).
# `cores` maps tangle index -> size of its core.
cores = {}
# Compare once instead of once per tangle.
on_positive_side = (oriented_seps == 1)
for i in range(interesting_umbrella_tm.shape[0]):
    pos = np.where(interesting_umbrella_tm[i] == 1)[0]
    # `.all` over an empty column selection is True for every row, so a tangle
    # without any positive level gets every row as its core. The previous
    # `.min(axis=1) == 1` raised ValueError on such an empty selection.
    # NOTE(review): identical to the `.min` test as long as no entry of
    # `oriented_seps` exceeds 1 -- confirm the encoding.
    core = np.where(on_positive_side[:, pos].all(axis = 1))[0]
    cores[i] = core.size
# Tangles whose core contains fewer than 3 rows.
small_cores = [x for x in cores if cores[x] < 3]
len(small_cores)

Calculates corpora of the tangles and their sizes:

In [None]:
# corpora[m, t] is 1.0 when row m of `oriented_seps` (a movie, per the
# surrounding notes) shares at least `min_agreement` positive levels with
# tangle t, else 0.0. "Positive" means an entry equal to 1 in both matrices.
min_agreement = 3

# One integer matrix product counts the shared positive levels for every
# (row, tangle) pair at once, replacing the Python loop over all rows.
# The int cast matters: a bool @ bool product would not count.
row_has_level = (oriented_seps == 1).astype(np.int64)
tangle_has_level = (interesting_umbrella_tm == 1).astype(np.int64)
agreement = row_has_level @ tangle_has_level.T

# Same dtype (float64) and shape as the zero-initialised array used before.
corpora = (agreement >= min_agreement).astype(float)
# Number of rows in each tangle's corpus.
corpora_sizes = (corpora == 1).sum(axis = 0)

Counts how many movies are in the corpus of at least one tangle:

In [None]:
# A row counts as covered when it holds a 1 for at least one tangle.
row_is_covered = (corpora == 1).any(axis = 1)
row_is_covered.sum()

Calculates the first two principal components of the matrix containing the interesting umbrella tangles:

In [None]:
# Project every tangle (row of the tangle matrix) onto two principal components.
# random_state is fixed because scikit-learn may pick its randomized SVD solver
# automatically for a matrix of this size; without a seed the projection, and
# every clustering result derived from it, could change between runs.
pca = PCA(n_components=2, random_state=42)
principal_components = pca.fit_transform(interesting_umbrella_tm)
pca_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])

Finds the optimal number of clusters:

In [None]:
# Silhouette analysis: fit k-means on the two principal components for every
# candidate k and record the silhouette score of the resulting labelling.
k_values = range(2, 50)
pc_features = pca_df[['PC1', 'PC2']]
silhouette_scores = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state = 42).fit(pc_features)
    score = silhouette_score(pc_features, kmeans.labels_)
    silhouette_scores.append(score)

# Score as a function of k.
plt.plot(k_values, silhouette_scores, marker='o')
plt.xlabel('number of cluster k')
plt.ylabel('Silhouette-Score')
plt.title('Silhouette-Analyse')
plt.grid()
plt.show()

# Report the k with the highest score.
best_position = np.argmax(silhouette_scores)
print("For", k_values[best_position], "clusters we get the best average silhouette score of", np.max(silhouette_scores))

Clusters the Tangles and plots them:

In [None]:
# Cluster the tangles in the PCA plane and store the label next to the
# coordinates in `pca_df`.
cluster_count = 10
kmeans = KMeans(n_clusters=cluster_count, random_state=42)
pca_df['Cluster'] = kmeans.fit_predict(pca_df[['PC1', 'PC2']])

plt.figure(figsize=(6,6))
scatter = plt.scatter(pca_df['PC1'], pca_df['PC2'], c=pca_df['Cluster'], cmap = cm.tab10, s=50)
plt.title(r'Cluster of interesting umbrella Tangles of $F^3$ with agreement $\geq 3$')
plt.xlabel('$p_1$')
plt.ylabel('$p_2$')

# Empty scatter calls serve as legend entries, one per cluster id, coloured
# through the same tab10 colormap with ids normalised to 0 .. cluster_count - 1.
cluster_norm = plt.Normalize(0, cluster_count - 1)
for cluster_id in range(cluster_count):
    legend_colour = cm.tab10(cluster_norm(cluster_id))
    plt.scatter([], [], c=[legend_colour], label=str(cluster_id))
plt.legend(title='Cluster', loc='best')
#plt.savefig('/home/ocke/Documents/BA-Tangles/tex/images/cluster_ag3.png', bbox_inches='tight')

Prints all tangles, sorted by cluster:

In [None]:
# Walk through the cluster labels in ascending order and print the keywords of
# every tangle carrying that label.
cluster_of_tangle = pca_df['Cluster'].values
for cluster_id in np.unique(cluster_of_tangle):
    print('CLUSTER', cluster_id)
    member_tangles = np.where(cluster_of_tangle == cluster_id)[0]
    for tangle in member_tangles:
        printkeywordsfromtm(interesting_umbrella_tm, tangle, lvltokeyword)

Chooses the tangle with the largest corpus in each cluster as its representative:

In [None]:
# For every cluster, pick the member tangle with the largest corpus.
# `representatives` holds one tangle index per cluster, in ascending order of
# the cluster label.
representatives = []
cluster_of_tangle = pca_df['Cluster'].values
for i in np.unique(cluster_of_tangle):
    print('CLUSTER', i)
    members = np.where(cluster_of_tangle == i)[0]
    print('has ', members.size, ' tangles')
    # Take the argmax over the cluster's own members only. The previous
    # approach zeroed the sizes of all other tangles and took a global argmax,
    # which returns index 0 -- possibly a tangle of another cluster -- whenever
    # every corpus in this cluster is empty. Ties still resolve to the member
    # with the smallest index, as before.
    representative = members[np.argmax(corpora_sizes[members])]
    representatives.append(representative)
    printkeywordsfromtm(interesting_umbrella_tm, representative, lvltokeyword)
    print("corpus has size ", corpora_sizes[representative])
    print("its core has size", cores[representative])

How many unique movies are in the corpora of the representatives?

In [None]:
# Rows of `corpora` that hold a 1 for at least one representative tangle.
representative_corpora = corpora[:, representatives]
in_some_representative_corpus = (representative_corpora.max(axis = 1) == 1)
in_some_representative_corpus.sum()

The following is the Greedy Genre Ranking Algorithm:

In [None]:
# Greedy genre ranking: repeatedly pick the tangle whose corpus covers the most
# not-yet-covered rows of `corpora`, then mark those rows as covered.
# `genres` holds the chosen tangle indices in order of selection and
# `genre_sizes` the number of newly covered rows at each step.
genres = []
genre_sizes = []
cc = corpora.copy()  # working copy; covered rows are zeroed out
max_genres = 20      # upper bound on the number of greedy steps
for step in range(max_genres):
    newly_covered = cc.sum(axis=0)
    next_genre = np.argmax(newly_covered)
    # Stop once nothing is left to cover; otherwise argmax would keep returning
    # tangle 0 with a gain of 0 and pad the ranking with meaningless entries.
    if newly_covered[next_genre] == 0:
        break
    genres.append(next_genre)
    genre_sizes.append(newly_covered[next_genre])
    # Remove every row covered by the chosen tangle from further consideration.
    cc[cc[:,next_genre] == 1] = 0

# Report the ranking; sizes refer to the full corpus of each chosen tangle.
for rank, t in enumerate(genres, start=1):
    printkeywordsfromtm(interesting_umbrella_tm, t, lvltokeyword)
    print(rank, "-th genre contains", corpora_sizes[t], "movies")
    print("its core has size", cores[t])

How many unique movies are in the corpora of the first 10 greedily chosen tangles?

In [None]:
# Rows of `corpora` that hold a 1 for at least one of the first 10 greedy picks.
first_greedy_picks = genres[:10]
in_some_greedy_corpus = (corpora[:, first_greedy_picks].max(axis = 1) == 1)
in_some_greedy_corpus.sum()

In [None]:
# Hand-picked tangle indices; print their core sizes first, then their corpus sizes.
misc_tangles = [646, 653,906, 1053, 1092, 1146]
for size_lookup in (cores, corpora_sizes):
    print([size_lookup[tangle] for tangle in misc_tangles])

Genre matrices:

In [None]:
# Read the movie table and parse the two columns that store Python literals as
# text back into Python objects.
data = pd.read_csv('../data/data_F3.csv')
for literal_column in ('keywords', 'genres'):
    data[literal_column] = data[literal_column].apply(ast.literal_eval)
# Sanity check: the table has as many rows as the separation system.
len(data) == seps.shape[0]

In [None]:
# Number of rows whose genre entry is the empty list.
lacks_genre = [genre_entry == [] for genre_entry in data['genres']]
np.sum(lacks_genre)

In [None]:
# Occurrences of each genre divided by the number of rows in `data`
# (one row can contribute to several genres).
movie_count = len(data)
print(movie_count)
genre_occurrences = data['genres'].explode().value_counts()
genre_occurrences / movie_count

In [None]:
# Genre matrix for the cluster representatives, with a horizontal colorbar
# whose ticks are labelled in percent.
fig = plot_genre_mat(data, corpora, representatives)
plt.title("representatives of clusters of tangles of $F^3$")

cbar = plt.colorbar(fig, orientation = "horizontal", pad = 0.03, shrink = 0.8)
cbar.set_label("percentage of corpus tagged as genre")
tick_percentages = [20, 40, 60, 80]
cbar.set_ticks(
    ticks=[percent / 100 for percent in tick_percentages],
    labels=[f"{percent}%" for percent in tick_percentages],
)
#plt.savefig('/home/ocke/Documents/BA-Tangles/tex/images/genre_mat_cluster_3.png', bbox_inches='tight')

In [None]:
# Genre matrix for the first 10 greedily ranked tangles, with a horizontal
# colorbar whose ticks are labelled in percent.
# The figure text previously contained the misspellings "greedely" and
# "assinged"; both are corrected because they end up in the rendered figure.
fig = plot_genre_mat(data, corpora, genres[:10])
plt.title("first greedily ranked tangles of $F^3$")
cbar = plt.colorbar(fig, orientation = "horizontal", pad = 0.03, shrink = 0.8)
cbar.set_label("percentage of corpus assigned to genre")
cbar.set_ticks(ticks=[0.2,0.4,0.6,0.8],labels= ["20%", "40%", "60%", "80%"])
#plt.savefig('/home/ocke/Documents/BA-Tangles/tex/images/genre_mat_greedy_3.png', bbox_inches='tight')

Miscellaneous calculations below:

How many movies are tagged with blood, with both blood and gore, and with gore, respectively?

In [None]:
# Rows on the positive side (entry == 1) of level 12, of both levels 12 and 14,
# and of level 14, printed in that order.
positive_at_12 = (oriented_seps[:, 12] == 1)
positive_at_14 = (oriented_seps[:, 14] == 1)
print(positive_at_12.sum())
print((positive_at_14 & positive_at_12).sum())
print(positive_at_14.sum())

Prints the corpora of the representatives of the clusters:

In [None]:
# Print the titles in the corpus of every cluster representative.
# Iterating over `representatives` itself, rather than a hard-coded range(10),
# keeps this cell correct if the number of clusters changes.
# NOTE(review): the running index equals the cluster label only because the
# representatives were collected in ascending label order starting at 0.
for cluster_id, representative in enumerate(representatives):
    print("The corpus of the representative of cluster", cluster_id, "contains:")
    print(data['original_title'][corpora[:,representative] == 1])