In [1]:
# Add the directory two levels up to the import path; presumably this is where
# the project-local `utils` package imported in the next cell lives.
# The path is relative, so it depends on the kernel's working directory.
import sys
sys.path.append("../../")

In [2]:
import os
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from utils.kmeans.kmeans import KMeans
from utils.initial_data_handle import filter_genres, handle_non_numerical_data, standarize_dataframe, reverse_genre_mapping

# sns.set_style("darkgrid")

In [3]:
# Load the semicolon-separated movie dataset.
df = pd.read_csv(
    "../../data/movie_data.csv",
    sep=';',
    encoding='utf-8',
)

In [4]:
# Encode the non-numerical columns, then scale only the complete rows.
selected_df, genre_mapping = handle_non_numerical_data(df)
complete_df = selected_df.dropna()
standarized_df = standarize_dataframe(complete_df)
# Attach the labels by position rather than by index. The previews in this
# notebook suggest standarize_dataframe returns a fresh 0..n-1 index, while
# `complete_df` keeps the original row labels; an index-aligned assignment
# would then pair rows with the wrong genre as soon as dropna() removes a row.
standarized_df['genres'] = complete_df['genres'].to_numpy()
standarized_df.head(3)

Unnamed: 0,budget,popularity,production_companies,production_countries,revenue,runtime,spoken_languages,vote_average,vote_count,genres
0,0.078947,0.402058,0.038462,0.083333,0.180612,0.176282,0.111111,0.846154,0.384725,1
1,0.171053,0.311705,0.115385,0.083333,0.12706,0.25,0.222222,0.758242,0.171439,2
2,0.042105,0.070659,0.038462,0.083333,0.039378,0.323718,0.111111,0.67033,0.002416,3


In [5]:
# Feature matrix for the elbow-method cells below.
# The integer-encoded `genres` label is excluded: it is the target rather than
# a feature, and it is not scaled like the other columns, so it would dominate
# the distance computations. This also matches `X_genre`, which is built from
# the scaled frame before any label column is attached.
X = standarized_df.drop(columns='genres', errors='ignore').to_numpy()

In [6]:
def calculate_sse(max_clusters, data=None):
    """Plot the SSE elbow curve for k = 1 .. ``max_clusters``.

    A new ``KMeans`` model is created and fitted for every cluster count, and
    the value returned by its ``compute_sse()`` is plotted against k.

    Args:
        max_clusters: Largest number of clusters to try (inclusive).
        data: Array passed to ``KMeans.fit``. When omitted, the notebook-level
            ``X`` is used, so existing ``calculate_sse(n)`` calls behave as
            before.

    Returns:
        None. The curve is shown with ``plt.show()``.
    """
    if data is None:
        # Looked up at call time, exactly like the original global reference.
        data = X

    cluster_counts = range(1, max_clusters + 1)
    sse_values = []
    for n_clusters in cluster_counts:
        kmeans = KMeans(n_clusters)
        kmeans.fit(data)
        sse_values.append(kmeans.compute_sse())

    plt.plot(cluster_counts, sse_values)
    plt.title('SSE Elbow Method')
    plt.xlabel('Number of Clusters')
    plt.ylabel('SSE')
    plt.show()


In [7]:
# calculate_sse(10)

In [8]:
def calculate_wcss(max_clusters, data=None):
    """Plot the WCSS elbow curve for k = 1 .. ``max_clusters``.

    A new ``KMeans`` model is created and fitted for every cluster count, and
    the value returned by its ``compute_wcss()`` is plotted against k.

    Args:
        max_clusters: Largest number of clusters to try (inclusive).
        data: Array passed to ``KMeans.fit``. When omitted, the notebook-level
            ``X`` is used, so existing ``calculate_wcss(n)`` calls behave as
            before.

    Returns:
        None. The curve is shown with ``plt.show()``.
    """
    if data is None:
        # Looked up at call time, exactly like the original global reference.
        data = X

    cluster_counts = range(1, max_clusters + 1)
    wcss_values = []
    for n_clusters in cluster_counts:
        kmeans = KMeans(k=n_clusters)
        kmeans.fit(data)
        wcss_values.append(kmeans.compute_wcss())

    plt.plot(cluster_counts, wcss_values)
    plt.title('WCSS Elbow Method')
    plt.xlabel('Number of Clusters')
    plt.ylabel('WCSS')
    plt.show()


In [9]:
# calculate_wcss(10)

In [10]:
# Second preprocessing pipeline, feeding the genre-prediction cells below.
# Rows with any missing value are dropped from the full frame before filtering.
# NOTE(review): filter_genres presumably keeps only a subset of genres --
# confirm in utils.initial_data_handle.
genres_df = filter_genres(df.dropna())
# Rebinds `genre_mapping` (first assigned in cell 4) to the mapping returned
# for the filtered rows; later cells use this one.
filtered_df, genre_mapping = handle_non_numerical_data(genres_df)
filtered_standarized_df = standarize_dataframe(filtered_df)
# Matrix that predict_for_k fits KMeans on.
X_genre = filtered_standarized_df.to_numpy()

In [11]:
# Preview the encoded but still unscaled frame; `genres` is an integer code here.
filtered_df.head(3)

Unnamed: 0,budget,genres,popularity,production_companies,production_countries,revenue,runtime,spoken_languages,vote_average,vote_count
2,16000000.0,1,3.859495,1.0,1.0,81452156.0,127.0,1.0,6.1,34.0
3,60000000.0,2,17.924927,3.0,1.0,187436818.0,170.0,2.0,7.7,1886.0
4,35000000.0,2,5.23158,3.0,1.0,64350171.0,106.0,1.0,5.5,174.0


In [12]:
def predict_for_k(k : int):
    """Cluster the filtered movies into ``k`` groups and label every row.

    Fits ``KMeans(k)`` on the notebook-level ``X_genre`` and predicts a
    cluster for each row of the scaled feature columns. It then returns those
    features together with the true and the predicted genre.

    The result is built on a new DataFrame. The notebook-level
    ``filtered_standarized_df`` is left untouched, so the function can be
    called repeatedly (for several values of ``k``) without a previous call's
    label columns leaking into the feature matrix. Each call returns an
    independent frame.

    Args:
        k: Number of clusters.

    Returns:
        DataFrame with the scaled feature columns plus ``genres`` and
        ``predicted_genres``, both passed through ``reverse_genre_mapping``.
    """
    kmeans = KMeans(k)
    kmeans.fit(X_genre)

    # drop() returns a new frame; label columns are excluded defensively in
    # case the global frame was extended by older code.
    result_df = filtered_standarized_df.drop(
        columns=['genres', 'predicted_genres'], errors='ignore'
    )
    X_predict = result_df.to_numpy()
    raw_predictions = kmeans.predict(X_predict)

    # Positional assignment: the scaled frame and filtered_df do not share an index.
    result_df['genres'] = filtered_df['genres'].to_numpy()
    result_df['predicted_genres'] = raw_predictions
    # Shift the predicted cluster ids by one before reversing the mapping
    # (the encoded genre ids shown in the previews start at 1).
    # NOTE(review): cluster numbering is presumably arbitrary, so cluster i is
    # not guaranteed to correspond to genre id i + 1 -- confirm how KMeans
    # orders its clusters before trusting the accuracy figures.
    result_df['predicted_genres'] += 1
    reverse_genre_mapping(result_df, genre_mapping)
    reverse_genre_mapping(result_df, genre_mapping, 'predicted_genres')
    return result_df

In [13]:
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import confusion_matrix

def plot_conf_matrix(text_df: pd.DataFrame, k : str, dpi: int = 1200, output_dir: str = "output"):
    """Save a row-normalised confusion matrix and print precision/accuracy.

    Args:
        text_df: Frame with a ``genres`` column (true labels) and a
            ``predicted_genres`` column (predicted labels).
        k: Cluster count, used in the plot title and in the file name.
        dpi: Resolution of the saved PNG. Defaults to the previous
            hard-coded value.
        output_dir: Directory the PNG is written to; created if missing.

    Returns:
        None. Writes ``confusion_matrix_k{k}.png`` into ``output_dir`` and
        prints weighted precision and accuracy with their complements.
    """
    to_predict = text_df['genres']
    predictions = text_df['predicted_genres']
    # Axis labels come from the true labels, in order of first appearance.
    class_labels = text_df['genres'].unique()

    # normalize='true' turns every row into fractions of that true class.
    cm = confusion_matrix(to_predict, predictions, labels=class_labels, normalize='true')

    # Explicit figure/axes handles, so nothing depends on pyplot's implicit
    # "current figure" and exactly this figure is closed afterwards.
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        image = ax.imshow(cm, interpolation='nearest', cmap=sns.cubehelix_palette(as_cmap=True, rot=.2, gamma=.5))
        ax.set_title(f"Confusion Matrix for k = {k}")
        fig.colorbar(image, ax=ax)

        tick_marks = np.arange(len(class_labels))
        ax.set_xticks(tick_marks)
        ax.set_xticklabels(class_labels, rotation=45)
        ax.set_yticks(tick_marks)
        ax.set_yticklabels(class_labels)

        # Switch to white text on the darker half of the colour scale.
        thresh = cm.max() / 2.
        for (i, j), share in np.ndenumerate(cm):
            ax.text(j, i, "{:.2f}%".format(share * 100),
                    horizontalalignment="center",
                    color="white" if share > thresh else "black")

        fig.tight_layout()
        os.makedirs(output_dir, exist_ok=True)
        fig.savefig(os.path.join(output_dir, f"confusion_matrix_k{k}.png"), bbox_inches='tight', dpi=dpi)
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)

    # zero_division=0 keeps the score sklearn already falls back to for a
    # class that is never predicted, without the UndefinedMetricWarning.
    precision = precision_score(to_predict, predictions, average='weighted', zero_division=0)
    accuracy = accuracy_score(to_predict, predictions)
    error_precision = 1 - precision
    error_accuracy = 1 - accuracy

    print(f"Precision: {precision:.5f}, Error (Precision): {error_precision:.5f}")
    print(f"Accuracy: {accuracy:.5f}, Error (Accuracy): {error_accuracy:.5f}")

In [14]:
# Cluster with k=3 and preview the scaled features next to the true and
# predicted genre names.
df_k3 = predict_for_k(k=3)
df_k3.head(3)

Unnamed: 0,budget,popularity,production_companies,production_countries,revenue,runtime,spoken_languages,vote_average,vote_count,genres,predicted_genres
0,0.057143,0.070659,0.038462,0.083333,0.039378,0.260536,0.111111,0.67033,0.002416,Comedy,Drama
1,0.214286,0.328367,0.115385,0.083333,0.090622,0.425287,0.222222,0.846154,0.133996,Action,Action
2,0.125,0.095798,0.115385,0.083333,0.031109,0.180077,0.111111,0.604396,0.012362,Action,Drama


In [15]:
# Writes output/confusion_matrix_k3.png and prints weighted precision and accuracy.
plot_conf_matrix(df_k3, k='3')

Precision: 0.26668, Error (Precision): 0.73332
Accuracy: 0.34894, Error (Accuracy): 0.65106
