## Load Dependencies

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from scipy import stats
from sklearn.preprocessing import power_transform
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE

from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import pdist, squareform

from sklearn.cluster import KMeans, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import normalize
from sklearn.metrics import silhouette_score, davies_bouldin_score, silhouette_samples



In [None]:
def description(df):
    """Print the shape of ``df`` and return a per-column summary table.

    The returned frame has one row per column of ``df`` with its dtype,
    number and fraction of missing values and number of unique values,
    left-joined with the transposed output of ``df.describe()``.
    """
    print(f"Dataset Shape: {df.shape}")

    null_flags = df.isnull()

    summary = pd.DataFrame(df.dtypes, columns=['dtypes']).reset_index()
    summary['Name'] = summary['index']
    summary['Missing'] = null_flags.sum().values
    # count() on the boolean frame yields the row count for every column
    summary['PercMissing'] = null_flags.sum().values / null_flags.count().values
    summary['Uniques'] = df.nunique().values
    summary['Data type'] = df.dtypes.values

    describe_stats = df.describe().transpose().reset_index()
    return summary.merge(describe_stats, on="index", how="left")


In [None]:
def jaccard_similarity_continuous_rows_df(df, idx1, idx2):
    """
    Continuous (weighted) Jaccard similarity between two rows of a DataFrame.

    The score is the sum of the element-wise minima of the two rows divided
    by the sum of their element-wise maxima.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    idx1 (int): Positional index of the first row.
    idx2 (int): Positional index of the second row.

    Returns:
    float: The similarity, or 0 when the sum of maxima is zero.
    """
    row_a = df.iloc[idx1].values
    row_b = df.iloc[idx2].values

    numerator = np.minimum(row_a, row_b).sum()
    denominator = np.maximum(row_a, row_b).sum()

    # Guard against division by zero (e.g. two all-zero rows)
    if denominator == 0:
        return 0
    return numerator / denominator

def analyze_web_similarity(df, n_similar=5, standardize=False):
    """
    Analyze how similar websites are to each other based on their features.

    Example:
        results = analyze_web_similarity(df)
        visualize_similarities(results)

    Parameters:
    df: DataFrame with website names in the first column and features in the
        remaining columns.
    n_similar: Number of most similar websites to report for each website.
    standardize: If True, the features are standardized with StandardScaler
        before cosine similarity and Euclidean distance are computed. The
        Jaccard score is always computed on the unscaled features.

    Returns:
    dict: Dictionary with the keys 'similarity_dict' (website -> list of its
        most similar websites with their scores), 'cosine_similarity_matrix',
        'euclidean_distance_matrix' and 'feature_stats'.
    """
    # Split the names from the features
    websites = df.iloc[:, 0]
    features = df.iloc[:, 1:]

    # Optional standardization of the features
    if standardize:
        scaler = StandardScaler()
        features_scaled = scaler.fit_transform(features)
    else:
        features_scaled = features

    # 1. Cosine similarity
    cosine_sim = cosine_similarity(features_scaled)

    # 2. Euclidean distance
    euclidean_dist = squareform(pdist(features_scaled, 'euclidean'))

    # For every website collect its most similar counterparts.
    # If a name occurs more than once, the last occurrence wins.
    similarity_dict = {}
    for i, website in enumerate(websites):
        ranked = np.argsort(cosine_sim[i])[::-1]
        # Exclude the website itself by position rather than assuming it is
        # ranked first: ties, or an all-zero row whose self-similarity is 0,
        # can place it anywhere in the ranking.
        similar_indices = ranked[ranked != i][:n_similar]
        similar_websites = [
            {
                'web': websites.iloc[idx],
                'cosine_similarity': cosine_sim[i][idx],
                'euclidean_distance': euclidean_dist[i][idx],
                "jaccard_sim" : jaccard_similarity_continuous_rows_df(features, i, idx)
            }
            for idx in similar_indices
        ]
        similarity_dict[website] = similar_websites

    # Basic feature statistics
    feature_stats = {
        'correlation_matrix': features.corr(),
        'feature_importance': np.std(features_scaled, axis=0),
        'feature_names': features.columns
    }

    return {
        'similarity_dict': similarity_dict,
        'cosine_similarity_matrix': cosine_sim,
        'euclidean_distance_matrix': euclidean_dist,
        'feature_stats': feature_stats
    }

def visualize_similarities(analysis_results, n_clusters=10):
    """
    Plot the results of the similarity analysis.

    Draws two figures from ``analysis_results['feature_stats']``: a heatmap of
    the feature correlation matrix and a horizontal bar chart of the
    per-feature importance values sorted in ascending order.
    ``n_clusters`` is accepted but not used by this function.
    """
    # Figure 1: heatmap of feature correlations
    plt.figure(figsize=(12, 8))
    correlation_matrix = analysis_results['feature_stats']['correlation_matrix']
    sns.heatmap(correlation_matrix, cmap='coolwarm', center=0, annot=False)
    plt.title('Korelační matice features')
    plt.tight_layout()

    # Figure 2: bar chart of feature importance
    plt.figure(figsize=(12, 6))
    importance_values = analysis_results['feature_stats']['feature_importance']
    importance_labels = analysis_results['feature_stats']['feature_names']
    feature_importance = pd.Series(importance_values, index=importance_labels)
    ordered_importance = feature_importance.sort_values(ascending=True)
    ordered_importance.plot(kind='barh')
    plt.title('Důležitost jednotlivých features')
    plt.xlabel('Standardizovaná směrodatná odchylka')
    plt.tight_layout()

In [None]:
# Load the dataset from sites_embedding.csv (path is relative to the working directory).
data = pd.read_csv('sites_embedding.csv')

In [None]:
# Preview the first rows of the loaded table.
data.head()

In [None]:
# Per-column summary (dtype, missing values, uniques, describe() statistics) of the raw data.
summary_data = description(data)
summary_data


In [None]:
# Box-Cox transform every feature column (all columns after the first one).
# Box-Cox needs strictly positive input, hence the +1 shift; this assumes the
# raw values are non-negative — TODO confirm.
data_tranformed = data.copy()
for col in data.columns[1:]:
    data_tranformed[col + "_transformed"], fitted_lambda = stats.boxcox(data_tranformed[col] + 1)
    print(f"Column {col} transformed with lambda {fitted_lambda}")

# Drop exactly the columns that were transformed above (previously a
# hard-coded [1:25] slice), leaving the first column plus the *_transformed ones.
data_tranformed = data_tranformed.drop(columns=data.columns[1:])

In [None]:
# Same per-column summary, now for the Box-Cox transformed data.
summary_data_transf = description(data_tranformed)
summary_data_transf

In [None]:
# Bar charts of the per-column mean and standard deviation taken from the raw-data summary.
summary_data.plot(x='Name', y='mean', kind='bar', figsize=(12, 6))
summary_data.plot(x='Name', y='std', kind='bar', figsize=(12, 6))

In [None]:
# Column sums of every column after the first one in the transformed data.
print(data_tranformed.iloc[:, 1:].sum())

In [None]:
# Bar chart of the same column sums; uses every column after the first one
# (instead of a hard-coded 1:25 slice) so it matches the printed sums.
data_tranformed.iloc[:, 1:].sum().plot(kind="bar", figsize=(10, 6))

In [None]:
# Standardize every feature column of the raw data with StandardScaler.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data.iloc[:, 1:])

# Build the scaled frame from new float columns rather than assigning through
# .iloc: writing floats into integer-typed columns in place is deprecated
# upcasting in recent pandas versions.
data_scaled = pd.concat(
    [
        data.iloc[:, [0]],
        pd.DataFrame(scaled_values, columns=data.columns[1:], index=data.index),
    ],
    axis=1,
)
description(data_scaled)

In [None]:
# Pairplot of the raw data to visualize pairwise relationships between its columns.
sns.pairplot(data)
plt.show()

In [None]:
# Pairplot of the Box-Cox transformed data to visualize pairwise relationships between its columns.
sns.pairplot(data_tranformed)
plt.show()

In [None]:
# for col in data.columns[1:]:
#     plt.figure(figsize=(10, 6))
#     sns.boxplot(y=col, data=data)
#     # Add labels and title
#     plt.xlabel(col)
#     plt.grid(True)
#     plt.show()

In [None]:
# One box plot per column of the transformed data (first column skipped).
for col in data_tranformed.columns[1:]:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(y=col, data=data_tranformed, ax=ax)
    # Label the x axis with the column name and switch the grid on
    ax.set_xlabel(col)
    ax.grid(True)
    plt.show()

In [None]:
# Histogram with a KDE overlay for every column of the transformed data
# (first column skipped).
for col in data_tranformed.columns[1:]:
    plt.figure(figsize=(10, 6))
    sns.histplot(data_tranformed[col], bins=50, kde=True)
    plt.xlabel(col)
    # histplot's default statistic is the bin count, so the axis shows counts,
    # not a density.
    plt.ylabel('Count')
    plt.title('Histogram and KDE of Box-Cox Transformed ' + col)
    plt.grid(True)
    plt.show()

In [None]:
# For every transformed column, list up to 20 sites whose value is at or above
# that column's 99th percentile. Rows keep their original order (they are not
# sorted by value).
top_sites_by_feature = {}
for col in data_tranformed.columns[1:]:
    threshold = data_tranformed[col].quantile(0.99)
    top_sites_by_feature[col] = (
        data_tranformed.loc[data_tranformed[col] >= threshold, "site"]
        .head(20)
        .reset_index(drop=True)
    )

# Build the table once instead of growing it with pd.concat inside the loop;
# shorter columns are padded with NaN.
cols_features = pd.DataFrame(top_sites_by_feature)

cols_features

In [None]:
# Topic label for each transformed feature column.
# Presumably assigned by hand after inspecting the top sites per feature — TODO confirm.
# NOTE(review): "sport" is used for both F_9 and F_22, and F_3 is still unlabeled ("?").
features_names = {
    'F_1_transformed' : "social_media", 
    'F_2_transformed' : "news_social", 
    'F_3_transformed' : "?",
    'F_4_transformed' : "health_lifestyle", 
    'F_5_transformed' : "tech/games", 
    'F_6_transformed' : "lifestyle",
    'F_7_transformed' : "finance", 
    'F_8_transformed' : "food", 
    'F_9_transformed' : "sport",
    'F_10_transformed' : "games", 
    'F_11_transformed' : "medical/health", 
    'F_12_transformed' : "guns",
    'F_13_transformed' : "natures", 
    'F_14_transformed' : "lifestyle_2", 
    'F_15_transformed' : "apps",
    'F_16_transformed' : "news_2",
    'F_17_transformed' : "sport/lifestyle",
    'F_18_transformed' : "news",
    'F_19_transformed' : "pets", 
    'F_20_transformed' : "accademy", 
    'F_21_transformed' : "courses",
    'F_22_transformed' : "sport", 
    'F_23_transformed' : "travel", 
    'F_24_transformed' : "cars"
    }

In [None]:
# Same histograms as for the full transformed data, but restricted to the
# strictly positive values of each column (zeros are left out).
for col in data_tranformed.columns[1:]:
    plt.figure(figsize=(10, 6))
    sns.histplot(data_tranformed[col].loc[data_tranformed[col] > 0], bins=50, kde=True)
    plt.xlabel(col)
    # histplot's default statistic is the bin count, so the axis shows counts,
    # not a density.
    plt.ylabel('Count')
    plt.title('Histogram and KDE of Box-Cox Transformed ' + col)
    plt.grid(True)
    plt.show()

## Correlation

In [None]:
# # Select numerical columns
# numerical_cols = data_tranformed.columns.difference(["site"])
# print("Num Cols: " + str(numerical_cols))
# numerical_df = data_tranformed[numerical_cols]
# numerical_df[numerical_df == 0] = np.nan
# corr_matrix = numerical_df.corr()

# plt.figure(figsize=(16, 12))
# sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm')
# plt.title('Correlation Matrix of Numerical Variables')
# plt.show()

In [None]:
# Correlation heatmap of the raw data: every column except "site".
numerical_cols = data.columns.difference(["site"])
print(f"Num Cols: {numerical_cols!s}")
numerical_df = data.loc[:, numerical_cols]
corr_matrix = numerical_df.corr()

fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix of Numerical Variables')
plt.show()


In [None]:
# Correlation heatmap of the transformed data: columns whose name contains "_transformed".
is_transformed = data_tranformed.columns.str.contains("_transformed")
numerical_cols = data_tranformed.columns[is_transformed]
print(f"Num Cols: {numerical_cols!s}")
numerical_df = data_tranformed.loc[:, numerical_cols]
corr_matrix = numerical_df.corr()

fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix of Numerical Variables')
plt.show()

In [None]:
# Correlation heatmap of the transformed columns showing only the entries
# whose absolute correlation exceeds the cutoff (the rest is masked out).
corr_cutoff = 0.7

is_transformed = data_tranformed.columns.str.contains("_transformed")
numerical_cols = data_tranformed.columns[is_transformed]
print(f"Num Cols: {numerical_cols!s}")
numerical_df = data_tranformed.loc[:, numerical_cols]
corr_matrix = numerical_df.corr()

strong_mask = abs(corr_matrix) > corr_cutoff
fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(corr_matrix[strong_mask], annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix of Numerical Variables')
plt.show()

## PCA

In [None]:
#%% PCA
# Project the raw data (every column except "site") onto 4 principal
# components and scatter the first two.
numerical_cols = data.columns.difference(["site"])
numerical_df = data[numerical_cols]

pca = PCA(n_components=4)
principal_components = pca.fit_transform(numerical_df)
pca_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2', 'PC3', 'PC4'])

plt.figure(figsize=(10, 6))
# No per-point colour values (`c=`) are supplied, so a colormap would be
# ignored and only trigger a matplotlib warning; it is left out.
plt.scatter(pca_df['PC1'], pca_df['PC2'])
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA of Numerical Data')
plt.grid(True)
plt.show()

In [None]:
#%% PCA
# Project the transformed data (every column except "site") onto 4 principal
# components and scatter the first two.
numerical_cols = data_tranformed.columns.difference(["site"])
numerical_df = data_tranformed[numerical_cols]

pca = PCA(n_components=4)
principal_components = pca.fit_transform(numerical_df)
pca_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2', 'PC3', 'PC4'])

plt.figure(figsize=(10, 6))
# No per-point colour values (`c=`) are supplied, so a colormap would be
# ignored and only trigger a matplotlib warning; it is left out.
plt.scatter(pca_df['PC1'], pca_df['PC2'])
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA of Numerical Transformed Data')
plt.grid(True)
plt.show()


Find the optimal number of clusters using the elbow method and the silhouette score

In [None]:
# Interactive 3D scatter of the first three principal components of the raw
# data, coloured by the third component.
numerical_cols = data.columns.difference(["site"])
numerical_df = data[numerical_cols]

pca = PCA(n_components=4)
principal_components = pca.fit_transform(numerical_df)
pca_df = pd.DataFrame(data=principal_components, columns=[f'PC{i}' for i in range(1, 5)])

x_col, y_col, z_col = 'PC1', 'PC2', 'PC3'

plot_title = f'3D Scatter Plot of {x_col} vs {y_col} vs {z_col}'
fig = px.scatter_3d(pca_df, x=x_col, y=y_col, z=z_col, color=z_col, title=plot_title)
fig.update_layout(width=1000, height=800)
fig.show()

In [None]:
# Interactive 3D scatter of the first three principal components of the
# transformed data, coloured by the third component.
numerical_cols = data_tranformed.columns.difference(["site"])
numerical_df = data_tranformed[numerical_cols]

pca = PCA(n_components=4)
principal_components = pca.fit_transform(numerical_df)
pca_df = pd.DataFrame(data=principal_components, columns=[f'PC{i}' for i in range(1, 5)])

x_col, y_col, z_col = 'PC1', 'PC2', 'PC3'

plot_title = f'3D Scatter Plot of {x_col} vs {y_col} vs {z_col}'
fig = px.scatter_3d(pca_df, x=x_col, y=y_col, z=z_col, color=z_col, title=plot_title)
fig.update_layout(width=1000, height=800)
fig.show()

## T-SNE

In [None]:
# 2D t-SNE embedding of the transformed data (first column skipped).
df = data_tranformed.copy()
# t-SNE is stochastic; a fixed random_state makes the plot reproducible on re-run.
tsne = TSNE(n_components=2, perplexity=100, max_iter=300, random_state=42)
tsne_result = tsne.fit_transform(df.iloc[:, 1:])
df['TSNE1'] = tsne_result[:, 0]
df['TSNE2'] = tsne_result[:, 1]

plt.figure(figsize=(10, 7))
sns.scatterplot(x='TSNE1', y='TSNE2', data=df)
plt.title('t-SNE of Website Embeddings')
plt.show()

In [None]:
# 3D t-SNE embedding of the transformed data (first column skipped).
df = data_tranformed.copy()
# t-SNE is stochastic; a fixed random_state makes the plot reproducible on re-run.
tsne = TSNE(n_components=3, perplexity=100, max_iter=300, random_state=42)
tsne_result = tsne.fit_transform(df.iloc[:, 1:])

df['TSNE1'] = tsne_result[:, 0]
df['TSNE2'] = tsne_result[:, 1]
df['TSNE3'] = tsne_result[:, 2]

x_col = 'TSNE1'
y_col = 'TSNE2'
z_col = 'TSNE3'

# Create a 3D scatter plot using Plotly, coloured by the third t-SNE axis
fig = px.scatter_3d(df, x=x_col, y=y_col, z=z_col, color=z_col,
                    title=f'3D Scatter Plot of {x_col} vs {y_col} vs {z_col}')

fig.update_layout(width=1000, height=800)

# Show the plot
fig.show()

## K-Means

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import normalize
from sklearn.metrics import silhouette_score, davies_bouldin_score, silhouette_samples

class WordEmbeddingClustering:
    """
    A class for clustering word embeddings from a pandas DataFrame where
    the first column contains the words and the remaining columns contain
    the embedding dimensions.
    """

    def __init__(self, embedding_df: pd.DataFrame):
        """
        Initialize with the embedding DataFrame.

        Parameters:
        -----------
        embedding_df : pd.DataFrame
            DataFrame where the first column is the word and the remaining columns are embedding dimensions.
        """
        self.embedding_df = embedding_df
        self.words = embedding_df.iloc[:, 0].tolist()
        self.embeddings = embedding_df.iloc[:, 1:].values
        # NOTE: no normalization is applied; this attribute is an alias of the
        # raw embeddings and is what every clustering/evaluation call uses.
        self.normalized_embeddings = self.embeddings
        self.labels_ = None
        # Set by cluster_embeddings(); initialized here so that reading it
        # before clustering yields None instead of raising AttributeError.
        self.labels_series = None
        self.model = None

    def cluster_embeddings(self, method: str = 'kmeans', n_clusters: int = 10, **kwargs):
        """
        Fit a clustering model on ``self.normalized_embeddings`` and store the labels.

        The fitted model is kept in ``self.model``, the label array in
        ``self.labels_`` and a word-indexed Series of labels in
        ``self.labels_series``.

        Parameters:
        -----------
        method : str
            Clustering algorithm ('kmeans', 'dbscan', or 'gmm').
        n_clusters : int
            Number of clusters (used for 'kmeans' and 'gmm'; ignored for 'dbscan').
        **kwargs
            Forwarded to the constructor of the selected model.

        Raises:
        -------
        ValueError
            If ``method`` is not one of the supported names.
        """
        if method == 'kmeans':
            self.model = KMeans(n_clusters=n_clusters, **kwargs)
        elif method == 'dbscan':
            self.model = DBSCAN(**kwargs)
        elif method == 'gmm':
            self.model = GaussianMixture(n_components=n_clusters, **kwargs)
        else:
            raise ValueError(f"Unknown method: {method}")

        self.labels_ = self.model.fit_predict(self.normalized_embeddings)
        self.labels_series = pd.Series(self.labels_, index=self.words, name='cluster')

    def get_cluster_words(self, n_words_per_cluster: int = 5):
        """
        Get representative words for each cluster.

        For every cluster (label -1 is skipped) the words closest to the
        cluster center in Euclidean distance are returned. The center is the
        model's ``cluster_centers_`` or ``means_`` entry when available,
        otherwise the mean of the cluster's embeddings.

        Parameters:
        -----------
        n_words_per_cluster : int
            Number of words per cluster to return.

        Returns:
        --------
        Dict[int, pd.DataFrame]
            Mapping of cluster labels to DataFrames (indexed by word, holding
            the embeddings) of representative words, closest first.

        Raises:
        -------
        ValueError
            If clustering has not been performed yet.
        """
        if self.labels_ is None:
            raise ValueError("Clustering has not been performed yet.")

        cluster_words = {}

        for label in np.unique(self.labels_):
            if label == -1:
                continue  # Skip noise points
            cluster_indices = np.where(self.labels_ == label)[0]
            cluster_embeddings = self.embeddings[cluster_indices]
            cluster_words_list = [self.words[idx] for idx in cluster_indices]
            cluster_df = pd.DataFrame(cluster_embeddings, index=cluster_words_list)

            if hasattr(self.model, 'cluster_centers_'):
                center = self.model.cluster_centers_[label]
            elif hasattr(self.model, 'means_'):
                center = self.model.means_[label]
            else:
                # Models without explicit centers: use the cluster mean
                center = cluster_embeddings.mean(axis=0)

            distances = np.linalg.norm(cluster_embeddings - center, axis=1)
            closest_indices = np.argsort(distances)[:n_words_per_cluster]
            cluster_words[label] = cluster_df.iloc[closest_indices]

        return cluster_words

    def get_cluster_statistics(self):
        """
        Calculate basic statistics for each cluster.

        Returns:
        --------
        pd.DataFrame
            DataFrame indexed by 'cluster' with a 'size' column. Label -1 is
            excluded; the frame is empty when no other label exists.

        Raises:
        -------
        ValueError
            If clustering has not been performed yet.
        """
        if self.labels_ is None:
            raise ValueError("Clustering has not been performed yet.")

        rows = [
            {'cluster': label, 'size': np.sum(self.labels_ == label)}
            for label in np.unique(self.labels_)
            if label != -1
        ]

        # Passing the columns explicitly keeps set_index working when `rows`
        # is empty (every point labelled -1).
        return pd.DataFrame(rows, columns=['cluster', 'size']).set_index('cluster')

    def evaluate_clustering(self):
        """
        Evaluate clustering using Silhouette Score and Davies-Bouldin Index,
        and provide per-cluster silhouette scores.

        Both metrics are computed over all labels present, including -1; only
        the per-cluster breakdown leaves -1 out. When the metrics are
        undefined (a single label, or as many labels as samples) all scores
        are reported as 0.0.

        Returns:
        --------
        Dict[str, Any]
            Dictionary with evaluation metrics, including per-cluster silhouette scores.

        Raises:
        -------
        ValueError
            If clustering has not been performed yet.
        """
        if self.labels_ is None:
            raise ValueError("Clustering has not been performed yet.")

        unique_labels = np.unique(self.labels_)
        # Both metrics need at least 2 and at most n_samples - 1 distinct labels
        if 1 < len(unique_labels) < len(self.labels_):
            # Compute silhouette scores for all samples
            sample_silhouette_values = silhouette_samples(self.normalized_embeddings, self.labels_)
            # Overall silhouette score
            sil_score = sample_silhouette_values.mean()
            # Per-cluster silhouette scores
            cluster_silhouette_scores = {}
            for label in unique_labels:
                if label == -1:  # Skip noise points if using DBSCAN
                    continue
                cluster_mask = self.labels_ == label
                cluster_silhouette_scores[label] = sample_silhouette_values[cluster_mask].mean()
            # Davies-Bouldin Index
            db_score = davies_bouldin_score(self.normalized_embeddings, self.labels_)
        else:
            sil_score = 0.0
            db_score = 0.0
            cluster_silhouette_scores = {label: 0.0 for label in unique_labels}

        return {
            'silhouette_score': sil_score,
            'davies_bouldin_index': db_score,
            'cluster_silhouette_scores': cluster_silhouette_scores
        }

    def find_optimal_clusters(self, k_range):
        """
        Fit KMeans (random_state=42) for every k in ``k_range`` and collect
        the metrics used to choose the number of clusters.

        Parameters:
        -----------
        k_range : iterable of int
            Candidate numbers of clusters.

        Returns:
        --------
        Dict[str, list]
            'k_values', 'wcss' (KMeans inertia), 'silhouette_scores' and
            'davies_bouldin_scores', each with one entry per k.
        """
        wcss = []
        silhouette_scores = []
        db_scores = []
        for k in k_range:
            kmeans = KMeans(n_clusters=k, random_state=42)
            labels = kmeans.fit_predict(self.normalized_embeddings)
            wcss.append(kmeans.inertia_)
            silhouette_scores.append(silhouette_score(self.normalized_embeddings, labels))
            db_scores.append(davies_bouldin_score(self.normalized_embeddings, labels))

        metrics = {
            'k_values': list(k_range),
            'wcss': wcss,
            'silhouette_scores': silhouette_scores,
            'davies_bouldin_scores': db_scores
        }
        return metrics


In [None]:
# Cluster the transformed data and report the evaluation metrics.
clusterer = WordEmbeddingClustering(data_tranformed)
clusterer.cluster_embeddings(
    method='dbscan',
    n_clusters=7  # NOTE: not used by 'dbscan'; only 'kmeans' and 'gmm' read it
)

evaluation = clusterer.evaluate_clustering()

# Show only the scalar metrics in the overview table: the nested per-cluster
# dict cannot be aligned to the single 'value' row and would show up as NaN.
overall_scores = {
    name: value
    for name, value in evaluation.items()
    if name != "cluster_silhouette_scores"
}
print("\nClustering Evaluation:")
print(pd.DataFrame(overall_scores, index=['value']).T)
print("----"*20)
print(pd.DataFrame.from_dict(evaluation["cluster_silhouette_scores"], orient='index', columns=['silhouette_score']))

In [None]:
# Show the entries that were assigned cluster label 4.
clusterer.labels_series[clusterer.labels_series == 4]

In [None]:
# Candidate numbers of clusters
k_range = range(2, 24)

# Find optimal clusters: one KMeans fit per k
metrics = clusterer.find_optimal_clusters(k_range)

# (metrics key, y-axis label, title) for the Elbow Method, Silhouette Analysis
# and Davies-Bouldin Index plots, drawn in that order
metric_plots = [
    ('wcss', 'WCSS', 'Elbow Method'),
    ('silhouette_scores', 'Average Silhouette Score', 'Silhouette Analysis'),
    ('davies_bouldin_scores', 'Davies-Bouldin Index', 'Davies-Bouldin Index Analysis'),
]

for metric_key, y_label, plot_title in metric_plots:
    plt.figure(figsize=(8, 5))
    plt.plot(metrics['k_values'], metrics[metric_key], 'bo-')
    plt.xlabel('Number of Clusters (k)')
    plt.ylabel(y_label)
    plt.title(plot_title)
    plt.xticks(metrics['k_values'])
    plt.grid(True)
    plt.show()

## Similarity Matrix

In [None]:
# Run the similarity analysis on the transformed data
results = analyze_web_similarity(data_tranformed)

In [None]:
# Most similar sites found for wine.com
results["similarity_dict"]["wine.com"]

# Analyza Nul

In [None]:
# Work on a copy so the zero analysis does not modify the transformed data.
nuly = data_tranformed.copy()

In [None]:
# Zero indicator: 1.0 where the transformed value is exactly zero, 0.0 elsewhere.
# It is derived from data_tranformed rather than from nuly itself, so re-running
# this cell gives the same result instead of inverting the indicator, and the
# "site" column keeps the site names instead of being overwritten with 0.
maska_nul = data_tranformed == 0
nuly = maska_nul.drop(columns="site").astype(float)
nuly.insert(0, "site", data_tranformed["site"])

Sloupce, ktery korelujou svyma nulama a zbytkem, jsou pravdepodobne pocet navstevnosti a delka stravena na strance.

In [None]:
# Correlation between the zero indicators of the columns ("site" excluded);
# only entries with an absolute correlation above 0.5 are drawn.
zero_indicators = nuly.drop("site", axis=1)
nuly_corr = zero_indicators.corr()

strong_mask = abs(nuly_corr) > 0.5
plt.figure(figsize=(16, 12))
sns.heatmap(nuly_corr[strong_mask], annot=True, fmt='.2f', cmap='coolwarm')

Jeste se podivame na Jaccardovu podobnost nul

In [None]:
from sklearn.metrics import pairwise_distances

# Zero-indicator columns only (the "site" column is dropped once and reused)
zero_flags = nuly.drop("site", axis=1)

# Pairwise Jaccard distance between all rows
jaccard_distances = pairwise_distances(zero_flags.values, metric="jaccard")

# Similarity is the complement of the distance
jaccard_similarities = 1 - jaccard_distances

# Square similarity table labelled with the row index on both axes
row_labels = zero_flags.index
similarity_jaccard_df = pd.DataFrame(jaccard_similarities, index=row_labels, columns=row_labels)

print(similarity_jaccard_df)


In [None]:
# For every site, collect the n_similar sites with the most similar pattern of zeros.
similarity_dict = {}
n_similar = 5

# Work on the raw matrix so that all indexing is positional.
jaccard_matrix = similarity_jaccard_df.to_numpy()

for i, website in enumerate(data_tranformed["site"]):
    ranked = np.argsort(jaccard_matrix[i])[::-1]
    # Exclude the site itself by position: several sites can share the same
    # zero pattern (similarity 1.0), so the site is not guaranteed to be
    # ranked first.
    similar_indices = ranked[ranked != i][:n_similar]
    similar_websites = [
        {
            'web': data_tranformed["site"].iloc[idx],
            'jaccard_similarity': jaccard_matrix[i, idx]
        }
        for idx in similar_indices
    ]
    similarity_dict[website] = similar_websites

In [None]:
# Sites whose zero pattern is most similar to wine.com
similarity_dict["wine.com"] 

# Dalsi kroky

* H2O automl pro clustering
* pomoci elbow metody zjistit pocet clusteru, metrika silhouette
* 