In [None]:
from pathlib import Path
from collections import Counter

# Common parent of the agh_edu_s1 ... agh_edu_s7 "*_filtered" directories.
DATA_ROOT = Path("/Users/wnowogorski/PycharmProjects/CHAT_AGH/src/data")

# One KEEP directory per numbered set, in ascending set order (s1 through s7).
paths = [
    DATA_ROOT / f"agh_edu_s{set_no}_filtered" / "KEEP"
    for set_no in range(1, 8)
]
import os
import json

counter = Counter()

# Load every document found in the KEEP directories as parsed JSON.
files = []
for path in paths:
    # os.listdir() returns entries in arbitrary order; sorting keeps the load
    # order stable, so the positional slices taken from the filtered list in
    # later cells select the same documents on every run.
    for file in sorted(os.listdir(path)):
        entry = path / file
        # Skip hidden entries (e.g. ".DS_Store") and sub-directories: handing
        # them to json.load() would abort the whole load.
        if file.startswith(".") or not entry.is_file():
            continue
        # Decode as UTF-8 explicitly instead of relying on the locale default.
        with open(entry, "r", encoding="utf-8") as f:
            data = json.load(f)

        files.append(data)

In [None]:
def get_domain(file):
    """Return the host part of a document's source URL, without a leading "www.".

    Args:
        file: A loaded document dict; the URL is read from file["metadata"]["url"].

    Returns:
        The host as a string (e.g. "rekrutacja.agh.edu.pl"), or the sentinel
        "Not Found" when the document has no metadata, no URL, or a URL that
        is not a non-empty string.
    """
    # A missing or null "metadata" entry is treated like a missing URL.
    metadata = file.get("metadata") or {}
    url = metadata.get("url", "Not Found")
    if not isinstance(url, str) or not url.strip() or url == "Not Found":
        return "Not Found"

    scheme, separator, remainder = url.partition("://")
    if not separator or "/" in scheme:
        # No leading scheme ("agh.edu.pl/page", "//agh.edu.pl/page"): the host
        # is the first path-like segment. The '"/" in scheme' check stops a
        # "://" that occurs later in the URL (e.g. inside a query string) from
        # being mistaken for the scheme separator.
        remainder = url.lstrip("/")

    domain = remainder.split("/", 1)[0]
    if domain.startswith("www."):
        domain = domain[4:]
    return domain

# Tally documents per source domain. This runs on `files`, the list loaded by
# the first cell: `filtered_files` is only created by a later cell, so using it
# here raises NameError when the notebook is executed top to bottom.
Counter(get_domain(f) for f in files)

In [None]:
# Domains whose documents are left out of `filtered_files`.
# Each domain is listed once ("dss.agh.edu.pl" and "sylabusy.agh.edu.pl" used
# to appear twice). "historia_agh.agh.edu.pl" is excluded here and exported
# separately by the next cell.
rejected_domains = [
    "rekrutacja.agh.edu.pl",
    "dss.agh.edu.pl",
    "sylabusy.agh.edu.pl",
    "miasteczko.agh.edu.pl",
    "akademik.agh.edu.pl",
    "historia_agh.agh.edu.pl"
]

# Set lookup so the per-document membership test does not rescan the list.
_rejected_lookup = frozenset(rejected_domains)

filtered_files = [file for file in files if get_domain(file) not in _rejected_lookup]

In [None]:
# Documents from the AGH history site, exported as their own collection.
history_agh = [file for file in files if get_domain(file) == "historia_agh.agh.edu.pl"]

# NOTE(review): this export used to target "/collections/historia_agh/" at the
# filesystem root. Every other collection in this notebook lives under
# .../CHAT_AGH/src/collections/, so the same root is used here - confirm that
# this is the intended destination.
history_dir = "/Users/wnowogorski/PycharmProjects/CHAT_AGH/src/collections/historia_agh"
os.makedirs(history_dir, exist_ok=True)  # open(..., "w") does not create folders

# One JSON file per document, named by its position in `history_agh`.
for i, file in enumerate(history_agh):
    with open(f"{history_dir}/{i}.json", "w") as f:
        json.dump(file, f)

In [None]:
# Export `filtered_files` as two collections: the first 1100 documents go to
# agh_edu_1 and the remainder to agh_edu_2. Files are numbered from 0 inside
# each collection.
collections_root = "/Users/wnowogorski/PycharmProjects/CHAT_AGH/src/collections"

for collection, docs in (
    ("agh_edu_1", filtered_files[:1100]),
    ("agh_edu_2", filtered_files[1100:]),
):
    out_dir = f"{collections_root}/{collection}"
    # Create the target folder up front; open(..., "w") fails if it is missing.
    os.makedirs(out_dir, exist_ok=True)
    for i, file in enumerate(docs):
        with open(f"{out_dir}/{i}.json", "w") as f:
            json.dump(file, f)

In [None]:
# for i, file in enumerate(filtered_files[:1200]):
#     with open(f"/Users/wnowogorski/PycharmProjects/CHAT_AGH/src/collections/agh_edu1/{i}.json", "w") as f:
#         json.dump(file, f)

# Export `filtered_files` as four collections: three slices of 500 documents
# each (agh_edu_1 .. agh_edu_3) and everything from index 1500 onwards in
# agh_edu_4. Files are numbered from 0 inside each collection.
#
# NOTE(review): the preceding cell also writes into agh_edu_1 and agh_edu_2
# (split at 1100). If both cells are run, files from that split whose index is
# not rewritten here stay on disk next to the new ones.
collections_root = "/Users/wnowogorski/PycharmProjects/CHAT_AGH/src/collections"

collection_slices = [
    ("agh_edu_1", filtered_files[:500]),
    ("agh_edu_2", filtered_files[500:1000]),
    ("agh_edu_3", filtered_files[1000:1500]),
    ("agh_edu_4", filtered_files[1500:]),
]

for collection, docs in collection_slices:
    out_dir = f"{collections_root}/{collection}"
    # Create the target folder up front; open(..., "w") fails if it is missing.
    os.makedirs(out_dir, exist_ok=True)
    for i, file in enumerate(docs):
        with open(f"{out_dir}/{i}.json", "w") as f:
            json.dump(file, f)

In [None]:
# Merge each syllabus Markdown file with its "<name>_meta.json" sidecar into a
# single {"content", "metadata"} JSON document.
path = "/Users/wnowogorski/PycharmProjects/CHAT_AGH/src/collections/sylabusy_agh/"

for file in os.listdir(path):
    if file.endswith(".md"):
        # Decode as UTF-8 explicitly instead of relying on the locale default.
        with open(path + file, "r", encoding="utf-8") as f:
            content = f.read()

        # Strip only the final ".md" extension, so a name that itself contains
        # dots ("a.b.md") keeps its full stem ("a.b") instead of being cut at
        # the first dot.
        metadata_name = os.path.splitext(file)[0]
        with open(path + metadata_name + "_meta.json", "r", encoding="utf-8") as f:
            metadata = json.load(f)

        # NOTE(review): the merged file is written relative to the current
        # working directory, not into `path` - confirm this is intended.
        with open(f"{metadata_name}.json", "w") as f:
            json.dump({
                "content": content,
                "metadata": metadata,
            }, f)

In [None]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm
from sklearn.decomposition import PCA
import os
from typing import List, Dict, Any

# Build one record (domain, content, metadata) per document that has text.
processed_data = []

for item in tqdm(filtered_files):
    # The domain is resolved for every document, before the content check.
    domain = get_domain(item)
    content = item.get("content", "")

    # Drop documents whose content is missing, empty or whitespace-only.
    if not content or not content.strip():
        continue

    record = {
        "domain": domain,
        "content": content,
        "metadata": item.get("metadata", {})
    }
    processed_data.append(record)

print(f"Processed {len(processed_data)} documents with content")

# Tabular view of the records for the clustering steps below.
df = pd.DataFrame(processed_data)

# Documents per domain, most frequent first; the ten largest are displayed.
domain_counts = df['domain'].value_counts()
print(f"Found {len(domain_counts)} unique domains")
domain_counts.head(10)

In [None]:
# Sentence encoder used to embed each document. 'all-MiniLM-L6-v2' is the
# small/fast choice; 'all-mpnet-base-v2' is the higher-quality alternative.
model = SentenceTransformer('all-MiniLM-L6-v2')

print("Generating embeddings...")

# Encode the "content" column in fixed-size slices so that only one slice of
# texts is passed to the model at a time.
batch_size = 32
embeddings = []
for start in tqdm(range(0, len(df), batch_size)):
    texts = df['content'].iloc[start:start + batch_size].tolist()
    embeddings.extend(model.encode(texts))

# Stack the per-document vectors into a single array.
embeddings = np.array(embeddings)
print(f"Generated embeddings with shape: {embeddings.shape}")

In [None]:
# Elbow method: inspect how inertia falls as the number of clusters grows
def plot_elbow(embeddings, max_k=15):
    """Fit KMeans for k = 1..max_k and plot the inertia of each fit.

    Args:
        embeddings: Data passed unchanged to KMeans.fit for every k.
        max_k: Largest number of clusters to try (inclusive).

    Returns:
        None; the figure is shown with plt.show().
    """
    candidate_ks = range(1, max_k+1)

    # One independent fit per k; same seed and n_init for every fit.
    inertias = [
        KMeans(n_clusters=k, random_state=42, n_init=10).fit(embeddings).inertia_
        for k in candidate_ks
    ]

    plt.figure(figsize=(10, 6))
    plt.plot(candidate_ks, inertias, marker='o')
    plt.xlabel('Number of clusters')
    plt.ylabel('Inertia')
    plt.title('Elbow Method for Optimal k')
    plt.grid(True)
    plt.show()

# Draw the inertia-vs-k curve (k = 1..15 by default) to help choose k
plot_elbow(embeddings)

# Number of clusters used by every later cell. The value is chosen by hand:
# re-read the elbow plot above and adjust it if the data changes.
n_clusters = 10  # manual choice, not derived from the plot automatically

# Final clustering; same seed and n_init as the fits inside plot_elbow
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
clusters = kmeans.fit_predict(embeddings)

# Store each document's cluster label on df (adds/overwrites the 'cluster' column in place)
df['cluster'] = clusters

In [None]:
# Project the embeddings to 2-D with PCA so the clusters can be drawn.
# random_state pins the projection for the cases where scikit-learn selects a
# randomized SVD solver, matching the fixed seed already used for KMeans, so
# the scatter plot is the same on every run.
pca = PCA(n_components=2, random_state=42)
reduced_embeddings = pca.fit_transform(embeddings)

# One row per document: 2-D coordinates, cluster label and source domain.
viz_df = pd.DataFrame({
    'x': reduced_embeddings[:, 0],
    'y': reduced_embeddings[:, 1],
    'cluster': clusters,
    'domain': df['domain'].values
})

# Scatter plot of the projected documents, coloured by cluster.
plt.figure(figsize=(12, 8))
sns.scatterplot(x='x', y='y', hue='cluster', data=viz_df, palette='viridis', s=50, alpha=0.7)
plt.title('Document Clusters Visualization')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend(title='Cluster')
plt.show()

In [None]:
# Function to get top words for each cluster
def get_cluster_keywords(df, cluster_id, top_n=10):
    """Return the highest-scoring TF-IDF terms of one cluster.

    Args:
        df: DataFrame with a 'cluster' column and a 'content' column.
        cluster_id: Value of 'cluster' selecting the documents to analyse.
        top_n: Maximum number of terms to return.

    Returns:
        Up to top_n terms, ordered by descending summed TF-IDF score over the
        cluster's documents. An empty list when the cluster has no documents.
    """
    # Get documents in the cluster
    cluster_docs = df[df['cluster'] == cluster_id]['content'].tolist()

    # The vectorizer cannot be fitted on zero documents; report "no keywords"
    # instead of raising.
    if not cluster_docs:
        return []

    from sklearn.feature_extraction.text import TfidfVectorizer

    # Vocabulary is capped at 100 terms.
    # NOTE(review): stop_words='english' removes English stop words only -
    # confirm this matches the language of the documents.
    vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(cluster_docs)

    # Sum each term's TF-IDF weight over all documents in the cluster.
    # np.asarray(...).ravel() flattens the column sums whether the sparse
    # container returns a matrix or a plain array (.A1 exists only on matrices).
    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = np.asarray(tfidf_matrix.sum(axis=0)).ravel()

    # Indices of the top_n largest scores, best first
    top_words_idx = tfidf_scores.argsort()[-top_n:][::-1]
    top_words = [feature_names[i] for i in top_words_idx]

    return top_words

# Text report per cluster: size, top keywords and the most common domains
for cluster_id in range(n_clusters):
    # Rows of this cluster, selected once and reused below.
    members = df[df['cluster'] == cluster_id]
    cluster_size = len(members)
    print(f"\nCluster {cluster_id} - Size: {cluster_size} documents ({cluster_size/len(df)*100:.2f}%)")

    # Highest-scoring TF-IDF terms of the cluster
    keywords = get_cluster_keywords(df, cluster_id)
    print(f"Top keywords: {', '.join(keywords)}")

    # How the cluster's documents spread over source domains
    cluster_domains = members['domain'].value_counts()
    print(f"Number of unique domains in cluster: {len(cluster_domains)}")
    print("Top domains in this cluster:")
    for domain, count in cluster_domains.head(5).items():
        print(f"  - {domain}: {count} documents ({count/cluster_size*100:.2f}% of cluster)")

In [None]:
# Domain x cluster percentage matrix (all values are floats)
def analyze_domain_cluster_distribution(df, n_clusters):
    """Compute, per domain, the share of its documents that fall in each cluster.

    Args:
        df: DataFrame with a 'domain' column and a 'cluster' column.
        n_clusters: Number of clusters; columns are labelled 0..n_clusters-1.

    Returns:
        Float DataFrame indexed by every domain with at least 5 documents.
        Cell (domain, c) is the percentage (0-100) of that domain's documents
        assigned to cluster c.
    """
    docs_per_domain = df['domain'].value_counts()
    common_domains = docs_per_domain[docs_per_domain >= 5].index.tolist()

    # Start from an all-zero float grid so every cell is numeric from the outset.
    matrix = pd.DataFrame(0.0, index=common_domains, columns=range(n_clusters))

    for domain in common_domains:
        domain_labels = df.loc[df['domain'] == domain, 'cluster']
        total = len(domain_labels)
        if total == 0:
            # Nothing to divide by; the row stays at zero.
            continue

        per_cluster = domain_labels.value_counts()
        for cluster in range(n_clusters):
            share = per_cluster.get(cluster, 0) / total * 100
            matrix.loc[domain, cluster] = float(share)

    return matrix.astype(float)

# Percentage of each common domain's documents per cluster
domain_cluster_matrix = analyze_domain_cluster_distribution(df, n_clusters)

# Sanity check: report the dtype of the first cluster column
print("Matrix data type:", domain_cluster_matrix.dtypes[0])

# Heatmap of the matrix; the figure grows with the number of domains
# (0.4 per row) but is never shorter than 8.
heatmap_height = max(8, len(domain_cluster_matrix) * 0.4)
plt.figure(figsize=(12, heatmap_height))
sns.heatmap(domain_cluster_matrix, annot=True, fmt='.1f', cmap='YlGnBu')
plt.xlabel('Cluster')
plt.ylabel('Domain')
plt.title('Domain Distribution Across Clusters (%)')
plt.tight_layout()
plt.show()

In [None]:
# Alternative view of the domain/cluster distribution as grouped bars,
# built from plain Python records rather than a pre-typed matrix.

# Domains with at least 5 documents, most frequent first, capped at 15
domain_sizes = df['domain'].value_counts()
top_domains = domain_sizes[domain_sizes >= 5].index.tolist()[:15]

# One record per (domain, cluster) pair
plot_data = []
for domain in top_domains:
    domain_docs = df[df['domain'] == domain]
    total_docs = len(domain_docs)
    for cluster in range(n_clusters):
        cluster_docs = domain_docs[domain_docs['cluster'] == cluster]
        # Share of the domain's documents in this cluster; 0 for an empty domain
        if total_docs > 0:
            percentage = len(cluster_docs) / total_docs * 100
        else:
            percentage = 0
        plot_data.append({
            'Domain': domain,
            'Cluster': f'Cluster {cluster}',
            'Percentage': percentage
        })

plot_df = pd.DataFrame(plot_data)

# Grouped bar chart: one group per domain, one bar per cluster
plt.figure(figsize=(14, 10))
domain_cluster_plot = sns.barplot(x='Domain', y='Percentage', hue='Cluster', data=plot_df)
plt.title('Domain Distribution Across Clusters')
plt.ylabel('Percentage of Domain Documents (%)')
plt.xticks(rotation=45, ha='right')
plt.legend(title='Cluster')
plt.tight_layout()
plt.show()

In [None]:
# Calculate Silhouette Score to evaluate clustering quality
from sklearn.metrics import silhouette_score, calinski_harabasz_score

# Silhouette: values closer to 1 indicate better-defined clusters
silhouette_avg = silhouette_score(embeddings, clusters)
print(f"Silhouette Score: {silhouette_avg:.4f}")

# Calinski-Harabasz: higher values indicate better cluster separation
calinski_harabasz_avg = calinski_harabasz_score(embeddings, clusters)
print(f"Calinski-Harabasz Score: {calinski_harabasz_avg:.4f}")

In [None]:
# Find domains that are predominantly in one cluster (cluster specificity)
def analyze_domain_specificity(df, n_clusters):
    """Report, for each common domain, the cluster holding most of its documents.

    Args:
        df: DataFrame with a 'domain' column and a 'cluster' column.
        n_clusters: Accepted for call compatibility; not used in the computation.

    Returns:
        DataFrame with columns 'domain', 'dominant_cluster',
        'dominant_percentage' (0-100) and 'total_docs', one row per domain
        with at least 5 documents, sorted by 'dominant_percentage' descending.
        When no domain qualifies, an empty frame with those columns.
    """
    result_columns = ['domain', 'dominant_cluster', 'dominant_percentage', 'total_docs']

    # Get domains with at least 5 documents (counts computed once)
    docs_per_domain = df['domain'].value_counts()
    domains = docs_per_domain[docs_per_domain >= 5].index

    domain_specificity = []
    for domain in domains:
        domain_docs = df[df['domain'] == domain]
        total_docs = len(domain_docs)

        # Percentage of the domain's documents in each cluster
        cluster_distribution = domain_docs['cluster'].value_counts(normalize=True) * 100

        domain_specificity.append({
            'domain': domain,
            'dominant_cluster': cluster_distribution.idxmax(),
            'dominant_percentage': cluster_distribution.max(),
            'total_docs': total_docs
        })

    # Passing the columns explicitly keeps the sort key present even when no
    # domain qualified; otherwise sort_values raises KeyError on an empty frame.
    specificity_df = pd.DataFrame(domain_specificity, columns=result_columns)
    return specificity_df.sort_values('dominant_percentage', ascending=False)

# Per-domain dominant cluster, sorted so the most cluster-specific domains come first
domain_specificity = analyze_domain_specificity(df, n_clusters)

# Show the 15 domains with the highest share of documents in a single cluster
print("Domains most strongly associated with a single cluster:")
display(domain_specificity.head(15))

# Find the most representative documents for each cluster
def get_representative_docs(df, kmeans, embeddings, n=3):
    """Get, for each cluster, its documents closest to the cluster centroid.

    A document counts towards the cluster whose centroid is nearest to it, so
    a cluster with fewer than n documents yields fewer than n rows rather than
    being padded with documents that belong to other clusters.

    Args:
        df: DataFrame with 'domain' and 'content' columns; row i must
            correspond to embeddings[i].
        kmeans: Object exposing `cluster_centers_` (one centroid per row).
        embeddings: 2-D array of document vectors.
        n: Maximum number of documents to return per cluster.

    Returns:
        DataFrame with columns 'cluster', 'domain' and 'content', ordered by
        cluster and then by increasing distance to the centroid. Content
        longer than 300 characters is cut to 300 characters plus "...".
    """
    result_columns = ['cluster', 'domain', 'content']
    centers = kmeans.cluster_centers_
    embeddings = np.asarray(embeddings)

    if len(embeddings) == 0 or len(centers) == 0:
        return pd.DataFrame(columns=result_columns)

    # distances[d, c] = Euclidean distance from document d to centroid c
    distances = np.stack(
        [np.linalg.norm(embeddings - center, axis=1) for center in centers],
        axis=1,
    )
    # Cluster membership = index of the nearest centroid
    nearest_center = distances.argmin(axis=1)

    representative_docs = []

    for i in range(len(centers)):
        # Rank only this cluster's own documents by distance to its centroid;
        # a stable sort keeps ties in document order.
        member_idx = np.flatnonzero(nearest_center == i)
        ranked = member_idx[np.argsort(distances[member_idx, i], kind="stable")]
        closest_indices = ranked[:n]

        cluster_docs = df.iloc[closest_indices][['domain', 'content']]

        for _, doc in cluster_docs.iterrows():
            # Truncate content for display
            content = doc['content']
            if len(content) > 300:
                content = content[:300] + "..."

            representative_docs.append({
                'cluster': i,
                'domain': doc['domain'],
                'content': content
            })

    # Explicit columns keep the frame filterable by 'cluster' even when empty.
    return pd.DataFrame(representative_docs, columns=result_columns)

# Documents nearest to each centroid (default: up to 3 per cluster)
rep_docs = get_representative_docs(df, kmeans, embeddings)

# Print the representatives cluster by cluster, separated by a rule
separator = "-" * 80
for cluster in range(n_clusters):
    print(f"\n=== CLUSTER {cluster} REPRESENTATIVE DOCUMENTS ===")
    cluster_docs = rep_docs.loc[rep_docs['cluster'] == cluster]
    for i, doc in enumerate(cluster_docs.to_dict('records')):
        print(f"\nDocument {i+1} from {doc['domain']}:")
        print(doc['content'])
        print(separator)