## 1. ライブラリのインポート

In [1]:
%load_ext autoreload
%autoreload 2

# Install the optional third-party packages only when they are missing, so that
# a "Restart & Run All" does not invoke pip (and the network) on every run.
import importlib.util
import subprocess
import sys

# pip distribution name -> importable module name (they differ for umap-learn)
required_packages = {"hdbscan": "hdbscan", "umap-learn": "umap"}

missing_packages = [
    dist_name
    for dist_name, module_name in required_packages.items()
    if importlib.util.find_spec(module_name) is None
]

if missing_packages:
    print("Installing required packages...")
    # Run pip through the kernel's own interpreter so the install targets this environment
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *missing_packages])
    # Let the import system notice the freshly installed packages in this session
    importlib.invalidate_caches()
    print("[OK] Packages installed\n")
else:
    print("[OK] Required packages already installed\n")

# Make the modules under ../src importable; the guard keeps cell re-runs from
# appending the same entry again
if '../src' not in sys.path:
    sys.path.append('../src')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from pathlib import Path

from loader.huggingface_loader import HuggingFaceLoader
from loader.csv_loader import CSVLoader
from preprocessor.text_cleaner import TextCleaner
from preprocessor.tokenizer import Tokenizer
from analysis.vectorizer import TextVectorizer
from analysis.clustering import TopicClusterer
from analysis.cooccurrence import CooccurrenceNetwork
from visualization.visualizer import EmbeddingVisualizer

print("[OK] All libraries imported successfully")

Installing required packages...
[OK] Packages installed

[OK] Packages installed



  from .autonotebook import tqdm as notebook_tqdm


[OK] All libraries imported successfully


## 2. サンプルデータの生成

In [2]:
# Generate a small in-memory set of sample chat messages
sample_chats = [
    "Python は素晴らしいプログラミング言語です",
    "機械学習と深層学習は AI の中核です",
    "Python は機械学習ライブラリが充実しています",
    "ディープニューラルネットワークは強力です",
    "データ分析は統計学に基づいています",
    "Python は データ科学に最適です",
    "学習アルゴリズムの最適化は重要です",
    "分類問題と回帰問題があります",
    "自然言語処理は テキスト分析に使われます",
    "ベクトル化はテキストを数値に変換します",
    "クラスタリングはデータのグループ化です",
    "可視化は結果の理解を助けます",
]

# One row per message: three synthetic user ids assigned round-robin and
# timestamps spaced one hour apart starting at 2024-01-01.
# The lowercase 'h' alias is used because the uppercase 'H' alias is deprecated
# in recent pandas releases and emits a FutureWarning.
n_messages = len(sample_chats)
df = pd.DataFrame({
    'user_id': [f'user_{i % 3}' for i in range(n_messages)],
    'timestamp': pd.date_range('2024-01-01', periods=n_messages, freq='h'),
    'message': sample_chats
})

print(f"[OK] Sample data created: {len(df)} messages")
print(df.head())

[OK] Sample data created: 12 messages
  user_id           timestamp                    message
0  user_0 2024-01-01 00:00:00   Python は素晴らしいプログラミング言語です
1  user_1 2024-01-01 01:00:00        機械学習と深層学習は AI の中核です
2  user_2 2024-01-01 02:00:00  Python は機械学習ライブラリが充実しています
3  user_0 2024-01-01 03:00:00       ディープニューラルネットワークは強力です
4  user_1 2024-01-01 04:00:00          データ分析は統計学に基づいています


  'timestamp': pd.date_range('2024-01-01', periods=len(sample_chats), freq='H'),


## 3. テキスト前処理

In [3]:
# Run TextCleaner.clean over every raw message and store the result in a new column
df['cleaned_message'] = df['message'].apply(TextCleaner.clean)

print("[OK] Text cleaning completed")
print("\nBefore/After comparison:")
# Show the first three messages next to their cleaned counterparts
for row_idx in range(3):
    sample_row = df.iloc[row_idx]
    print(f"\nOriginal:  {sample_row['message']}")
    print(f"Cleaned:   {sample_row['cleaned_message']}")

[OK] Text cleaning completed

Before/After comparison:

Original:  Python は素晴らしいプログラミング言語です
Cleaned:   Python は素晴らしいプログラミング言語です

Original:  機械学習と深層学習は AI の中核です
Cleaned:   機械学習と深層学習は AI の中核です

Original:  Python は機械学習ライブラリが充実しています
Cleaned:   Python は機械学習ライブラリが充実しています


## 4. トークン化

In [4]:
# Tokenize each cleaned message (tokenizer constructed with use_unidic=True)
tokenizer = Tokenizer(use_unidic=True)

df['tokens'] = df['cleaned_message'].apply(tokenizer.tokenize)

print("[OK] Tokenization completed")
print("\nSample tokenization:")
# Print the first three messages together with their token lists
for row_idx in range(3):
    sample_row = df.iloc[row_idx]
    print(f"{sample_row['cleaned_message']}")
    print(f"  Tokens: {sample_row['tokens']}")
    print()

[OK] Tokenization completed

Sample tokenization:
Python は素晴らしいプログラミング言語です
  Tokens: ['Python', 'は', '素晴らしい', 'プログラミング', '言語', 'です']

機械学習と深層学習は AI の中核です
  Tokens: ['機械', '学習', 'と', '深層', '学習', 'は', 'AI', 'の', '中核', 'です']

Python は機械学習ライブラリが充実しています
  Tokens: ['Python', 'は', '機械', '学習', 'ライブラリ', 'が', '充実', 'し', 'て', 'い', 'ます']



## 5. ベクトル化（Sentence-Transformers）

In [5]:
# Encode the cleaned messages into embedding vectors.
# use_mock=True asks the vectorizer for its mock implementation.
vectorizer = TextVectorizer(use_mock=True)

texts = df['cleaned_message'].tolist()
embeddings = vectorizer.encode(texts, normalize=True)

first_vector = embeddings[0]
print(f"[OK] Vectorization completed")
print(f"Embeddings shape: {embeddings.shape}")
print(f"Sample embedding (first 10 dims): {first_vector[:10]}")

[OK] TextVectorizer initialized: sentence-transformers/paraphrase-multilingual-mpnet-base-v2
[OK] Embedding dimension: 768
[OK] Device: cpu
[WARN] Using mock implementation for testing
[OK] Vectorization completed
Embeddings shape: (12, 768)
Sample embedding (first 10 dims): [ 5.1080761e-03 -1.6294312e-02 -6.3799584e-05  3.3117939e-02
  6.3065194e-02 -1.5186901e-02 -3.3560134e-02 -1.8570506e-03
  2.4474079e-02 -4.9794219e-02]


## 6. クラスタリング（HDBSCAN）

In [7]:
# Cluster the embeddings; the parameters are tuned to encourage cluster formation
clusterer = TopicClusterer(min_cluster_size=2, min_samples=1)
clusterer.fit(embeddings)

# Attach the per-message cluster label to the data frame
cluster_labels = clusterer.get_cluster_labels()
df['cluster'] = cluster_labels

print(f"[OK] Clustering completed")
print(f"Number of clusters: {clusterer.n_clusters}")
print(f"\nCluster distribution:")
cluster_sizes = df['cluster'].value_counts().sort_index()
print(cluster_sizes)

# Per-cluster statistics; the 'noise' entry is skipped
stats = clusterer.get_cluster_stats()
print(f"\nCluster statistics:")
for cluster_id, stat in stats.items():
    if cluster_id == 'noise':
        continue
    print(f"  Cluster {cluster_id}: {stat['size']} items, mean_dist={stat['mean_distance']:.3f}")

[OK] TopicClusterer initialized: min_cluster_size=2, metric=euclidean
[OK] HDBSCAN fitting completed
[OK] Clusters: 4, Noise points: 4
[OK] Clustering completed
Number of clusters: 4

Cluster distribution:
cluster
-1    4
 0    2
 1    2
 2    2
 3    2
Name: count, dtype: int64

Cluster statistics:
  Cluster 0: 2 items, mean_dist=0.683
  Cluster 1: 2 items, mean_dist=0.690
  Cluster 2: 2 items, mean_dist=0.689
  Cluster 3: 2 items, mean_dist=0.665


## 7. UMAP 次元圧縮と可視化

In [8]:
# Compress the embeddings to two dimensions with UMAP
visualizer = EmbeddingVisualizer(n_neighbors=5, min_dist=0.1, metric='euclidean')
embeddings_2d = visualizer.fit_transform(embeddings)

print(f"[OK] UMAP transformation completed")
print(f"2D embeddings shape: {embeddings_2d.shape}")

# Report the extent of each projected axis (column 0 = X, column 1 = Y)
x_coords, y_coords = embeddings_2d[:, 0], embeddings_2d[:, 1]
print(f"X range: [{x_coords.min():.3f}, {x_coords.max():.3f}]")
print(f"Y range: [{y_coords.min():.3f}, {y_coords.max():.3f}]")

[OK] EmbeddingVisualizer initialized: n_neighbors=5, metric=euclidean
[OK] UMAP fit_transform completed
[OK] Embeddings shape: (12, 2)
[OK] UMAP transformation completed
2D embeddings shape: (12, 2)
X range: [2.575, 5.671]
Y range: [12.015, 14.556]
[OK] UMAP fit_transform completed
[OK] Embeddings shape: (12, 2)
[OK] UMAP transformation completed
2D embeddings shape: (12, 2)
X range: [2.575, 5.671]
Y range: [12.015, 14.556]


## 8. インタラクティブな散布図（クラスタ色分け）

In [9]:
# Interactive scatter plot of the 2D embeddings; cluster labels are passed
# alongside the original texts
scatter_options = dict(
    embeddings_2d=embeddings_2d,
    texts=texts,
    labels=df['cluster'].values,
    title="Chat Log Clustering Visualization (UMAP + HDBSCAN)",
    height=700,
    width=1000,
)
fig = visualizer.plot_scatter(**scatter_options)

fig.show()

print("[OK] Interactive scatter plot displayed")

[OK] Scatter plot created: 12 points


[OK] Interactive scatter plot displayed


## 9. 共起分析

In [10]:
# Build the co-occurrence network from the tokenized messages
tokenized_docs = df['tokens'].tolist()
network = CooccurrenceNetwork(window_size=3, min_frequency=1)
graph = network.build_network(tokenized_docs)

n_nodes = graph.number_of_nodes()
n_edges = graph.number_of_edges()
print(f"[OK] Co-occurrence network built")
print(f"Network nodes: {n_nodes}")
print(f"Network edges: {n_edges}")

# Top keywords by frequency
top_keywords = network.get_keywords_by_frequency(top_n=10)
print(f"\nTop 10 keywords by frequency:")
print(top_keywords)

[OK] CooccurrenceNetwork initialized: window_size=3, min_freq=1
[OK] Cooccurrence matrix built: 54 unique words, 202 edges
[OK] Network built: 54 nodes, 202 edges
[OK] Co-occurrence network built
Network nodes: 54
Network edges: 202

Top 10 keywords by frequency:
    word  frequency
1      は        106
28     に         46
11     の         44
6     学習         38
5     です         36
19    ます         36
33     化         36
24   データ         26
41  テキスト         24
45     を         22


## 10. 共起エッジの表示

In [11]:
# Top co-occurrence edges, requested with top_n=10
top_edges = network.get_top_edges(top_n=10)
# A single print with a newline separator emits the heading and the table on separate lines
print("Top 10 co-occurring word pairs:", top_edges, sep="\n")

Top 10 co-occurring word pairs:
      word1 word2  frequency
26        の     は          8
0    Python     は          6
71        は   データ          6
113       は     化          6
109       の     化          6
78        に     は          6
12       学習    機械          4
32       です     の          4
15        と    学習          4
22        は    学習          4


## 11. 中心性指標

In [14]:
# Centrality measures taken from the co-occurrence network
degree = network.get_node_degree()
strength = network.get_node_strength()
# NOTE(review): betweenness is computed here but never printed in this cell
betweenness = network.get_betweenness_centrality()

print("Network centrality measures:")

# Five entries with the largest degree value
print(f"\nTop 5 nodes by degree:")
ranked_by_degree = sorted(degree.items(), key=lambda pair: pair[1], reverse=True)
top_degree = ranked_by_degree[:5]
for word, deg in top_degree:
    print(f"  {word}: {deg}")

# Five entries with the largest strength value
print(f"\nTop 5 nodes by strength:")
ranked_by_strength = sorted(strength.items(), key=lambda pair: pair[1], reverse=True)
top_strength = ranked_by_strength[:5]
for word, str_val in top_strength:
    print(f"  {word}: {str_val:.3f}")

Network centrality measures:

Top 5 nodes by degree:
  は: 37
  に: 19
  です: 15
  の: 15
  学習: 14

Top 5 nodes by strength:
  は: 106.000
  に: 46.000
  の: 44.000
  です: 36.000
  学習: 36.000


## 12. クラスタ要約

In [15]:
# Cluster summary, requested with top_n=3
summary = clusterer.get_cluster_summary(texts, top_n=3)

print("Cluster Summary:")
for cluster_id, info in summary.items():
    print(f"\n=== Cluster {cluster_id} ===")
    print(f"Size: {info['size']}")
    print(f"Representative texts:")
    for rank, text in enumerate(info['representative_texts'], start=1):
        # Texts longer than 50 characters are cut to 50 and marked with "..."
        shown = f"{text[:50]}..." if len(text) > 50 else text
        print(f"  {rank}. {shown}")

Cluster Summary:

=== Cluster 0 ===
Size: 2
Representative texts:
  1. ディープニューラルネットワークは強力です
  2. 可視化は結果の理解を助けます

=== Cluster 1 ===
Size: 2
Representative texts:
  1. Python は機械学習ライブラリが充実しています
  2. Python は データ科学に最適です

=== Cluster 2 ===
Size: 2
Representative texts:
  1. 学習アルゴリズムの最適化は重要です
  2. クラスタリングはデータのグループ化です

=== Cluster 3 ===
Size: 2
Representative texts:
  1. 機械学習と深層学習は AI の中核です
  2. 分類問題と回帰問題があります


## 13. 結果サマリー

In [16]:
# Final recap of every pipeline stage, built from objects created in earlier
# cells (df, embeddings, vectorizer, clusterer, network, top_keywords).
rule = "=" * 70

print("\n" + rule)
print("Chat Log Analysis Summary")
print(rule)

print("\nData Processing:")
print(f"  - Total messages: {len(df)}")
print(f"  - Unique users: {df['user_id'].nunique()}")

print("\nVectorization:")
print(f"  - Embedding dimension: {embeddings.shape[1]}")
print(f"  - Model: {vectorizer.model_name}")

print("\nClustering (HDBSCAN):")
print(f"  - Number of clusters: {clusterer.n_clusters}")
print(f"  - Noise points: {(df['cluster'] == -1).sum()}")
# Ask for the silhouette score once and reuse it, instead of calling the
# method a second time just to format the value
silhouette = clusterer.get_silhouette_score()
if silhouette is not None:
    print(f"  - Silhouette score: {silhouette:.3f}")

print("\nVisualization:")
print("  - Dimensionality reduction: UMAP")
print("  - Final dimensions: 2D")
print("  - Interactive: Yes (Plotly)")

print("\nCo-occurrence Analysis:")
print(f"  - Total unique words: {network.graph.number_of_nodes()}")
print(f"  - Co-occurrence edges: {network.graph.number_of_edges()}")
# Guard against an empty keyword table so the summary cannot fail on .iloc[0]
if top_keywords.empty:
    print("  - Most frequent word: n/a")
else:
    top_row = top_keywords.iloc[0]
    print(f"  - Most frequent word: {top_row['word']} ({top_row['frequency']} occurrences)")

print("\n" + rule)
print("Analysis Complete!")
print(rule)


Chat Log Analysis Summary

Data Processing:
  - Total messages: 12
  - Unique users: 3

Vectorization:
  - Embedding dimension: 768
  - Model: sentence-transformers/paraphrase-multilingual-mpnet-base-v2

Clustering (HDBSCAN):
  - Number of clusters: 4
  - Noise points: 4
  - Silhouette score: 0.033

Visualization:
  - Dimensionality reduction: UMAP
  - Final dimensions: 2D
  - Interactive: Yes (Plotly)

Co-occurrence Analysis:
  - Total unique words: 54
  - Co-occurrence edges: 202
  - Most frequent word: は (106 occurrences)

Analysis Complete!
