# ML Training Pipeline - GitHub Repository Recommendations

This notebook trains and tests our ML recommendation system end to end:
- **Data Loading** from the `PROJECT_training` table
- **Feature Engineering** (TF-IDF, topics, numerical features)
- **Similarity Calculation** (cosine similarity matrix)
- **Model Training & Persistence**
- **Recommendation Testing** (get similar projects)
- **Performance Evaluation** (initial metrics)


In [1]:
# Setup and Data Loading
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
from sqlalchemy import create_engine, text
import os

# Pull the preprocessed projects out of the PROJECT_training table.
print("Loading preprocessed data from PROJECT_training table...")

# The connection string is read from DATABASE_URL; the literal is the fallback
# used when that variable is unset (it points at a database on localhost).
database_url = os.getenv('DATABASE_URL', 'postgresql://user:password@localhost:5434/ost_db')
engine = create_engine(database_url)

# Fetch every row and column of the training table into a DataFrame.
query = 'SELECT * FROM "PROJECT_training"'
df = pd.read_sql(query, engine)

# Short textual summary: row count, column names, projects per language.
project_count = len(df)
print(f"✅ Loaded {project_count} projects from PROJECT_training table")
column_names = list(df.columns)
print(f"Columns: {column_names}")
language_counts = df['language'].value_counts().to_dict()
print(f"Languages: {language_counts}")

# The trailing expression is rendered by the notebook as a preview table.
print("\nSample preprocessed data:")
df.head(3)


Loading preprocessed data from PROJECT_training table...
✅ Loaded 476 projects from PROJECT_training table
Columns: ['id', 'title', 'description', 'readme', 'language', 'topics', 'html_url', 'stargazers_count', 'forks_count', 'open_issues_count', 'pushed_at', 'text_features', 'created_at']
Languages: {'Python': 85, 'TypeScript': 84, 'JavaScript': 83, 'Go': 78, 'Rust': 74, 'Java': 72}

Sample preprocessed data:


Unnamed: 0,id,title,description,readme,language,topics,html_url,stargazers_count,forks_count,open_issues_count,pushed_at,text_features,created_at
0,56f66098-37f1-44db-a432-2a565f0cb4e0,awolfly9/IPProxyTool,ipproxytool python ip proxy tool scrapy crawl ...,# IPProxyTool\n使用 scrapy 爬虫抓取代理网站，获取大量的免费代理 ip...,Python,"proxy,python",https://github.com/awolfly9/IPProxyTool,1996,0,12,2022-12-08 07:42:07,,2025-07-19 14:19:36.338564
1,c277d2ee-dd5f-40c5-a43a-fd513352c3e2,jesolem/PCV,pcv open source python module for computer vision,## About PCV\nPCV is a pure Python library for...,Python,,https://github.com/jesolem/PCV,1946,0,27,2020-12-28 00:44:46,,2025-07-19 14:19:36.338564
2,683f349e-f45b-4009-97c7-b3058c1ab1b1,trycua/cua,cua c ua is the docker container for computer ...,"<div align=""center"">\n <picture>\n <source...",Python,"macos,swift,agent,windows",https://github.com/trycua/cua,9011,0,60,2025-07-17 18:25:40,,2025-07-19 14:19:36.338564


In [None]:
# Feature Engineering Pipeline
from infrastructure.analysis.feature_engineer import FeatureEngineer

print("=== FEATURE ENGINEERING ===")

# Columns handed to the feature pipeline. Per the original author's note,
# `description` holds the cleaned, combined text.
PROJECT_FIELDS = [
    'id',
    'title',
    'description',
    'language',
    'topics',
    'stargazers_count',
    'open_issues_count',
]

# Convert the DataFrame into a list of plain dicts (one per project, in row
# order) for the pipeline. `to_dict('records')` does this in a single call
# instead of building each dict by hand inside a slow `iterrows()` loop.
projects_data = df[PROJECT_FIELDS].to_dict('records')

# Initialize feature engineer
feature_engineer = FeatureEngineer()

print("Training feature engineering pipeline...")
feature_matrix = feature_engineer.fit_transform(projects_data)

print(f"✅ Feature matrix shape: {feature_matrix.shape}")
print(f"✅ Features per project: {feature_matrix.shape[1]}")

# Show feature engineering details
print(f"\n📊 TF-IDF vocabulary size: {len(feature_engineer.tfidf_vectorizer.vocabulary_)}")
print(f"📊 Topics vocabulary size: {len(feature_engineer.topic_encoder.classes_)}")
print(f"📊 Numerical features: {feature_engineer.numerical_features}")

# Sample of the TF-IDF vocabulary. The slice takes the first ten names in
# feature-index order; it does not rank terms by weight, so the label says so.
feature_names = feature_engineer.tfidf_vectorizer.get_feature_names_out()
print(f"\nFirst 10 TF-IDF terms (by feature index, not by weight): {feature_names[:10].tolist()}")


In [None]:
# Similarity Calculation
from infrastructure.analysis.similarity_calculator import SimilarityCalculator

print("=== SIMILARITY CALCULATION ===")

# Initialize similarity calculator
similarity_calculator = SimilarityCalculator()

print("Computing similarity matrix...")
similarity_matrix = similarity_calculator.compute_similarity(feature_matrix)

# Largest similarity between two *different* projects.
# np.fill_diagonal works in place and returns None, so the previous
# `np.fill_diagonal(copy, 0) or similarity_matrix.max()` expression always
# fell through to the full-matrix max (diagonal included). Select the
# off-diagonal entries with a boolean mask instead.
off_diagonal_mask = ~np.eye(*similarity_matrix.shape, dtype=bool)
off_diagonal_values = similarity_matrix[off_diagonal_mask]
max_off_diagonal = off_diagonal_values.max() if off_diagonal_values.size else float('nan')

print(f"✅ Similarity matrix shape: {similarity_matrix.shape}")
print(f"✅ Matrix density: {(similarity_matrix > 0).mean():.3f}")
print(f"✅ Average similarity: {similarity_matrix.mean():.4f}")
print(f"✅ Max similarity (excluding diagonal): {max_off_diagonal:.4f}")

# Show similarity distribution
import matplotlib.pyplot as plt

# Strict upper triangle (k=1): each unordered project pair exactly once,
# self-similarity on the diagonal excluded.
similarities = similarity_matrix[np.triu_indices_from(similarity_matrix, k=1)]

# Two panels side by side: histogram of pairwise scores, and a heatmap of the
# top-left corner of the matrix.
fig, (ax_hist, ax_heat) = plt.subplots(1, 2, figsize=(10, 4))

ax_hist.hist(similarities, bins=50, alpha=0.7, edgecolor='black')
ax_hist.set_title('Similarity Distribution')
ax_hist.set_xlabel('Cosine Similarity')
ax_hist.set_ylabel('Frequency')

# Heatmap of similarity matrix (sample): at most the first 20 projects.
sample_size = min(20, len(similarity_matrix))
sample_matrix = similarity_matrix[:sample_size, :sample_size]
heatmap = ax_heat.imshow(sample_matrix, cmap='viridis')
ax_heat.set_title(f'Similarity Matrix (first {sample_size}x{sample_size})')
fig.colorbar(heatmap, ax=ax_heat)

fig.tight_layout()
plt.show()

# Summary statistics over the distinct project pairs plotted above.
print(f"\n📊 Similarity stats:")
print(f"   - Mean: {similarities.mean():.4f}")
print(f"   - Std:  {similarities.std():.4f}")
print(f"   - Min:  {similarities.min():.4f}")
print(f"   - Max:  {similarities.max():.4f}")


In [None]:
# Model Persistence
from infrastructure.analysis.model_persistence_service import ModelPersistenceService

print("=== MODEL PERSISTENCE ===")

# A single service instance performs both the save and the reload below.
model_service = ModelPersistenceService()

# Bundle the artifacts produced by the earlier cells under the keys that the
# reload check further down reads back.
training_data = dict(
    feature_engineer=feature_engineer,
    similarity_matrix=similarity_matrix,
    projects_data=projects_data,
    feature_matrix=feature_matrix,
)

print("Saving trained models...")
model_path = model_service.save_model(training_data, "github_recommendations_v1")

print(f"✅ Models saved to: {model_path}")

# Round trip: load the bundle back under the same name.
print("\nTesting model loading...")
loaded_data = model_service.load_model("github_recommendations_v1")

print(f"✅ Loaded models:")
restored_engineer = loaded_data['feature_engineer']
print(f"   - Feature engineer: {type(restored_engineer)}")
restored_similarity = loaded_data['similarity_matrix']
print(f"   - Similarity matrix shape: {restored_similarity.shape}")
restored_projects = loaded_data['projects_data']
print(f"   - Projects count: {len(restored_projects)}")

# Check that the reloaded objects still expose their fitted attributes.
print(f"\n🔧 Model verification:")
print(f"   - TF-IDF vocab size: {len(loaded_data['feature_engineer'].tfidf_vectorizer.vocabulary_)}")
print(f"   - Topics classes: {len(loaded_data['feature_engineer'].topic_encoder.classes_)}")
print(f"   - Similarity matrix max: {loaded_data['similarity_matrix'].max():.4f}")


In [None]:
# Recommendation Testing
print("=== RECOMMENDATION TESTING ===")

def get_recommendations(project_idx, top_k=5):
    """Return the ``top_k`` projects most similar to the one at ``project_idx``.

    Reads the notebook-level ``similarity_matrix`` and ``projects_data``
    (assumed to be a square 2-D NumPy array and a list in the same row
    order -- confirm against the cells that build them).

    Args:
        project_idx: Row index of the target project.
        top_k: Maximum number of recommendations; values <= 0 yield ``[]``.

    Returns:
        A list of dicts ordered by descending similarity, each with keys
        ``'project'`` (the entry from ``projects_data``),
        ``'similarity_score'`` and ``'index'``.

    Raises:
        IndexError: If ``project_idx`` is outside the similarity matrix.
    """
    scores = similarity_matrix[project_idx]

    # Normalise negative indices so the self-exclusion below still matches.
    self_idx = project_idx % len(scores)

    # Rank by descending score; the stable sort keeps ties in index order.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the target by index rather than assuming it sorts first: with tied
    # scores (e.g. duplicate projects) or a zero self-similarity, slicing off
    # position 0 would discard a real neighbour and return the target itself.
    ranked = ranked[ranked != self_idx][:max(top_k, 0)]

    return [
        {
            'project': projects_data[idx],
            'similarity_score': scores[idx],
            'index': idx,
        }
        for idx in ranked
    ]

# Spot-check a few projects, chosen by row index, and print their neighbours
test_projects = [0, 1, 2, 10, 20]  # Sample different projects

for project_idx in test_projects:
    # Indices beyond the end of the dataset are silently skipped.
    if project_idx < len(projects_data):
        target_project = projects_data[project_idx]

        # Header describing the project we are asking about.
        print(f"\n🎯 TARGET PROJECT:")
        print(f"   Title: {target_project['title']}")
        print(f"   Language: {target_project['language']}")
        print(f"   Description: {target_project['description'][:100]}...")
        print(f"   Topics: {target_project['topics']}")

        recommendations = get_recommendations(project_idx, top_k=3)

        # One numbered entry per recommendation, best match first.
        print(f"\n🔍 TOP 3 RECOMMENDATIONS:")
        for rank, rec in enumerate(recommendations, start=1):
            candidate = rec['project']
            similarity = rec['similarity_score']
            print(f"   {rank}. {candidate['title']} (similarity: {similarity:.3f})")
            print(f"      Language: {candidate['language']} | Topics: {candidate['topics']}")
            print(f"      Description: {candidate['description'][:80]}...")
            print()

        # Visual separator between target projects.
        print("-" * 80)


In [None]:
# Performance Evaluation
print("=== PERFORMANCE EVALUATION ===")

def evaluate_language_consistency(num_samples=50, seed=None):
    """Fraction of recommendations that share the target project's language.

    Samples projects without replacement from the notebook-level
    ``projects_data``, asks ``get_recommendations`` for the top 5 neighbours
    of each, and counts how many have the same ``'language'`` value.

    Args:
        num_samples: Number of projects to sample. Clamped to the dataset
            size, because sampling without replacement cannot draw more
            items than exist.
        seed: Optional seed for a dedicated random generator, giving a
            reproducible sample. With ``None`` the global NumPy random state
            is used, as before.

    Returns:
        ``matches / total_recommendations`` in ``[0, 1]``, or ``0`` when
        nothing could be evaluated.
    """
    population = len(projects_data)
    sample_size = min(num_samples, population)
    if sample_size <= 0:
        # Empty dataset or non-positive request: nothing to evaluate.
        return 0

    # np.random (global state) and a seeded Generator both expose
    # choice(a, size, replace=...).
    rng = np.random if seed is None else np.random.default_rng(seed)
    sample_indices = rng.choice(population, sample_size, replace=False)

    matches = 0
    total_recommendations = 0
    for idx in sample_indices:
        target_language = projects_data[idx]['language']
        for rec in get_recommendations(idx, top_k=5):
            total_recommendations += 1
            if rec['project']['language'] == target_language:
                matches += 1

    return matches / total_recommendations if total_recommendations > 0 else 0

def _parse_topics(raw_topics):
    """Turn a stored topics value into a set of non-empty, stripped names."""
    if isinstance(raw_topics, str):
        parts = raw_topics.split(',')
    elif isinstance(raw_topics, (list, tuple, set, frozenset)):
        parts = raw_topics
    else:
        # None, NaN or any other non-text value counts as "no topics".
        return set()
    cleaned = (str(part).strip() for part in parts)
    return {topic for topic in cleaned if topic}


def evaluate_topic_overlap(num_samples=50, seed=None):
    """Average Jaccard overlap of topics between targets and recommendations.

    Samples projects without replacement from the notebook-level
    ``projects_data`` and compares each one's topics with those of its top 5
    neighbours from ``get_recommendations``. Pairs where either side has no
    topics are left out of the average.

    Args:
        num_samples: Number of projects to sample. Clamped to the dataset
            size, because sampling without replacement cannot draw more
            items than exist.
        seed: Optional seed for a dedicated random generator, giving a
            reproducible sample. With ``None`` the global NumPy random state
            is used, as before.

    Returns:
        Mean of ``|A ∩ B| / |A ∪ B|`` over the compared pairs, or ``0`` when
        no pair could be compared.
    """
    population = len(projects_data)
    sample_size = min(num_samples, population)
    if sample_size <= 0:
        # Empty dataset or non-positive request: nothing to evaluate.
        return 0

    rng = np.random if seed is None else np.random.default_rng(seed)
    sample_indices = rng.choice(population, sample_size, replace=False)

    total_overlap = 0
    total_comparisons = 0
    for idx in sample_indices:
        target_topics = _parse_topics(projects_data[idx]['topics'])
        recommendations = get_recommendations(idx, top_k=5)

        for rec in recommendations:
            rec_topics = _parse_topics(rec['project']['topics'])

            # Only score pairs where both sides have at least one topic.
            if target_topics and rec_topics:
                shared = len(target_topics & rec_topics)
                combined = len(target_topics | rec_topics)
                total_overlap += shared / combined
                total_comparisons += 1

    return total_overlap / total_comparisons if total_comparisons > 0 else 0

# Run evaluations
print("Evaluating recommendation quality...")

# Both evaluators sample projects at random; seeding NumPy's global random
# state makes the reported metrics repeatable across runs.
np.random.seed(42)

language_consistency = evaluate_language_consistency(num_samples=100)
topic_overlap = evaluate_topic_overlap(num_samples=100)

print(f"\n📊 RECOMMENDATION QUALITY METRICS:")
print(f"   🎯 Language Consistency: {language_consistency:.3f} ({language_consistency*100:.1f}%)")
print(f"   🏷️  Average Topic Overlap: {topic_overlap:.3f} ({topic_overlap*100:.1f}%)")

# Diversity analysis: share of ordered project pairs (i != j) whose similarity
# does NOT exceed the threshold. The diagonal is masked out explicitly rather
# than subtracted on the assumption that every self-similarity is above it.
similarity_threshold = 0.1
off_diagonal_mask = ~np.eye(*similarity_matrix.shape, dtype=bool)
similar_pairs = int((similarity_matrix[off_diagonal_mask] > similarity_threshold).sum())
total_pairs = len(similarity_matrix) * (len(similarity_matrix) - 1)
diversity_score = 1 - (similar_pairs / total_pairs) if total_pairs else 0.0

# The score is 1 minus the fraction of similar pairs, so a higher value means
# a more diverse catalogue.
print(f"   🌈 Diversity Score: {diversity_score:.3f} (higher = more diverse)")

# Summary
print(f"\n✅ TRAINING PIPELINE SUMMARY:")
print(f"   📁 Projects processed: {len(projects_data)}")
print(f"   🔧 Feature dimensions: {feature_matrix.shape[1]}")
print(f"   🎯 Similarity matrix: {similarity_matrix.shape}")
print(f"   💾 Model saved: github_recommendations_v1")
print(f"   📊 Language consistency: {language_consistency*100:.1f}%")
print(f"   🏷️  Topic overlap: {topic_overlap*100:.1f}%")
