# In [ ]:  (notebook-style cell prompt, commented out so the file parses as Python)
"""
Movie Knowledge Graph - Interactive Exploration Script

This script provides hands-on exploration of our movie knowledge graph 
using both RDF/SPARQL and Neo4j/Cypher approaches.

Learning Objectives:
1. Compare RDF vs Property Graph models
2. Execute queries interactively
3. Visualize graph structures
4. Analyze movie relationships
5. Build recommendation systems
"""

import sys
import os
sys.path.append('../src')

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from rdf_builder import MovieRDFBuilder
from neo4j_loader import MovieNeo4jLoader
from analytics import MovieAnalytics
from queries.sparql_examples import MovieSPARQLQueries
from queries.cypher_examples import MovieCypherQueries

# Plot appearance: apply the 'seaborn-v0_8' style sheet first, then set the
# default figure size afterwards so the style sheet cannot override it.
plt.style.use('seaborn-v0_8')
plt.rcParams.update({'figure.figsize': (12, 8)})

# Banner marking the start of the run.
print("Movie Knowledge Graph Explorer", "=" * 40, sep="\n")

# Section 1: Data Loading & Overview
print("\n1. DATA LOADING & OVERVIEW", "-" * 30, sep="\n")

# Raw movie table; `df` is reused by the genre, director and graph sections.
df = pd.read_csv('../data/raw/movies.csv')

# Pull out the two numeric columns once so the report lines stay short.
year_col = df['year']
rating_col = df['rating']

print("Dataset Overview:")
print(f"   Movies: {len(df)}")
print(f"   Directors: {df['director'].nunique()}")
print(f"   Year range: {year_col.min()} - {year_col.max()}")
print(f"   Rating range: {rating_col.min()} - {rating_col.max()}")

# Each table is computed only right before it is printed, so the first
# report is already on screen if the second computation fails.
for heading, make_table in (("Sample Data:", df.head), ("Statistics:", df.describe)):
    print(f"\n{heading}")
    print(make_table())

# Section 2: Build Knowledge Graphs
print("\n\n2. BUILD KNOWLEDGE GRAPHS", "-" * 30, sep="\n")

# --- RDF graph: build from the CSV, then report the builder's statistics ---
print("Building RDF Knowledge Graph...")
rdf_builder = MovieRDFBuilder()
rdf_builder.load_from_csv('../data/raw/movies.csv')
rdf_stats = rdf_builder.get_statistics()

print("\nRDF Graph Statistics:")
for stat_name, stat_value in rdf_stats.items():
    print(f"   {stat_name}: {stat_value}")

# --- Neo4j graph: same CSV, loaded through the Neo4j loader ---
print("\nBuilding Neo4j Property Graph...")
neo4j_loader = MovieNeo4jLoader()
# NOTE(review): clear_database() runs before the load - presumably it removes
# existing graph data, so point the loader at a disposable database; confirm
# against MovieNeo4jLoader.
neo4j_loader.clear_database()
neo4j_loader.load_from_csv('../data/raw/movies.csv')
neo4j_stats = neo4j_loader.get_statistics()

print("\nNeo4j Graph Statistics:")
for stat_name, stat_value in neo4j_stats.items():
    print(f"   {stat_name}: {stat_value}")

# Section 3: Query Comparison
print("\n\n3. QUERY COMPARISON: SPARQL vs CYPHER", "-" * 50, sep="\n")

# Query helpers: one wraps rdf_builder.graph, the other neo4j_loader.driver.
# Both names are reused by later sections.
sparql_queries = MovieSPARQLQueries(rdf_builder.graph)
cypher_queries = MovieCypherQueries(neo4j_loader.driver)

print("Query Comparison: Find all movies", "=" * 50, sep="\n")

# SPARQL side: run the query, tabulate it, show it.
print("\nSPARQL Results:")
sparql_movies = sparql_queries.find_all_movies()
sparql_df = pd.DataFrame(sparql_movies)
print(sparql_df)

# Cypher side: same steps, run only after the SPARQL table has been printed.
print("\nCypher Results:")
cypher_movies = cypher_queries.find_all_movies()
cypher_df = pd.DataFrame(cypher_movies)
print(cypher_df)

# Section 4: Genre Analysis
print("\n\n4. GENRE ANALYSIS")
print("-" * 30)

print("Genre Analysis")
print("=" * 30)

# Flatten the pipe-separated `genres` column into one list of genre labels,
# skipping rows whose genres value is missing.
all_genres = [
    genre
    for genres_str in df['genres']
    if pd.notna(genres_str)
    for genre in genres_str.split('|')
]

# Frequency table, most common genre first. The explicit object dtype keeps
# an empty list from triggering pandas' empty-Series dtype warning.
genre_counts = pd.Series(all_genres, dtype=object).value_counts()

if genre_counts.empty:
    # With no genres, the plots and `genre_counts.index[0]` below would raise
    # and abort the rest of the script, so report the situation instead.
    print("\nNo genre data available - skipping genre charts and statistics.")
else:
    # Side-by-side view: absolute counts (bar) and relative share (pie).
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Bar plot
    genre_counts.plot(kind='bar', ax=ax1, color='skyblue')
    ax1.set_title('Movies by Genre')
    ax1.set_xlabel('Genre')
    ax1.set_ylabel('Number of Movies')
    ax1.tick_params(axis='x', rotation=45)

    # Pie chart; the y-label is blanked so only the title describes the panel.
    genre_counts.plot(kind='pie', ax=ax2, autopct='%1.1f%%')
    ax2.set_title('Genre Distribution')
    ax2.set_ylabel('')

    plt.tight_layout()
    plt.show()

    print("\nGenre Statistics:")
    print(f"   Total unique genres: {len(genre_counts)}")
    # value_counts() sorts descending, so position 0 is the most common genre.
    print(f"   Most common genre: {genre_counts.index[0]} ({genre_counts.iloc[0]} movies)")
    print(f"   Average movies per genre: {genre_counts.mean():.1f}")

# Section 5: Director Analysis
print("\n\n5. DIRECTOR ANALYSIS", "-" * 30, sep="\n")

print("Director Analysis", "=" * 30, sep="\n")

# One row per director: film count, rating mean/max/min, first and last year.
# Named aggregation yields the flat column names directly.
director_stats = (
    df.groupby('director')
    .agg(
        movie_count=('title', 'count'),
        avg_rating=('rating', 'mean'),
        max_rating=('rating', 'max'),
        min_rating=('rating', 'min'),
        first_year=('year', 'min'),
        last_year=('year', 'max'),
    )
    .round(2)
)
director_stats = director_stats.sort_values('movie_count', ascending=False)

print("Director Statistics:")
print(director_stats)

# Two bar charts side by side, described as (axes, column, colour, title, y-label).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
panels = (
    (ax1, 'movie_count', 'lightcoral', 'Movies per Director', 'Number of Movies'),
    (ax2, 'avg_rating', 'lightgreen', 'Average Rating by Director', 'Average Rating'),
)
for panel_ax, column, bar_color, panel_title, y_label in panels:
    director_stats[column].plot(kind='bar', ax=panel_ax, color=bar_color)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Director')
    panel_ax.set_ylabel(y_label)
    panel_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Section 6: Graph Visualization
print("\n\n6. GRAPH VISUALIZATION")
print("-" * 30)

print("Graph Visualization")
print("=" * 30)

# Undirected graph with three node kinds (movie, director, genre); each node
# carries its kind in the `type` attribute.
G = nx.Graph()

for _, row in df.iterrows():
    movie_node = f"Movie: {row['title']}"
    G.add_node(movie_node, type='movie', year=row['year'], rating=row['rating'])

    # Skip missing directors, mirroring the genres guard below; otherwise
    # every movie without a director would hang off one "Director: nan" node.
    if pd.notna(row['director']):
        director_node = f"Director: {row['director']}"
        G.add_node(director_node, type='director')
        G.add_edge(movie_node, director_node, relationship='directed_by')

    # Genres are pipe-separated; each distinct genre becomes one shared node.
    if pd.notna(row['genres']):
        for genre in row['genres'].split('|'):
            genre_node = f"Genre: {genre}"
            G.add_node(genre_node, type='genre')
            G.add_edge(movie_node, genre_node, relationship='has_genre')

print("NetworkX Graph:")
print(f"   Nodes: {G.number_of_nodes()}")
print(f"   Edges: {G.number_of_edges()}")

# Visualization
plt.figure(figsize=(16, 12))

# Force-directed layout
pos = nx.spring_layout(G, k=3, iterations=50)

# One table drives both node colours and the legend, so the two cannot
# drift apart: node type -> (colour, legend label).
NODE_STYLES = {
    'movie': ('lightblue', 'Movies'),
    'director': ('lightcoral', 'Directors'),
    'genre': ('lightgreen', 'Genres'),
}
node_colors = [NODE_STYLES[attrs['type']][0] for _, attrs in G.nodes(data=True)]

# Draw graph
nx.draw(G, pos,
        node_color=node_colors,
        node_size=1000,
        font_size=8,
        font_weight='bold',
        with_labels=True,
        edge_color='gray',
        alpha=0.7)

plt.title('Movie Knowledge Graph Visualization', size=16, weight='bold')

# Legend: one coloured patch per node type, in the order of NODE_STYLES.
import matplotlib.patches as mpatches
legend_handles = [
    mpatches.Patch(color=color, label=label)
    for color, label in NODE_STYLES.values()
]
plt.legend(handles=legend_handles, loc='upper right')

plt.tight_layout()
plt.show()

# Section 7: Recommendation System
print("\n\n7. RECOMMENDATION SYSTEM")
print("-" * 40)

print("Movie Recommendation System")
print("=" * 40)


def recommend_movies_cypher(movie_title, limit=3):
    """Return recommendations for *movie_title* from the Cypher query helper.

    Args:
        movie_title: Title of the movie to find similar movies for.
        limit: Maximum number of recommendations to request (default 3).

    Returns:
        Whatever ``cypher_queries.find_movie_recommendations`` returns. The
        reporting code below treats it as a possibly empty sequence of
        mappings with ``title``, ``year``, ``rating``, ``director`` and
        ``shared_genres`` keys.
    """
    return cypher_queries.find_movie_recommendations(movie_title, limit)


def _print_recommendations(movie_title):
    """Print a numbered list of recommendations for *movie_title*.

    Prints an explicit notice when nothing is recommended, so the section
    header is never followed by silence.
    """
    print(f"\nMovies similar to '{movie_title}':")

    recommendations = recommend_movies_cypher(movie_title)
    if not recommendations:
        print(f"   No recommendations found for '{movie_title}'")
        return

    for i, rec in enumerate(recommendations, 1):
        print(f"   {i}. {rec['title']} ({rec['year']}) - Rating: {rec['rating']}")
        print(f"      Director: {rec['director']}, Shared genres: {rec['shared_genres']}")


# Every test movie goes through the same reporting path, so both get the
# "no recommendations" notice.
for test_movie in ("The Matrix", "Inception"):
    _print_recommendations(test_movie)

# Section 8: Advanced Analytics
print("\n\n8. ADVANCED ANALYTICS", "-" * 40, sep="\n")

print("Advanced Graph Analytics", "=" * 40, sep="\n")

# --- Genre pairs: which genres share movies ---
print("\nGenre Co-occurrence:")
genre_cooccurrence = cypher_queries.find_genre_network()

if genre_cooccurrence:
    cooccurrence_df = pd.DataFrame(genre_cooccurrence)
    pair_columns = ['genre1', 'genre2', 'movies_in_common']
    print("\nGenres that appear together:")
    print(cooccurrence_df[pair_columns])

# --- Directors: film count and average rating ---
print("\nDirector Analysis:")
# min_movies=1 is passed as the threshold; presumably it keeps every director
# with at least one movie - confirm in MovieCypherQueries.
prolific_directors = cypher_queries.find_prolific_directors(min_movies=1)

if prolific_directors:
    directors_df = pd.DataFrame(prolific_directors)
    director_columns = ['director', 'movie_count', 'average_rating']
    print("\nDirector productivity:")
    print(directors_df[director_columns])

# --- Decades: whatever columns the query returns are shown as-is ---
print("\nDecade Trends:")
decade_trends = cypher_queries.analyze_decade_trends()

if decade_trends:
    decades_df = pd.DataFrame(decade_trends)
    print("\nMovies by decade:")
    print(decades_df)

# Section 9: Interactive Query Playground
print("\n\n9. INTERACTIVE QUERY PLAYGROUND", "-" * 40, sep="\n")

print("Interactive Query Playground", "=" * 40, sep="\n")

# Hand-written SPARQL: per director name, the number of movies and the mean
# rating, ordered by movie count descending.
print("\nCustom SPARQL Query:")
custom_sparql = """
PREFIX movie: <http://movie-kg.org/ontology#>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>

SELECT ?director_name (COUNT(?movie) as ?movie_count) (AVG(?rating) as ?avg_rating)
WHERE {
    ?movie a movie:Movie .
    ?movie movie:hasRating ?rating .
    ?movie movie:directedBy ?director .
    ?director foaf:name ?director_name .
}
GROUP BY ?director_name
ORDER BY DESC(?movie_count)
"""

# Playground boundary: any failure (bad query, unexpected value) is reported
# and the script carries on with the next section.
try:
    results = list(rdf_builder.graph.query(custom_sparql))
    print("Results:")
    # Each row holds the three SELECT projections in order.
    for name_term, count_term, rating_term in results:
        director_name = str(name_term)
        n_movies = int(count_term)
        mean_rating = float(rating_term)
        print(f"   {director_name}: {n_movies} movies, avg rating: {mean_rating:.1f}")
except Exception as e:
    print(f"Error: {e}")

# Hand-written Cypher: per genre, the list of movie titles, their count and
# the rounded average rating, ordered by count descending.
print("\nCustom Cypher Query:")
custom_cypher = """
MATCH (m:Movie)-[:HAS_GENRE]->(g:Genre)
WITH g.name as genre, collect(m.title) as movies, avg(m.rating) as avg_rating
RETURN genre, size(movies) as movie_count, round(avg_rating, 1) as avg_rating, movies
ORDER BY movie_count DESC
"""

# Playground boundary: report any failure and keep going.
try:
    with neo4j_loader.driver.session() as session:
        records = session.run(custom_cypher)
        print("Results:")
        for record in records:
            genre_name = record['genre']
            n_movies = record['movie_count']
            mean_rating = record['avg_rating']
            titles = record['movies']
            print(f"   {genre_name}: {n_movies} movies, avg rating: {mean_rating}")
            # Show at most three titles, flagging truncation with an ellipsis.
            preview = ', '.join(titles[:3])
            suffix = '...' if len(titles) > 3 else ''
            print(f"      Movies: {preview}{suffix}")
except Exception as e:
    print(f"Error: {e}")

# Section 10: Cleanup
print("\n\n10. CLEANUP", "-" * 20, sep="\n")

print("Cleaning up resources...")

# Release the Neo4j loader created in Section 2.
if neo4j_loader:
    neo4j_loader.close()
    print("   Neo4j connection closed")

# Closing summary, emitted line by line in the order listed.
closing_lines = (
    "\nExploration complete!",
    "\nWhat you've learned:",
    "   • RDF triple-based knowledge representation",
    "   • Neo4j property graph modeling",
    "   • SPARQL semantic queries",
    "   • Cypher graph traversal",
    "   • Knowledge graph analytics",
    "   • Graph-based recommendations",
    "\nNext steps:",
    "   • Add more entities (actors, studios)",
    "   • Implement advanced algorithms",
    "   • Connect to external APIs",
    "   • Build web interfaces",
    "\n" + "=" * 50,
    "EXPLORATION SCRIPT COMPLETED",
    "=" * 50,
)
for closing_line in closing_lines:
    print(closing_line)