In [None]:
# Cell 1: Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Cell 2: Load Sample Data
# Each record is (description, scene_type, shot_count); the column-oriented
# dict handed to pandas is derived from these rows.
_scene_records = [
    ('A tense interrogation in a dark room', 'dialogue', 4),
    ('Two lovers walking on a beach at sunset', 'romantic', 5),
    ('A high-speed car chase through city streets', 'action', 8),
    ('A quiet conversation in a coffee shop', 'dialogue', 3),
]
_descriptions, _scene_types, _shot_counts = (
    list(column) for column in zip(*_scene_records)
)

scenes_data = {
    'description': _descriptions,
    'scene_type': _scene_types,
    'shot_count': _shot_counts,
}

# One row per scene, columns in the order declared above.
df = pd.DataFrame(data=scenes_data)
print(df)

# Cell 3: Text Vectorization
# Fit a TF-IDF vectorizer (vocabulary capped at 100 terms) on the scene
# descriptions and encode them as a document-term matrix in one step.
vectorizer = TfidfVectorizer(max_features=100)
X = vectorizer.fit_transform(df['description'])

# Quick sanity check: preview the first few learned terms and the matrix size.
_vocabulary = vectorizer.get_feature_names_out()
print("Feature names:", _vocabulary[:10])
print("Vector shape:", X.shape)

# Cell 4: Scene Clustering
# Group scenes by the similarity of their TF-IDF description vectors and
# store the assigned cluster id on each row of df.

# KMeans rejects a cluster count larger than the number of samples, so cap
# the requested 3 clusters at the number of rows in X.
n_clusters = min(3, X.shape[0])

# n_init is passed explicitly: its default changed across scikit-learn
# releases (10 -> 'auto'), which triggers a FutureWarning on some versions
# and can change the fitted clusters between environments.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
df['cluster'] = kmeans.fit_predict(X)

plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x=range(len(df)),
    y='shot_count',
    hue='cluster',
    # Cluster ids are labels, not magnitudes: a qualitative palette keeps
    # seaborn from drawing them on a sequential colour ramp.
    palette='deep',
    s=200,
)
plt.title('Scene Clusters by Shot Count')
plt.xlabel('Scene Index')
plt.ylabel('Number of Shots')
plt.show()

# Cell 5: Feature Importance Analysis
# Score each vocabulary term by the mean absolute weight it carries across
# the fitted cluster centroids.
feature_importance = np.mean(np.abs(kmeans.cluster_centers_), axis=0)
feature_names = vectorizer.get_feature_names_out()

# A stable argsort on the negated scores ranks terms high-to-low while
# keeping tied terms in their original vocabulary order.
_ranking = np.argsort(-feature_importance, kind='stable')[:10]
top_features = [(feature_names[i], feature_importance[i]) for i in _ranking]

print("\nTop 10 Most Important Features:")
for feature, importance in top_features:
    print(f"{feature}: {importance:.4f}")

# Cell 6: Visualization
# Horizontal bar chart of the top-ranked terms and their importance scores.
plt.figure(figsize=(12, 6))
# Split the (term, score) pairs into parallel sequences for plotting.
features, importances = zip(*top_features)
plt.barh(features, importances)
# barh places the first entry at the bottom; flip the y axis so the
# highest-ranked term is at the top and the ranking reads top-down.
plt.gca().invert_yaxis()
plt.xlabel('Importance Score')
plt.title('Top Features for Scene Analysis')
plt.tight_layout()
plt.show()

# Cell 7: Save Model
# Persist the fitted vectorizer and KMeans model so they can be reloaded
# together later.
import os

import joblib

# joblib.dump does not create missing parent directories; make sure the
# target folder exists so a fresh checkout does not fail with
# FileNotFoundError.
os.makedirs('../models', exist_ok=True)

# Save vectorizer and model
joblib.dump(vectorizer, '../models/vectorizer.pkl')
joblib.dump(kmeans, '../models/scene_classifier.pkl')
print("Models saved successfully!")