In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
try:
    from xgboost import XGBClassifier
except ImportError:
    XGBClassifier = None


In [47]:
# --- TASK 1: DATA CLEANING ---
def load_and_clean(path):
    """Read the song CSV at ``path`` and return a cleaned DataFrame.

    Cleaning steps, in order:
      1. keep only the first row for each (artist_name, track_name) pair,
      2. remove the columns that are not used downstream,
      3. discard every row that still contains a missing value.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame; the original row index labels are kept.
    """
    # Author's rationale: key/mode/time_sig are too technical; popularity is
    # social, not acoustic.
    unwanted_columns = ['track_id', 'key', 'mode', 'time_signature', 'duration_ms', 'popularity']

    return (
        pd.read_csv(path)
        .drop_duplicates(subset=['artist_name', 'track_name'])
        .drop(columns=unwanted_columns)
        .dropna()
    )


In [48]:
# --- TASK 3: FEATURE ENGINEERING (MOOD LABELS) ---
def assign_mood(row):
    """Map one song's audio features to a mood label using fixed threshold rules.

    ``row`` must support lookup by the keys 'valence', 'energy', 'tempo' and
    'acousticness'. The rules are checked from top to bottom and the first
    one that matches decides the label, so their order matters (for example,
    a song that satisfies both the 'Sad' and the 'Calm' rule is labelled
    'Sad'). Songs that match no rule are labelled 'Chill'.

    Returns one of: 'Energetic', 'Happy', 'Sad', 'Calm', 'Romantic', 'Chill'.
    """
    valence = row['valence']
    energy = row['energy']
    tempo = row['tempo']
    acousticness = row['acousticness']

    if energy > 0.7 and tempo > 130:
        mood = 'Energetic'
    elif valence > 0.7 and energy > 0.5:
        mood = 'Happy'
    elif valence < 0.3 and energy < 0.3:
        mood = 'Sad'
    elif acousticness > 0.7 and energy < 0.3:
        mood = 'Calm'
    elif acousticness > 0.5 and valence > 0.4 and tempo < 110:
        mood = 'Romantic'
    else:
        # Balanced middle-ground: nothing above matched
        mood = 'Chill'

    return mood


In [49]:
# --- TASK 7: RECOMMENDATION LOGIC ---
def recommend_songs(df, mood_input, platform='reels', top_n=5):
    """Return up to ``top_n`` songs with the requested mood, ranked for a platform.

    Parameters
    ----------
    df : pandas.DataFrame
        Songs with at least the columns 'mood', 'artist_name', 'track_name'
        and whichever feature columns the chosen platform ranks by.
    mood_input : str
        Mood label to keep (compared for equality against ``df['mood']``).
    platform : str, default 'reels'
        Chooses the ranking; any other value leaves the rows in their
        original order.
    top_n : int, default 5
        Maximum number of rows returned.

    Returns
    -------
    pandas.DataFrame
        Columns 'artist_name' and 'track_name' of the best-ranked rows.
    """
    # Ranking column(s) per platform, always sorted from highest to lowest:
    #   reels   -> high energy and tempo for fast-paced content
    #   posts   -> good danceability for general aesthetic vibes
    #   stories -> higher acousticness for intimate/calm updates
    ranking_by_platform = {
        'reels': ['energy', 'tempo'],
        'posts': 'danceability',
        'stories': 'acousticness',
    }

    candidates = df.loc[df['mood'] == mood_input].copy()

    ranking = ranking_by_platform.get(platform)
    if ranking is not None:
        candidates = candidates.sort_values(by=ranking, ascending=False)

    return candidates.loc[:, ['artist_name', 'track_name']].head(top_n)


In [50]:
# --- MAIN EXECUTION PIPELINE ---
# Note: run_pipeline works on an already-loaded DataFrame; the CSV path is passed to load_and_clean further down in this cell.
def run_pipeline(df):
    """Label songs with rule-based moods, then train and evaluate a Random Forest.

    Steps: assign a mood to every row with ``assign_mood``, encode the mood
    labels as integers, split 80/20 into train and test sets, standardise the
    features (scaler fitted on the training split only), fit the classifier,
    and print the accuracy and classification report for the test split.

    Side effect: a 'mood' column is added to (or overwritten on) ``df`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the six feature columns listed below; ``assign_mood``
        reads 'valence', 'energy', 'tempo' and 'acousticness' from each row.

    Returns
    -------
    tuple
        ``(fitted RandomForestClassifier, fitted StandardScaler, fitted LabelEncoder)``.
    """
    # 1. Label Generation (written onto the caller's frame)
    df['mood'] = df.apply(assign_mood, axis=1)

    # 2. Preprocessing
    # NOTE: the mood labels are computed by assign_mood from four of these
    # same features, so the reported accuracy largely measures how well the
    # forest reproduces those threshold rules.
    feature_columns = ['energy', 'valence', 'danceability', 'tempo', 'acousticness', 'loudness']

    le = LabelEncoder()
    mood_codes = le.fit_transform(df['mood'])

    train_features, test_features, train_codes, test_codes = train_test_split(
        df[feature_columns], mood_codes, test_size=0.2, random_state=42
    )

    # Fit the scaler on the training split only, then reuse it for the test split
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(train_features)
    test_scaled = scaler.transform(test_features)

    # 3. Model Training & Selection
    # Using Random Forest as it handles non-linear mood boundaries effectively
    forest = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
    forest.fit(train_scaled, train_codes)

    # 4. Evaluation
    predicted_codes = forest.predict(test_scaled)
    accuracy = accuracy_score(test_codes, predicted_codes)
    report = classification_report(test_codes, predicted_codes, target_names=le.classes_)
    print(f"Model Accuracy: {accuracy:.2%}")
    print("\nClassification Report:\n", report)

    return forest, scaler, le

# Visualizing Mood Distribution
# ![Mood Distribution](image_png_0_20260102_054859_264647.png)

# Feature Importance Insights
# ![Feature Importance](image_png_0_20260102_054859_366938.png)

# Load the raw CSV and clean it: load_and_clean de-duplicates songs, drops the
# unused columns and removes rows with missing values.
# NOTE(review): absolute '/content/...' path — presumably a Colab upload
# location; adjust it when running elsewhere.
df_spotify = load_and_clean('/content/SpotifyAudioFeaturesApril2019.csv')

# Train and evaluate the mood classifier. run_pipeline also writes a 'mood'
# column onto df_spotify in place; recommend_songs filters on that column, so
# this line must run before any recommendation is requested.
model, scaler, label_encoder = run_pipeline(df_spotify)


Model Accuracy: 99.98%

Classification Report:
               precision    recall  f1-score   support

        Calm       1.00      1.00      1.00      1279
       Chill       1.00      1.00      1.00     14303
   Energetic       1.00      1.00      1.00      3654
       Happy       1.00      1.00      1.00      3052
    Romantic       1.00      0.99      1.00       858
         Sad       1.00      1.00      1.00      2987

    accuracy                           1.00     26133
   macro avg       1.00      1.00      1.00     26133
weighted avg       1.00      1.00      1.00     26133



In [None]:
import numpy as np

# Features the user must supply, in this order. The list has to match the
# feature columns the scaler and model were trained on in run_pipeline.
feature_names = ['energy', 'valence', 'danceability', 'tempo', 'acousticness', 'loudness']

# How many songs to recommend for the predicted mood
top_n = 5

# Ask for all values on one line, separated by whitespace
print(f"Enter the following {len(feature_names)} features separated by space:")
print("Features order:", ", ".join(feature_names))
user_input = input("Enter values: ")

# Parse and validate the input. Every problem is reported as a message
# instead of aborting the cell with a traceback.
input_error = None
try:
    user_values = [float(token) for token in user_input.split()]
except ValueError:
    user_values = []
    input_error = "Error: All values must be numbers."
else:
    if len(user_values) != len(feature_names):
        input_error = f"Error: Please enter exactly {len(feature_names)} values."
    elif not np.isfinite(user_values).all():
        # 'nan' and 'inf' parse as floats but are not usable feature values
        input_error = "Error: Values must be finite numbers."

if input_error is not None:
    print(input_error)
else:
    # Single-row array of the entered values
    input_array = np.array(user_values).reshape(1, -1)

    # The scaler was fitted on a DataFrame, so hand it a DataFrame with the
    # same column names: this avoids scikit-learn's feature-name warning and
    # lets it verify that the columns line up with the training data.
    input_frame = pd.DataFrame(input_array, columns=feature_names)

    # Scale using the same scaler used during training
    input_array_scaled = scaler.transform(input_frame)

    # Predict mood and map the integer class back to its label
    mood_encoded = model.predict(input_array_scaled)
    mood = label_encoder.inverse_transform(mood_encoded)[0]

    print(f"\nPredicted Mood: {mood}")

    # Recommend the top songs for this mood (requires the 'mood' column that
    # run_pipeline added to df_spotify)
    print(f"\nTop {top_n} Recommended Songs for this Mood:")
    recommendations = recommend_songs(df_spotify, mood_input=mood, platform='reels', top_n=top_n)
    if recommendations.empty:
        print("No songs found for this mood.")
    for song in recommendations.itertuples(index=False):
        print(f"{song.track_name} by {song.artist_name}")



Enter the following 6 features separated by space:
Features order: energy, valence, danceability, tempo, acousticness, loudness
