<a href="https://colab.research.google.com/github/lmmddb/Spotify-Wrapped-/blob/main/main_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

CODE FINAL !

In [9]:
"""
SPOTIFY WRAPPED 2025 - ANALYSE COMPL√àTE AUTOMATIS√âE
===================================================
Ce script charge vos donn√©es Spotify, les nettoie, et g√©n√®re tous les tops.
Compatible Google Colab et local.
"""

import os
import json
import pandas as pd
import requests
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from google.colab import drive, files
import ipywidgets as widgets
from IPython.display import display, clear_output

# ==================== CONFIGURATION ====================
# Last.fm API key. It is read from the LASTFM_API_KEY environment variable when
# that is set; the literal below is only a fallback so existing notebooks keep
# working unchanged.
# NOTE(review): this fallback key is stored in clear text in the notebook --
# consider revoking it and supplying your own key through the environment.
LASTFM_API_KEY = os.environ.get('LASTFM_API_KEY', '83100e2042c7a645ac9b98747ec29b76')
YEAR_TO_ANALYZE = 2025  # Calendar year kept by clean_data()
MIN_SECONDS_PLAYED = 15  # Plays shorter than this (in seconds) are treated as skips
TOP_N_ALBUMS = 500  # Number of top songs for which an album name is looked up

# ==================== 1. CHARGEMENT DES DONN√âES ====================

def load_spotify_files(source='drive', folder_path=None, pattern="StreamingHistory_music_"):
    """Load Spotify streaming-history JSON exports into a single DataFrame.

    Args:
        source: Where the files come from.
            'drive'  -- mount Google Drive, then read from ``folder_path``
                        (default '/content/drive/MyDrive/Spotify').
            'local'  -- read from ``folder_path`` on the local filesystem
                        without touching Google Drive (default '.').
            'upload' -- open the Colab upload dialog.
            Any other value loads nothing.
        folder_path: Folder to scan for 'drive' and 'local'; ignored for 'upload'.
        pattern: Prefix a file name must start with; it must also end in ".json".

    Returns:
        The concatenation of every file that parsed successfully, or an empty
        DataFrame when the folder is missing or nothing could be loaded.
    """
    dataframes = []

    def is_export(name):
        return name.startswith(pattern) and name.endswith(".json")

    if source in ('drive', 'local'):
        if source == 'drive':
            drive.mount('/content/drive')
            if folder_path is None:
                folder_path = '/content/drive/MyDrive/Spotify'
        elif folder_path is None:
            folder_path = '.'

        try:
            for filename in sorted(filter(is_export, os.listdir(folder_path))):
                with open(os.path.join(folder_path, filename), 'rb') as file:
                    raw = file.read()
                df = _read_spotify_json(raw, filename)
                if df is not None:
                    dataframes.append(df)
        except (FileNotFoundError, NotADirectoryError):
            print(f"‚ö† Dossier '{folder_path}' introuvable")
            return pd.DataFrame()

    elif source == 'upload':
        print("üìÇ S√©lectionnez vos fichiers JSON Spotify...")
        uploaded = files.upload()

        for filename in sorted(uploaded.keys()):
            if not is_export(filename):
                continue
            # Keep a copy of the upload in the working directory, as before.
            with open(filename, 'wb') as f:
                f.write(uploaded[filename])
            df = _read_spotify_json(uploaded[filename], filename)
            if df is not None:
                dataframes.append(df)

    if dataframes:
        combined = pd.concat(dataframes, ignore_index=True)
        print(f"\nüìä Total: {len(combined)} enregistrements")
        return combined

    print("‚ö† Aucun fichier charg√©")
    return pd.DataFrame()


def _read_spotify_json(raw, label):
    """Parse the raw bytes of one export; return a DataFrame, or None if unreadable.

    A file that is not valid UTF-8 or not valid JSON is reported and skipped so
    that one bad file does not abort the whole load.
    """
    try:
        data = json.loads(raw.decode('utf-8'))
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        print(f"‚úó Erreur dans {label}: {e}")
        return None

    df = pd.json_normalize(data)
    print(f"‚úì {label} charg√© ({len(df)} lignes)")
    return df


# Loading interface
def create_loading_interface():
    """Display a small widget form used to load the Spotify data.

    The form offers a source selector (Google Drive or local upload), a text
    field for the Drive folder and a load button. Clicking the button stores
    the loaded DataFrame in the module-level ``streaming`` variable and shows
    its first rows in the form's output area.
    """
    global streaming
    streaming = None

    output = widgets.Output()

    load_button = widgets.Button(
        description='üì• Charger les donn√©es',
        button_style='success',
        icon='check'
    )

    path_input = widgets.Text(
        value='/content/drive/MyDrive/Spotify',
        description='Chemin Drive :',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='400px')
    )

    source_dropdown = widgets.Dropdown(
        options=[('üìÅ Google Drive', 'drive'), ('üíæ Upload local', 'upload')],
        value='drive',
        description='Source :',
        style={'description_width': 'initial'}
    )

    def on_source_change(change):
        # The Drive path field only makes sense for the Drive source.
        path_input.layout.display = 'flex' if change['new'] == 'drive' else 'none'

    def on_load_button_clicked(b):
        global streaming
        with output:
            clear_output()

            if source_dropdown.value == 'drive':
                streaming = load_spotify_files(source='drive', folder_path=path_input.value)
            else:
                streaming = load_spotify_files(source='upload')

            if streaming.empty:
                return

            separator = "=" * 50
            print("\n" + separator)
            print("APER√áU DES DONN√âES :")
            print(separator)
            display(streaming.head())

    load_button.on_click(on_load_button_clicked)
    source_dropdown.observe(on_source_change, names='value')

    header = widgets.HTML("<h3>üéµ Chargement des donn√©es Spotify</h3>")
    display(widgets.VBox([header, source_dropdown, path_input, load_button, output]))


# ==================== 2. NETTOYAGE DES DONN√âES ====================

def clean_data(df, year=YEAR_TO_ANALYZE, min_seconds=MIN_SECONDS_PLAYED):
    """Keep the plays of one calendar year that lasted long enough to count.

    The input DataFrame is not modified: the date conversion is done on a
    separate Series and only written into the returned copy.

    Args:
        df: Raw listening history with at least the columns 'endTime'
            (anything ``pd.to_datetime`` accepts) and 'msPlayed' (milliseconds).
        year: Calendar year to keep, matched against 'endTime'.
        min_seconds: Plays shorter than this many seconds are dropped as skips.

    Returns:
        A new DataFrame with 'endTime' converted to datetime and an added
        'seconds_played' column, restricted to the requested year and to plays
        of at least ``min_seconds``.
    """
    print("\nüßπ NETTOYAGE DES DONN√âES")
    print("="*50)

    # Convert the dates without writing back into the caller's DataFrame
    end_time = pd.to_datetime(df["endTime"])

    # Keep the requested year only
    in_year = end_time.dt.year == year
    df_year = df.loc[in_year].copy()
    df_year["endTime"] = end_time[in_year]
    print(f"‚úì Filtr√© pour l'ann√©e {year}: {len(df_year)} √©coutes")

    # Play duration in seconds
    df_year['seconds_played'] = df_year['msPlayed'] / 1000

    # Drop the skips
    df_filtered = df_year[df_year['seconds_played'] >= min_seconds].copy()
    print(f"‚úì √âcoutes < {min_seconds}s supprim√©es: {len(df_filtered)} restantes")

    return df_filtered


def detect_and_remove_ambient(df, save_path='/content/drive/MyDrive/Spotify'):
    """Flag likely ambient/background tracks and remove the ones the user rejected.

    A play is "suspect" when its track or artist name contains one of the
    keywords below (plain substring match, case-insensitive, so false positives
    such as "train" matching "rain" are expected -- hence the manual review).
    The suspects are exported to 'suspects_to_review.csv' in ``save_path``. If
    an edited copy with a 'supprimer' column exists, every track marked 'oui'
    is removed from the data.

    The edited copy is looked up under the name given in the printed
    instructions ('suspects_to_review_edited.csv') first, then under the
    legacy name 'suspects_to_review1.csv'.

    The input DataFrame is never modified.

    Args:
        df: Listening history with 'trackName', 'artistName' and 'msPlayed'.
        save_path: Folder where the review CSV is written and the edited copy
            is looked up.

    Returns:
        ``df`` itself when nothing was removed, otherwise a filtered copy.
    """
    print("\nüîç D√âTECTION DES SONS D'AMBIANCE")
    print("="*50)

    keywords = [
        'rain', 'thunder', 'ocean', 'water', 'nature', 'forest', 'bird',
        'white noise', 'pink noise', 'asmr', 'binaural', 'meditation',
        'lofi', 'lo-fi', 'chill beats', 'study beats', 'frequency', 'hz'
    ]

    # "<track> <artist>" in lower case, one entry per play. Kept in a separate
    # Series so no helper column is ever added to the caller's DataFrame.
    text = (df['trackName'].astype(str) + ' ' + df['artistName'].astype(str)).str.lower()
    is_suspect = text.apply(lambda t: any(keyword in t for keyword in keywords)).astype(bool)

    if not is_suspect.any():
        print(f"üîç {0} pistes suspectes d√©tect√©es")
        return df

    suspects = (
        df[is_suspect]
        .groupby(['trackName', 'artistName'])
        .agg(nb_ecoutes=('msPlayed', 'count'), temps_total_ms=('msPlayed', 'sum'))
        .reset_index()
    )
    suspects['temps_total_min'] = suspects['temps_total_ms'] / 60000
    suspects = suspects.sort_values('nb_ecoutes', ascending=False)

    print(f"üîç {len(suspects)} pistes suspectes d√©tect√©es")
    display(suspects)

    # Export for manual review
    suspects_file = os.path.join(save_path, 'suspects_to_review.csv')
    suspects.to_csv(suspects_file, index=False)
    print(f"\nüíæ Liste export√©e: {suspects_file}")
    print("üìù Instructions:")
    print("   1. Ouvrez le fichier CSV")
    print("   2. Ajoutez une colonne 'supprimer' avec 'oui' ou 'non'")
    print("   3. Sauvegardez-le sous 'suspects_to_review_edited.csv'")

    # Accept the documented file name first, then the legacy one.
    candidates = [
        os.path.join(save_path, name)
        for name in ('suspects_to_review_edited.csv', 'suspects_to_review1.csv')
    ]
    edited_file = next((path for path in candidates if os.path.exists(path)), None)
    if edited_file is None:
        print(f"‚ö† Fichier √©dit√© non trouv√©: {candidates[0]}")
        return df

    # Read names as text so a purely numeric title still matches the data.
    choix = pd.read_csv(edited_file, dtype={'trackName': str, 'artistName': str})
    if 'supprimer' not in choix.columns:
        print("‚ö† Colonne 'supprimer' non trouv√©e dans le fichier √©dit√©")
        return df

    # astype(str) keeps this working when the column is empty or all-NaN.
    flagged = choix['supprimer'].astype(str).str.strip().str.lower() == 'oui'
    to_remove = set(zip(choix.loc[flagged, 'trackName'], choix.loc[flagged, 'artistName']))

    keep = pd.Series(
        [key not in to_remove for key in zip(df['trackName'], df['artistName'])],
        index=df.index,
        dtype=bool,
    )

    before = len(df)
    df = df.loc[keep].copy()
    after = len(df)

    print(f"\n‚úÖ Suppression effectu√©e:")
    print(f"   Avant: {before} √©coutes")
    print(f"   Apr√®s: {after} √©coutes")
    print(f"   Supprim√©es: {before - after} √©coutes")

    return df


# ==================== 3. R√âCUP√âRATION GENRES & ALBUMS ====================

def get_artist_genres_lastfm(artist_name):
    """Look up the top three Last.fm tags of an artist (used as its genres).

    This is a best-effort lookup: network errors, invalid JSON and unexpected
    response shapes all yield an empty genre list instead of an exception, so
    it is safe to run in a thread pool.

    Args:
        artist_name: Artist name as it appears in the Spotify export.

    Returns:
        A tuple ``(artist_name, genres)`` where ``genres`` is a list of at most
        three tag names, possibly empty.
    """
    url = "http://ws.audioscrobbler.com/2.0/"
    params = {
        'method': 'artist.gettoptags',
        'artist': artist_name,
        'api_key': LASTFM_API_KEY,
        'format': 'json'
    }

    try:
        response = requests.get(url, params=params, timeout=5)
        data = response.json()
        time.sleep(0.2)  # small pause after each call to stay gentle on the API

        toptags = data.get('toptags') if isinstance(data, dict) else None
        tags = toptags.get('tag', []) if isinstance(toptags, dict) else []
        # NOTE(review): Last.fm appears to return a bare object instead of a
        # one-element list when there is a single tag -- confirm against the API.
        if isinstance(tags, dict):
            tags = [tags]

        return artist_name, [tag['name'] for tag in tags[:3]]
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit and
        # genuine programming errors are no longer swallowed.
        return artist_name, []


def get_track_album_lastfm(track_name, artist_name):
    """Find an album name for a track on Last.fm, trying two lookups in turn.

    1. ``track.getInfo``: the album the track itself is listed on.
    2. ``artist.gettopalbums``: fallback to the artist's first top album.
       NOTE(review): this fallback can attribute a track to an album it is not
       on; it is kept because the original behaviour relies on it.

    Both attempts are best-effort: a network error, invalid JSON or an
    unexpected response shape simply moves on to the next step.

    Args:
        track_name: Track title as it appears in the Spotify export.
        artist_name: Artist name as it appears in the Spotify export.

    Returns:
        A tuple ``((track_name, artist_name), album_name)``; ``album_name`` is
        ``None`` when neither lookup produced a result.
    """
    url = "http://ws.audioscrobbler.com/2.0/"
    key = (track_name, artist_name)
    # Narrowed from bare ``except:`` so KeyboardInterrupt/SystemExit and
    # genuine programming errors are no longer swallowed.
    lookup_errors = (requests.RequestException, ValueError, KeyError, TypeError, IndexError)

    # Attempt 1: track.getInfo
    params1 = {
        'method': 'track.getInfo',
        'artist': artist_name,
        'track': track_name,
        'api_key': LASTFM_API_KEY,
        'format': 'json'
    }
    try:
        response1 = requests.get(url, params=params1, timeout=5)
        data1 = response1.json()
        time.sleep(0.2)  # small pause after each call to stay gentle on the API

        if 'track' in data1 and 'album' in data1['track']:
            return key, data1['track']['album']['title']
    except lookup_errors:
        pass  # fall through to the second attempt

    # Attempt 2: artist.getTopAlbums
    params2 = {
        'method': 'artist.gettopalbums',
        'artist': artist_name,
        'api_key': LASTFM_API_KEY,
        'format': 'json',
        'limit': 10
    }
    try:
        response2 = requests.get(url, params=params2, timeout=5)
        data2 = response2.json()
        time.sleep(0.2)

        if 'topalbums' in data2 and 'album' in data2['topalbums']:
            albums = data2['topalbums']['album']
            # NOTE(review): a single album may come back as a bare object
            # rather than a list -- confirm against the API.
            if isinstance(albums, dict):
                albums = [albums]
            if albums:
                return key, albums[0]['name']
    except lookup_errors:
        pass  # no album could be determined

    return key, None


def fetch_genres_and_albums(df, top_n=TOP_N_ALBUMS):
    """Aggregate plays per song and enrich them with Last.fm genres and albums.

    Genres are fetched for every artist in ``df``; album names only for the
    ``top_n`` songs with the most listening time. Both lookups run in thread
    pools. The input DataFrame is not modified.

    Args:
        df: Cleaned listening history with 'trackName', 'artistName' and
            'msPlayed'.
        top_n: Number of top songs (by total 'msPlayed') to look up albums for.

    Returns:
        One row per (trackName, artistName), sorted by total 'msPlayed'
        descending, with the columns 'msPlayed' (sum), 'duration_calculated'
        (mean 'msPlayed' of the song's plays, used as its duration estimate),
        'Genres' (list of tag names) and 'Album Name' (None when unknown).
    """
    print("\nüé∏ R√âCUP√âRATION GENRES & ALBUMS")
    print("="*50)

    # One row per song: total time played and mean play length. Computing the
    # mean inside the aggregation avoids adding a column to the caller's frame.
    top_songs = (
        df.groupby(["trackName", "artistName"])
        .agg(msPlayed=('msPlayed', 'sum'), duration_calculated=('msPlayed', 'mean'))
        .reset_index()
        .sort_values('msPlayed', ascending=False)
    )

    # GENRES
    unique_artists = [a for a in df['artistName'].unique() if pd.notna(a)]
    artist_genres_dict = {}

    print(f"üìÄ Traitement de {len(unique_artists)} artistes...")

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = {executor.submit(get_artist_genres_lastfm, artist): artist for artist in unique_artists}

        for done, future in enumerate(as_completed(futures), start=1):
            try:
                artist_name, genres = future.result()
            except Exception:
                # One failed lookup must not abort the whole batch.
                artist_name, genres = futures[future], []
            artist_genres_dict[artist_name] = genres

            if done % 50 == 0:
                print(f"   Progression: {done}/{len(unique_artists)}")

    print("‚úì Genres termin√©s!")

    # ALBUMS
    top_songs_subset = top_songs.head(top_n)
    unique_tracks = [
        (track, artist)
        for track, artist in zip(top_songs_subset['trackName'], top_songs_subset['artistName'])
        if pd.notna(track) and pd.notna(artist)
    ]

    track_album_dict = {}

    print(f"\nüíø Traitement de {len(unique_tracks)} chansons...")

    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = {executor.submit(get_track_album_lastfm, track, artist): (track, artist)
                   for track, artist in unique_tracks}

        for done, future in enumerate(as_completed(futures), start=1):
            try:
                track_artist_key, album_name = future.result()
            except Exception:
                # One failed lookup must not abort the whole batch.
                track_artist_key, album_name = futures[future], None
            track_album_dict[track_artist_key] = album_name

            if done % 50 == 0:
                print(f"   Progression: {done}/{len(unique_tracks)}")

    print("‚úì Albums termin√©s!")

    # Attach the metadata. A plain list is used for the albums because a
    # row-wise apply does not return a column when the frame is empty.
    top_songs['Genres'] = top_songs['artistName'].map(artist_genres_dict)
    top_songs['Album Name'] = [
        track_album_dict.get(key)
        for key in zip(top_songs['trackName'], top_songs['artistName'])
    ]

    return top_songs


# ==================== 4. G√âN√âRATION DES TOPS ====================

def generate_tops(songs_df, top_n=10):
    """Build the top songs, artists, albums and genres rankings.

    The number of listens of a song is its total 'msPlayed' divided by its
    'duration_calculated'. The input DataFrame is not modified.

    Args:
        songs_df: One row per song with the columns 'trackName', 'artistName',
            'msPlayed', 'duration_calculated', 'Genres' (list of tag names or
            missing) and 'Album Name' (string or missing).
        top_n: Number of rows kept in each ranking (default 10, the value that
            was previously hard-coded).

    Returns:
        A dict with the DataFrames 'songs', 'artists', 'albums' and 'genres'
        (each limited to ``top_n`` rows) and 'all_data' (every song, sorted by
        number of listens, descending).
    """
    print("\nüèÜ G√âN√âRATION DES TOPS")
    print("="*50)

    listens = "No. of times listened"

    # rename() returns a new frame, so the caller's DataFrame stays untouched
    songs_df = songs_df.rename(columns={
        'trackName': 'Song Name',
        'artistName': 'Artist Name',
        'duration_calculated': 'Duration (ms)'
    })

    songs_df[listens] = songs_df["msPlayed"] / songs_df["Duration (ms)"]
    ranked = songs_df.sort_values(listens, ascending=False)

    # TOP SONGS
    top_songs = ranked[['Song Name', 'Artist Name', listens]].head(top_n)

    # TOP ARTISTS
    top_artists = (
        ranked.groupby(["Artist Name"])[listens]
        .sum()
        .sort_values(ascending=False)
        .reset_index()
        .head(top_n)
    )

    # TOP ALBUMS (only songs for which an album name was found)
    songs_with_albums = ranked[ranked['Album Name'].notna()].copy()
    if len(songs_with_albums) > 0:
        top_albums = (
            songs_with_albums.groupby(["Album Name", "Artist Name"])[listens]
            .sum()
            .sort_values(ascending=False)
            .reset_index()
            .head(top_n)
        )
    else:
        top_albums = pd.DataFrame(columns=["Album Name", "Artist Name", listens])

    # TOP GENRES: one row per (song, genre); a song's listens count for each
    # of its genres.
    songs_with_genres = ranked[ranked['Genres'].notna()].copy()
    songs_with_genres['Genres'] = songs_with_genres['Genres'].apply(
        lambda x: x if isinstance(x, list) else []
    )

    songs_exploded = songs_with_genres.explode('Genres')
    # An empty genre list explodes to a missing value; drop those rows
    songs_exploded = songs_exploded[songs_exploded['Genres'].notna()].copy()

    if not songs_exploded.empty:
        songs_exploded['Genres_normalized'] = (
            songs_exploded['Genres'].astype(str).str.lower().str.strip()
        )

        # Fold close variants into one genre; anything else is kept as is
        genre_mapping = {
            'rap': 'rap', 'hip hop': 'rap', 'hip-hop': 'rap', 'trap': 'rap',
            'pop': 'pop', 'electropop': 'pop', 'dance pop': 'pop',
            'rock': 'rock', 'alternative rock': 'rock', 'indie rock': 'rock',
            'r&b': 'r&b', 'rnb': 'r&b',
            'electronic': 'electronic', 'edm': 'electronic', 'techno': 'electronic',
            'french': 'french', 'france': 'french',
            'soul': 'soul', 'jazz': 'jazz', 'metal': 'metal'
        }

        songs_exploded['Genres_cleaned'] = songs_exploded['Genres_normalized'].map(
            lambda x: genre_mapping.get(x, x)
        )

        top_genres = (
            songs_exploded.groupby("Genres_cleaned")[listens]
            .sum()
            .sort_values(ascending=False)
            .reset_index()
            .head(top_n)
        )
        top_genres.columns = ["Genre", "Total Listens"]
        top_genres['Genre'] = top_genres['Genre'].str.title()
    else:
        top_genres = pd.DataFrame(columns=["Genre", "Total Listens"])

    return {
        'songs': top_songs,
        'artists': top_artists,
        'albums': top_albums,
        'genres': top_genres,
        'all_data': ranked
    }


def display_results(tops):
    """Print the songs, artists, albums and genres rankings as text tables.

    Args:
        tops: Dict with the DataFrames 'songs', 'artists', 'albums' and
            'genres'. An empty 'albums' table is replaced by a short message.
    """
    rule = "=" * 60

    def banner(title):
        # Blank line, rule, title, rule
        print("\n" + rule)
        print(title)
        print(rule)

    banner("üéµ TOP 10 CHANSONS")
    print(tops['songs'].to_string(index=False))

    banner("üé§ TOP 10 ARTISTES")
    print(tops['artists'].to_string(index=False))

    banner("üíø TOP 10 ALBUMS")
    albums = tops['albums']
    if albums.empty:
        print("Aucun album trouv√©")
    else:
        print(albums.to_string(index=False))

    banner("üé∏ TOP 10 GENRES")
    print(tops['genres'].to_string(index=False))

# ==================== 5. STATISTIQUES TEMPORELLES ====================

def calculate_temporal_stats(df):
    """Compute listening-time statistics over time.

    Covers total listening time, covered period, daily average, and listening
    time per month, per part of the day, per hour and per weekday. Hours and
    weekdays are taken from 'endTime' as stored.
    NOTE(review): no timezone conversion is applied here -- confirm whether the
    export's timestamps are local time or UTC.

    Args:
        df: Cleaned DataFrame with 'msPlayed' (milliseconds) and 'endTime'
            (datetime). It is copied and left unmodified.

    Returns:
        dict: JSON-friendly summary with the keys 'total_hours', 'total_days',
        'avg_mins_per_day', 'avg_hours_per_day', 'date_range', 'monthly',
        'top_month', 'time_buckets', 'top_time_bucket', 'hourly', 'peak_hour',
        'weekday' and 'top_weekday'.

    Raises:
        ValueError: If ``df`` has no rows (there is nothing to summarise).
    """
    print("\n‚è∞ CALCUL DES STATISTIQUES TEMPORELLES")
    print("="*50)

    if df.empty:
        raise ValueError("calculate_temporal_stats: no listening data to analyse (empty DataFrame)")

    # Work on a copy so the helper columns do not leak into the caller's frame
    streaming = df.copy()
    total_ms = streaming['msPlayed'].sum()

    # 1. TOTAL LISTENING TIME
    total_time_hours = total_ms / 3600000
    print(f"‚úì Temps total : {total_time_hours:.2f} heures")

    # 2. COVERED PERIOD
    date_min = streaming['endTime'].min()
    date_max = streaming['endTime'].max()
    total_days = (pd.Timestamp(date_max) - pd.Timestamp(date_min)).days
    print(f"‚úì P√©riode : du {date_min} au {date_max} ({total_days} jours)")

    # 3. DAILY AVERAGE
    # When every play falls within 24 hours, total_days is 0; average over one
    # day in that case instead of dividing by zero.
    avg_ms_per_day = total_ms / max(total_days, 1)
    avg_mins_per_day = avg_ms_per_day / 60000
    avg_hours_per_day = avg_mins_per_day / 60
    print(f"‚úì Moyenne par jour : {avg_mins_per_day:.2f} minutes ({avg_hours_per_day:.2f} heures)")

    # 4. PER MONTH
    streaming['month'] = streaming['endTime'].dt.month
    monthly_listening = streaming.groupby('month')['msPlayed'].sum().reset_index()
    monthly_listening['hours_listened'] = monthly_listening['msPlayed'] / 3600000
    monthly_listening = monthly_listening.sort_values('hours_listened', ascending=False)

    month_names = {
        1: 'Janvier', 2: 'F√©vrier', 3: 'Mars', 4: 'Avril',
        5: 'Mai', 6: 'Juin', 7: 'Juillet', 8: 'Ao√ªt',
        9: 'Septembre', 10: 'Octobre', 11: 'Novembre', 12: 'D√©cembre'
    }
    monthly_listening['month_name'] = monthly_listening['month'].map(month_names)

    top_month = monthly_listening.iloc[0]
    print(f"‚úì Mois le plus actif : {top_month['month_name']} ({top_month['hours_listened']:.2f}h)")

    # 5. PARTS OF THE DAY (Morning/Afternoon/Night)
    streaming['hour'] = streaming['endTime'].dt.hour

    def time_bucket(hour):
        if 3 <= hour < 11:
            return 'Morning'
        if 11 <= hour < 18:
            return 'Afternoon'
        return 'Night'

    streaming['time_bucket'] = streaming['hour'].apply(time_bucket)

    daily_listening = streaming.groupby('time_bucket')['msPlayed'].sum().reset_index()
    daily_listening['hours_listened'] = daily_listening['msPlayed'] / 3600000
    daily_listening['percentage'] = (daily_listening['hours_listened'] / total_time_hours) * 100
    daily_listening = daily_listening.sort_values('hours_listened', ascending=False)

    top_bucket = daily_listening.iloc[0]
    print(f"‚úì Plage pr√©f√©r√©e : {top_bucket['time_bucket']} ({top_bucket['percentage']:.1f}%)")

    # 6. PER HOUR
    hour_listening = streaming.groupby('hour')['msPlayed'].sum().reset_index()
    hour_listening['hours_listened'] = hour_listening['msPlayed'] / 3600000
    hour_listening = hour_listening.sort_values('hours_listened', ascending=False)

    peak_hour = hour_listening.iloc[0]
    print(f"‚úì Heure de pointe : {int(peak_hour['hour'])}h ({peak_hour['hours_listened']:.2f}h √©cout√©es)")

    # 7. PER WEEKDAY
    streaming['day_of_week'] = streaming['endTime'].dt.day_name()
    weekday_listening = streaming.groupby('day_of_week')['msPlayed'].sum().reset_index()
    weekday_listening['hours_listened'] = weekday_listening['msPlayed'] / 3600000

    # Order the days Monday..Sunday and attach their French names
    day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    day_names_fr = {
        'Monday': 'Lundi', 'Tuesday': 'Mardi', 'Wednesday': 'Mercredi',
        'Thursday': 'Jeudi', 'Friday': 'Vendredi', 'Saturday': 'Samedi', 'Sunday': 'Dimanche'
    }

    weekday_listening['day_order'] = weekday_listening['day_of_week'].apply(lambda x: day_order.index(x))
    weekday_listening = weekday_listening.sort_values('day_order')
    weekday_listening['day_name_fr'] = weekday_listening['day_of_week'].map(day_names_fr)

    # Final result
    result = {
        'total_hours': round(total_time_hours, 2),
        'total_days': total_days,
        'avg_mins_per_day': round(avg_mins_per_day, 2),
        'avg_hours_per_day': round(avg_hours_per_day, 2),
        'date_range': {
            'start': str(date_min),
            'end': str(date_max)
        },
        'monthly': monthly_listening[['month', 'month_name', 'hours_listened']].to_dict('records'),
        'top_month': {
            'name': top_month['month_name'],
            'hours': round(top_month['hours_listened'], 2)
        },
        'time_buckets': daily_listening[['time_bucket', 'hours_listened', 'percentage']].to_dict('records'),
        'top_time_bucket': {
            'name': top_bucket['time_bucket'],
            'percentage': round(top_bucket['percentage'], 1)
        },
        'hourly': hour_listening[['hour', 'hours_listened']].to_dict('records'),
        'peak_hour': {
            'hour': int(peak_hour['hour']),
            'hours_listened': round(peak_hour['hours_listened'], 2)
        },
        'weekday': weekday_listening[['day_name_fr', 'hours_listened']].to_dict('records'),
        'top_weekday': {
            'name': weekday_listening.sort_values('hours_listened', ascending=False).iloc[0]['day_name_fr'],
            'hours': round(weekday_listening['hours_listened'].max(), 2)
        }
    }

    print("\n‚úÖ Statistiques temporelles calcul√©es!")
    return result


def display_temporal_stats(stats):
    """Print a short summary of the temporal statistics.

    Args:
        stats: Dict as returned by calculate_temporal_stats(); the keys read
            here are 'total_hours', 'total_days', 'avg_mins_per_day',
            'top_month', 'top_time_bucket', 'peak_hour' and 'top_weekday'.
    """
    rule = "=" * 60
    month = stats['top_month']
    bucket = stats['top_time_bucket']
    peak = stats['peak_hour']
    weekday = stats['top_weekday']

    # Header
    print("\n" + rule)
    print("üìä R√âSUM√â DES STATISTIQUES TEMPORELLES")
    print(rule)

    # Overall figures
    print(f"\nüéµ TEMPS TOTAL : {stats['total_hours']} heures")
    print(f"üìÖ P√âRIODE : {stats['total_days']} jours")
    print(f"‚è±Ô∏è  MOYENNE QUOTIDIENNE : {stats['avg_mins_per_day']} minutes")

    # Favourites
    print(f"\nüèÜ MOIS LE PLUS ACTIF : {month['name']} ({month['hours']}h)")
    print(f"‚òÄÔ∏è MOMENT PR√âF√âR√â : {bucket['name']} ({bucket['percentage']}%)")
    print(f"üïê HEURE DE POINTE : {peak['hour']}h")
    print(f"üìÜ JOUR PR√âF√âR√â : {weekday['name']}")

    # Footer
    print("\n" + rule)

def save_results_with_temporal(tops, temporal_stats, save_path='/content/drive/MyDrive/Spotify'):
    """Write the rankings and temporal statistics to 'spotify_wrapped_data.json'.

    Args:
        tops: Dict with the DataFrames 'songs', 'artists', 'albums', 'genres'
            and 'all_data'.
        temporal_stats: JSON-serialisable object stored under 'temporal_stats'.
        save_path: Existing folder in which the JSON file is written.

    Returns:
        The path of the file that was written.
    """
    all_data = tops['all_data']
    genres = tops['genres']

    # One "top_<name>" list of records per ranking, in a fixed order
    results = {
        f'top_{name}': tops[name].to_dict('records')
        for name in ('songs', 'artists', 'albums', 'genres')
    }

    if genres.empty:
        top_genre = 'Unknown'
    else:
        top_genre = genres.iloc[0]['Genre']

    results['stats'] = {
        'total_artists': len(all_data['Artist Name'].unique()),
        'total_songs': len(all_data),
        'top_genre': top_genre
    }
    results['temporal_stats'] = temporal_stats

    output_file = os.path.join(save_path, 'spotify_wrapped_data.json')
    with open(output_file, 'w', encoding='utf-8') as handle:
        json.dump(results, handle, ensure_ascii=False, indent=2)

    print(f"\nüíæ R√©sultats sauvegard√©s: {output_file}")
    return output_file

# ==================== EX√âCUTION PRINCIPALE ====================

def main():
    """Print the banner and display the data-loading interface.

    Only the loading step runs here: the data is loaded when the user clicks
    the button, so the remaining steps (cleaning, metadata lookup, rankings,
    statistics, export) have to be run manually afterwards.
    """
    print("="*60)
    # The year comes from the configuration; it was previously hard-coded as 2024.
    print(f"üéµ SPOTIFY WRAPPED {YEAR_TO_ANALYZE} - ANALYSE COMPL√àTE")
    print("="*60)

    # 1. Loading
    create_loading_interface()

    print("\n‚è≥ Attendez le chargement via l'interface ci-dessus...")
    print("   Puis ex√©cutez la suite du code manuellement.")


# Show the loading interface; the remaining steps are run manually once the data is loaded
main()

# ==================== PREPARATION DATA (√Ä ex√©cuter apr√®s chargement des donn√©es) ====================
"""
Ex√©cutez chacune des √©tapes apr√®s avoir charg√© les donn√©es via l'interface :

#2. Nettoyage
streaming_clean = clean_data(streaming)
streaming_clean = detect_and_remove_ambient(streaming_clean)

# 3. R√©cup√©ration genres & albums
songs_with_metadata = fetch_genres_and_albums(streaming_clean)

# 4. G√©n√©ration tops
tops = generate_tops(songs_with_metadata)

# 5. Calcul des statistiques temporelles
temporal_stats = calculate_temporal_stats(streaming_clean)

# 6. Affichage
display_results(tops)
display_temporal_stats(temporal_stats)

# 7. Sauvegarde pour Manim
json_file = save_results_with_temporal(tops, temporal_stats)

print("\n‚úÖ ANALYSE TERMIN√âE!")
print(f"üìÑ Fichier g√©n√©r√©: {json_file}")
print("üé¨ Vous pouvez maintenant utiliser ce fichier avec le script Manim")
"""


üíæ R√©sultats sauvegard√©s: /content/drive/MyDrive/Spotify/spotify_wrapped_data.json


In [12]:
# ==================== INSTRUCTIONS D'UTILISATION ====================
"""
√âTAPES POUR G√âN√âRER VOTRE VID√âO:

1. Ex√©cutez d'abord le script d'analyse complet (CODE 1)
2. Assurez-vous que le fichier 'spotify_wrapped_data.json' existe
3. Installez Manim (Cellule 1) - une seule fois
4. Ex√©cutez la Cellule 2 pour charger les donn√©es
5. Ex√©cutez la Cellule 3 (%%manim) pour g√©n√©rer la vid√©o

OPTIONS DE QUALIT√â:
- -ql : Low quality (rapide, pour tester) ~30 secondes
- -qm : Medium quality (recommand√©) ~2-3 minutes
- -qh : High quality (HD) ~5-7 minutes
- -qk : 4K quality (tr√®s lent) ~15-20 minutes

La vid√©o sera sauvegard√©e dans: /content/media/videos/
"""
# =============================================================================
"""
SPOTIFY WRAPPED 2025 - VID√âO MANIM AVEC STATS TEMPORELLES
=========================================================
Ce script charge les donn√©es JSON g√©n√©r√©es par l'analyse
et cr√©e automatiquement la vid√©o Spotify Wrapped.
"""

# ==================== INSTALLATION (√Ä ex√©cuter une seule fois) ====================
"""
Cellule 1: Installation Manim

print("1/4 ‚è≥ Installation des outils syst√®me (Linux)...")
!sudo apt update -qq
!sudo apt install libcairo2-dev ffmpeg texlive texlive-latex-extra texlive-fonts-extra texlive-latex-recommended texlive-science tipa libpango1.0-dev -qq

print("2/4 ‚è≥ Installation de Manim...")
!pip install manim -qq

print("3/4 üîß FOR√áAGE de la compatibilit√© Google Colab...")
# C'est la ligne magique : on oblige Python √† remettre la version 7.34.0
# Cela va "casser" les d√©pendances de Manim en th√©orie, mais √ßa le fera marcher en pratique.
!pip install "ipython==7.34.0" -qq

print("4/4 ‚úÖ Installation termin√©e. Red√©marrage...")
import os
os.kill(os.getpid(), 9)
"""
# ==================== CELLULE 2: IMPORTS ET CHARGEMENT DONN√âES ====================

from manim import *
import json

# Charger les donn√©es (√† adapter selon votre chemin)
DATA_FILE = '/content/drive/MyDrive/Spotify/spotify_wrapped_data.json'

class SpotifyWrappedComplete(Scene):
    def construct(self):
        """Build the whole video by playing every section in sequence.

        The JSON file at ``DATA_FILE`` is read once. Its ``stats``,
        ``top_songs``, ``top_artists`` and ``top_genres`` entries are
        indexed directly (a missing one raises ``KeyError``), while the
        three temporal sections are played only when a ``temporal_stats``
        entry is present.
        """
        with open(DATA_FILE, 'r', encoding='utf-8') as source:
            wrapped = json.load(source)

        # Dark background shared by every section.
        self.camera.background_color = "#191414"

        # Opening title, then the headline numbers.
        self.show_intro()
        self.show_global_stats(wrapped['stats'])

        # Temporal sections are optional: skipped when the export lacks them.
        if 'temporal_stats' in wrapped:
            temporal = wrapped['temporal_stats']
            for play_section in (
                self.show_temporal_overview,
                self.show_monthly_chart,
                self.show_daily_pattern,
            ):
                play_section(temporal)

        # Every ranking is cut down to its first five entries.
        top_count = 5
        self.show_top_songs(wrapped['top_songs'][:top_count])
        self.show_top_artists(wrapped['top_artists'][:top_count])
        self.show_top_genres(wrapped['top_genres'][:top_count])

        # Closing message.
        self.show_outro()

    # ===== STATS TEMPORELLES =====

    def show_temporal_overview(self, temporal_stats):
        """Show three headline cards summarising the listening time.

        Reads ``total_hours``, ``avg_mins_per_day`` and
        ``top_month['name']`` from ``temporal_stats``, displays each one in
        a stat box below the section title, then fades everything out.
        """
        heading = Text("Ton Ann√©e en Musique", font_size=50, color="#1DB954")
        heading.to_edge(UP, buff=0.5)
        self.play(Write(heading))

        # (value, caption, colour) of each card, from left to right.
        card_specs = [
            (f"{temporal_stats['total_hours']}h", "Temps Total d'√âcoute", "#1DB954"),
            (f"{temporal_stats['avg_mins_per_day']} min", "Moyenne par Jour", "#1ED760"),
            (temporal_stats['top_month']['name'], "Mois le Plus Actif", "#1FDF64"),
        ]
        cards = VGroup(*[
            self.create_stat_box(shown_value, caption, tint)
            for shown_value, caption, tint in card_specs
        ])
        cards.arrange(RIGHT, buff=0.5)
        cards.next_to(heading, DOWN, buff=0.8)

        # Reveal the cards one after another.
        for card in cards:
            self.play(FadeIn(card, scale=0.8), run_time=0.5)

        self.wait(2)
        self.play(FadeOut(heading), FadeOut(cards))

    def show_monthly_chart(self, temporal_stats):
        """Animate a bar chart of the hours listened in each month.

        Uses at most the first 12 entries of ``temporal_stats['monthly']``;
        each entry must provide ``month_name`` and ``hours_listened``. Bar
        heights are scaled so that the busiest month is 3 units tall, and
        all bars stand on one common baseline. When there are no entries,
        or no month has a positive total, only the title is shown and
        removed (nothing can be scaled in that case).
        """
        title = Text("√âcoutes par Mois", font_size=50, color="#1DB954")
        title.to_edge(UP, buff=0.5)
        self.play(Write(title))

        monthly_data = temporal_stats['monthly'][:12]  # at most 12 months

        # default=0 keeps max() from raising on an empty list; a
        # non-positive maximum would make the height ratio below undefined.
        max_hours = max((m['hours_listened'] for m in monthly_data), default=0)
        if max_hours <= 0:
            self.wait(2)
            self.play(FadeOut(title))
            return

        bars = VGroup()
        for month in monthly_data:
            bar = Rectangle(
                width=0.5,
                height=(month['hours_listened'] / max_hours) * 3,
                fill_color="#1DB954",
                fill_opacity=0.8,
                stroke_width=0
            )

            # Month label: first three letters, placed under the bar.
            month_label = Text(month['month_name'][:3], font_size=16, color=WHITE)
            month_label.next_to(bar, DOWN, buff=0.1)

            # Rounded number of hours, placed above the bar.
            hours_text = Text(f"{month['hours_listened']:.0f}h", font_size=14, color=WHITE)
            hours_text.next_to(bar, UP, buff=0.1)

            bars.add(VGroup(bar, month_label, hours_text))

        bars.arrange(RIGHT, buff=0.3)
        # arrange() centres the groups vertically, which would leave bars of
        # different heights floating; drop every group so that the bottom
        # edge of its bar sits at y = 0.
        for month_group in bars:
            month_group.shift(DOWN * month_group[0].get_bottom()[1])
        bars.move_to(ORIGIN).shift(DOWN * 0.3)

        # Grow each bar from its base, then reveal its two labels.
        for bar_group in bars:
            self.play(GrowFromEdge(bar_group[0], DOWN), run_time=0.3)
            self.play(FadeIn(bar_group[1]), FadeIn(bar_group[2]), run_time=0.2)

        self.wait(2)
        self.play(FadeOut(title), FadeOut(bars))

    def show_daily_pattern(self, temporal_stats):
        """Show the share of listening for each part of the day.

        Each entry of ``temporal_stats['time_buckets']`` must provide
        ``time_bucket`` and ``percentage``. Entries are displayed as
        circular badges ordered from the largest to the smallest share.
        """
        heading = Text("Tes Moments Pr√©f√©r√©s", font_size=50, color="#1DB954")
        heading.to_edge(UP, buff=0.5)
        self.play(Write(heading))

        slots = temporal_stats['time_buckets']

        # Display names; a bucket missing from this table keeps its raw name.
        display_names = {
            'Morning': 'üåÖ Matin',
            'Afternoon': '‚òÄÔ∏è Apr√®s-midi',
            'Night': 'üåô Soir/Nuit'
        }

        def build_badge(slot):
            # Ring with the rounded percentage inside and the name below.
            ring = Circle(
                radius=0.8,
                fill_color="#1DB954",
                fill_opacity=0.2,
                stroke_color="#1DB954",
                stroke_width=4
            )
            share = Text(f"{slot['percentage']:.0f}%", font_size=36, color=WHITE)
            share.move_to(ring.get_center())

            raw_name = slot['time_bucket']
            caption = Text(
                display_names.get(raw_name, raw_name),
                font_size=24,
                color=GRAY
            )
            caption.next_to(ring, DOWN, buff=0.3)
            return VGroup(ring, share, caption)

        # Largest share first.
        ranked_slots = sorted(slots, key=lambda slot: slot['percentage'], reverse=True)

        badges = VGroup(*[build_badge(slot) for slot in ranked_slots])
        badges.arrange(RIGHT, buff=1.2)
        badges.move_to(ORIGIN)

        for badge in badges:
            self.play(FadeIn(badge, scale=0.7), run_time=0.6)

        self.wait(2)
        self.play(FadeOut(heading), FadeOut(badges))

    # ===== TOPS + GENRES =====

    def show_intro(self):
        """Play the opening title: two stacked lines, shown then faded out."""
        top_line = Text("My Spotify", font_size=72, color=WHITE)
        bottom_line = Text("Wrapped 2025", font_size=72, color="#1DB954")

        # Stack the two lines before animating so each appears in place.
        banner = VGroup(top_line, bottom_line)
        banner.arrange(DOWN, buff=0.3)

        self.play(Write(top_line, run_time=1.5), rate_func=smooth)
        self.play(FadeIn(bottom_line, shift=UP), run_time=1)
        self.wait(1)

        self.play(FadeOut(banner))

    def show_global_stats(self, stats):
        """Show three cards with the overall numbers.

        Reads ``total_artists``, ``total_songs`` and ``top_genre`` from
        ``stats`` and displays each one in a stat box below the title.
        """
        heading = Text("Statistiques Globales", font_size=48, color="#1DB954")
        heading.to_edge(UP, buff=0.5)
        self.play(Write(heading))

        # (value, caption, colour) of each card, from left to right.
        card_specs = [
            (str(stats['total_artists']), "Artistes Uniques", "#1DB954"),
            (str(stats['total_songs']), "Chansons √âcout√©es", "#1ED760"),
            (stats['top_genre'], "Genre Favori", "#1FDF64"),
        ]
        cards = VGroup(*[
            self.create_stat_box(shown_value, caption, tint)
            for shown_value, caption, tint in card_specs
        ])
        cards.arrange(RIGHT, buff=0.5)
        cards.next_to(heading, DOWN, buff=0.8)

        # Reveal the cards one after another.
        for card in cards:
            self.play(FadeIn(card, scale=0.8), run_time=0.5)

        self.wait(2)
        self.play(FadeOut(heading), FadeOut(cards))

    def show_top_songs(self, songs):
        """Show the ranked list of songs, one row per song.

        Each item of ``songs`` must provide ``Song Name``, ``Artist Name``
        and ``No. of times listened``; the rank is the 1-based position in
        the list.
        """
        heading = Text("Top 5 Chansons", font_size=50, color="#1DB954")
        heading.to_edge(UP, buff=0.5)
        self.play(Write(heading))

        rows = VGroup()
        for position, track in enumerate(songs, start=1):
            row = self.create_song_entry(
                str(position),
                track['Song Name'],
                track['Artist Name'],
                str(int(track['No. of times listened'])),
            )
            # Rows are spaced 1.2 units apart, the first one starting 2 units up.
            row.shift(DOWN * ((position - 1) * 1.2 - 2))
            rows.add(row)

        rows.move_to(ORIGIN).shift(DOWN * 0.5)

        # Slide the rows in one at a time.
        for row in rows:
            self.play(FadeIn(row, shift=RIGHT), run_time=0.7)

        self.wait(3)
        self.play(FadeOut(heading), FadeOut(rows))

    def show_top_artists(self, artists):
        """Show the ranked artists as a horizontal bar chart.

        Each item of ``artists`` must provide ``Artist Name`` and
        ``No. of times listened``.
        """
        heading = Text("Top 5 Artistes", font_size=50, color="#1DB954")
        heading.to_edge(UP, buff=0.5)
        self.play(Write(heading))

        # (rank, name, play count) rows, all as strings, for create_bar_chart.
        ranking = []
        for position, entry in enumerate(artists, start=1):
            ranking.append((
                str(position),
                entry['Artist Name'],
                str(int(entry['No. of times listened'])),
            ))

        chart = self.create_bar_chart(ranking, "√©coutes")
        chart.next_to(heading, DOWN, buff=1)

        self.play(FadeIn(chart))
        self.wait(3)
        self.play(FadeOut(heading), FadeOut(chart))

    def show_top_genres(self, genres):
        """Show the top genres as a pie chart with a legend beside it.

        Each item of ``genres`` must provide ``Genre`` and
        ``Total Listens``. The five-colour palette is reused cyclically, so
        a list longer than five entries no longer raises ``IndexError``.
        When the list is empty or its listens add up to zero there is
        nothing to draw: the title is shown and removed, and the pie and
        legend are skipped.
        """
        genres_title = Text("Top Genres", font_size=50, color="#1DB954")
        genres_title.to_edge(UP, buff=0.5)
        self.play(Write(genres_title))

        colors = ["#1DB954", "#1ED760", "#1FDF64", "#509952", "#7DAA5E"]
        genre_data = [
            # The modulo wraps around the palette for a sixth genre onwards.
            (genre['Genre'], int(genre['Total Listens']), colors[i % len(colors)])
            for i, genre in enumerate(genres)
        ]

        # A zero total cannot be split into pie slices (division by zero).
        if sum(listens for _, listens, _ in genre_data) <= 0:
            self.wait(1)
            self.play(FadeOut(genres_title))
            return

        pie_chart = self.create_pie_chart(genre_data)
        pie_chart.move_to(ORIGIN).shift(LEFT * 2)

        self.play(Create(pie_chart), run_time=2)

        legends = self.create_legends(genre_data)
        legends.next_to(pie_chart, RIGHT, buff=1)
        self.play(FadeIn(legends))

        self.wait(3)
        self.play(FadeOut(genres_title), FadeOut(pie_chart), FadeOut(legends))

    def show_outro(self):
        """Write the closing message, hold it for a moment, then fade it out."""
        farewell = Text(
            "Merci pour cette ann√©e musicale ! üéµ",
            font_size=40,
            color="#1DB954",
        )
        self.play(Write(farewell))
        self.wait(2)
        self.play(FadeOut(farewell))

    # ===== HELPER FUNCTIONS =====

    def create_stat_box(self, value, label, color):
        """Build a rounded card showing a value with a caption under it.

        Args:
            value: Value to display; converted with ``str`` and cut to 10
                characters plus "..." when longer.
            label: Caption placed under the value; converted with ``str``.
            color: Colour used for the card's fill and outline.

        Returns:
            A ``VGroup`` holding, in order, the box, the value text and the
            caption text.
        """
        box = RoundedRectangle(
            corner_radius=0.2,
            width=3.5,
            height=2,
            fill_color=color,
            fill_opacity=0.2,
            stroke_color=color,
            stroke_width=3
        )

        value_str = str(value)
        if len(value_str) > 10:
            value_str = value_str[:10] + "..."

        value_text = Text(value_str, font_size=40, color=WHITE)
        label_text = Text(str(label), font_size=18, color=GRAY)

        # Truncating by character count does not bound the rendered width,
        # so shrink any text that would spill over the card, keeping a
        # 0.2-unit margin on each side (same approach as create_bar_chart).
        max_text_width = box.width - 0.4
        for text in (value_text, label_text):
            if text.width > max_text_width:
                text.scale(max_text_width / text.width)

        value_text.move_to(box.get_center())
        label_text.next_to(value_text, DOWN, buff=0.2)

        return VGroup(box, value_text, label_text)

    def create_song_entry(self, rank, song, artist, plays, max_plays=120):
        """Build one row of the song ranking.

        Args:
            rank: Position shown at the left of the row.
            song: Song title; cut to 25 characters plus "..." when longer.
            artist: Artist name; cut to 20 characters plus "..." when longer.
            plays: Play count, as a number or a numeric string.
            max_plays: Play count that corresponds to a full-width bar
                (4 units). Defaults to 120, the value that used to be
                hard-coded; larger counts are capped at full width.

        Returns:
            A ``VGroup`` holding, in order, the rank text, the stacked
            title and artist texts, the bar and the play-count text.
        """
        # Names are converted first so a non-string value cannot break len().
        song = str(song)
        artist = str(artist)
        song_str = song if len(song) <= 25 else song[:25] + "..."
        artist_str = artist if len(artist) <= 20 else artist[:20] + "..."

        rank_text = Text(str(rank), font_size=36, color="#1DB954")
        song_text = Text(song_str, font_size=24, color=WHITE)
        artist_text = Text(artist_str, font_size=18, color=GRAY)

        song_artist = VGroup(song_text, artist_text).arrange(DOWN, buff=0.1, aligned_edge=LEFT)

        # Share of the reference count, kept within [0, 1]; a non-positive
        # reference gives an empty bar instead of a division error.
        fraction = float(plays) / max_plays if max_plays > 0 else 0.0
        bar_width = min(max(fraction, 0.0), 1.0) * 4
        bar = Rectangle(
            width=bar_width,
            height=0.3,
            fill_color="#1DB954",
            fill_opacity=0.8,
            stroke_width=0
        )

        plays_text = Text(str(plays), font_size=20, color=WHITE)

        full_entry = VGroup(rank_text, song_artist, bar, plays_text).arrange(RIGHT, buff=0.5)
        # Lower the bar to sit just above the bottom of the title/artist pair.
        bar.align_to(song_artist, DOWN).shift(UP * 0.1)

        return full_entry

    def create_bar_chart(self, data, unit):
        """Build a horizontal bar chart, one line per data row.

        Args:
            data: Sequence of ``(rank, name, value)`` tuples; ``value`` may
                be a number or a numeric string. ``rank`` is not displayed.
            unit: Word appended after each value.

        Returns:
            A ``VGroup`` of lines stacked top to bottom, each containing the
            name, the bar and the value text. Empty when ``data`` is empty.
        """
        bars = VGroup()
        if not data:
            # Nothing to plot; the first row used to be read unconditionally.
            return bars

        # Scale against the largest value rather than the first row, so the
        # chart stays within 6 units even when the rows are not sorted.
        max_value = max(float(value) for _, _, value in data)

        for _rank, name, value in data:
            name_str = name if len(name) <= 15 else name[:15] + "..."
            name_text = Text(name_str, font_size=22, color=WHITE)

            # Shrink names that are still too wide after truncation.
            if name_text.width > 2.5:
                name_text.scale(2.5 / name_text.width)

            # A non-positive maximum gives empty bars instead of dividing by zero.
            bar_width = (float(value) / max_value) * 6 if max_value > 0 else 0
            bar = Rectangle(
                width=bar_width,
                height=0.4,
                fill_color="#1DB954",
                fill_opacity=0.7,
                stroke_width=0
            )

            value_text = Text(f"{value} {unit}", font_size=18, color=WHITE)

            line = VGroup(name_text, bar, value_text).arrange(RIGHT, buff=0.3)
            bars.add(line)

        bars.arrange(DOWN, buff=0.4, aligned_edge=LEFT)
        return bars

    def create_pie_chart(self, data):
        """Build a pie chart with one sector per data row.

        Args:
            data: Sequence of ``(name, value, color)`` tuples; each sector's
                angle is proportional to its ``value``.

        Returns:
            A ``VGroup`` of sectors laid out counter-clockwise from the top
            of the circle. Empty when the values add up to zero or less,
            since no proportions can be computed in that case.
        """
        sectors = VGroup()

        total = sum(value for _, value, _ in data)
        if total <= 0:
            # Covers empty data too; avoids dividing by zero below.
            return sectors

        start_angle = 90 * DEGREES  # first sector starts at 12 o'clock
        for _name, value, color in data:
            angle = (value / total) * TAU
            sector = AnnularSector(
                inner_radius=0,
                outer_radius=2,
                angle=angle,
                start_angle=start_angle,
                fill_color=color,
                fill_opacity=0.8,
                stroke_color=WHITE,
                stroke_width=2
            )
            sectors.add(sector)
            # The next sector begins where this one ends.
            start_angle += angle

        return sectors

    def create_legends(self, data):
        """Build the legend: one coloured square and caption per data row.

        Args:
            data: Sequence of ``(name, value, color)`` tuples. Names longer
                than 12 characters are cut and suffixed with "...".

        Returns:
            A ``VGroup`` of rows stacked top to bottom, aligned on the left.
        """
        rows = []
        for name, value, color in data:
            swatch = Square(side_length=0.3, fill_color=color, fill_opacity=0.8, stroke_width=0)

            short_name = name if len(name) <= 12 else name[:12] + "..."
            caption = Text(f"{short_name}: {value}", font_size=18, color=WHITE)

            rows.append(VGroup(swatch, caption).arrange(RIGHT, buff=0.2))

        legends = VGroup(*rows)
        legends.arrange(DOWN, buff=0.3, aligned_edge=LEFT)
        return legends


# Confirm that the scene class is defined and show the command that renders
# it; the three messages are printed on consecutive lines.
print(
    "\n‚úÖ Classe Manim d√©finie avec succ√®s!",
    "üìπ √âTAPE SUIVANTE: Dans une NOUVELLE cellule, ex√©cutez:",
    "%%manim -qm -v WARNING SpotifyWrappedComplete",
    sep="\n",
)


‚úÖ Classe Manim d√©finie avec succ√®s!
üìπ √âTAPE SUIVANTE: Dans une NOUVELLE cellule, ex√©cutez:


In [8]:
# ==================== CELLULE 3: CLASSE MANIM ====================
%%manim -qm -v WARNING SpotifyWrappedComplete







