# In [None]:  (Jupyter cell marker left over from the notebook export)
# Version 1


import requests
import json
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import praw
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from textblob import TextBlob
from pytrends.request import TrendReq
import os
from scipy.stats import ttest_ind
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings("ignore", category = FutureWarning)


# Terms that determine_controversy() looks for, as lowercase substrings, in the
# example Reddit texts when deciding between "controversy" and "hype".
KEYWORDS = [
    "drama",
    "cancelled",
    "racist",
    "assault",
    "lawsuit",
    "controversy",
    "apology",
    "accused",
    "kill",
    "murder",
    "over",
    "cooked",
    "horrible",
    "trash",
    "criticism",
    "disrespect",
    "problematic",
    "backlash",
    "allegation",
    "scandal",
]



# Read every API credential from the local JSON config file.
with open("config.json") as config_file:
    keys = json.load(config_file)

# Spotify client authenticated through SpotifyClientCredentials.
spotify = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(
        client_id=keys["SPOTIFY_CLIENT_ID"],
        client_secret=keys["SPOTIFY_CLIENT_SECRET"],
    ),
)

# Reddit client used by get_artist_sentiment().
reddit = praw.Reddit(
    client_id=keys["REDDIT_CLIENT_ID"],
    client_secret=keys["REDDIT_CLIENT_SECRET"],
    user_agent="music_sentiment_bot by u/Many-Lingonberry1688",
)

# Shows which credential names were loaded (names only, not the values).
print(keys.keys())

# Soundcharts settings shared by get_artist_id() and historical_listener_data().
SOUNDCHARTS_KEY = keys["SOUNDCHARTS_API_KEY"]
BASE_URL = "https://api.soundcharts.com/api/v2"

headers = {"Authorization": f"Bearer {SOUNDCHARTS_KEY}"}

def get_artist_id(artist_name):
    """Look up an artist's Soundcharts identifier by name.

    Sends a GET request to ``{BASE_URL}/artist/search`` with the name as the
    ``q`` query parameter and reads the ``items`` list of the JSON response.

    Args:
        artist_name: Name to search for.

    Returns:
        The ``id`` field of the first entry in ``items``, or ``None`` when the
        response has no items or any error occurs. Errors are printed, never
        raised.
    """
    try:
        url = f"{BASE_URL}/artist/search"
        parameters = {"q": artist_name}
        # requests has no default timeout; without one a stalled connection
        # would block the whole batch run indefinitely.
        response = requests.get(url, headers=headers, params=parameters, timeout=15)
        # Surface HTTP failures (bad key, quota, server error) as errors
        # instead of misreporting them as "artist not found".
        response.raise_for_status()
        data = response.json()

        if not data or "items" not in data or not data["items"]:
            print(f"{artist_name} not found in soundcharts")
            return None

        return data["items"][0]["id"]

    except Exception as e:
        print(f"error getting {artist_name}: {e}")
        return None
    
def historical_listener_data(artist_name, start_date, end_date):
    """Fetch an artist's streaming-audience records for a date range.

    Resolves the artist through ``get_artist_id`` and then requests
    ``{BASE_URL}/artist/{artist_id}/audience/streaming`` with ``startDate``
    and ``endDate`` query parameters.

    Args:
        artist_name: Artist to look up.
        start_date: Value sent as ``startDate`` (callers in this file pass
            "YYYY-MM-DD" strings).
        end_date: Value sent as ``endDate``.

    Returns:
        A DataFrame built from the response's ``data`` list, with an added
        ``artist`` column and a ``retrieved_at`` timestamp column, or ``None``
        when the artist is unknown, the response has no data, or an error
        occurs. Errors are printed, never raised.
    """
    artist_id = get_artist_id(artist_name)
    if not artist_id:
        return None

    try:
        url = f"{BASE_URL}/artist/{artist_id}/audience/streaming"
        parameters = {
            "startDate": start_date,
            "endDate": end_date,
        }

        # requests has no default timeout; bound the wait so one stalled call
        # cannot hang the batch.
        response = requests.get(url, headers=headers, params=parameters, timeout=15)
        # Treat HTTP failures as errors rather than as "no audience data".
        response.raise_for_status()
        data = response.json()

        if "data" not in data or not data["data"]:
            print(f"no audience data for {artist_name}")
            return None

        df = pd.DataFrame(data["data"])
        df["artist"] = artist_name
        df["retrieved_at"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        return df

    except Exception as e:
        print(f"buddy could not get the data  for {artist_name}: {e}")
        return None
    
                                                     
def get_artist_sentiment(artist_name, number_of_posts = 15, time_filter = "week"):
    """Score how divisive recent Reddit discussion of an artist is.

    Searches a shuffled list of subreddits for ``artist_name`` (up to 5 newest
    posts per subreddit), scores each relevant post title and up to 5 relevant
    comments per post with TextBlob polarity, and derives a score from the
    spread of those polarities.

    Args:
        artist_name: Search query, also used for the relevance check.
        number_of_posts: Stop visiting further subreddits once this many posts
            have been seen. Checked per subreddit, so the total can overshoot,
            and posts rejected as irrelevant still count.
        time_filter: Passed through to the subreddit search.

    Returns:
        A ``(score, examples)`` tuple. ``score`` is the standard deviation of
        the collected polarities, doubled, capped at 1.0 and rounded to three
        decimals; it is ``np.nan`` when nothing was collected or scraping
        failed. ``examples`` maps "positive" and "negative" to lists of sample
        texts (post titles, and comment bodies truncated to 200 characters).
    """
    sentiments = []
    examples = {"positive": [], "negative": []}
    processed_ids = set()
    seen_comments = set()

    subreddits = [
        "music", "popheads", "hiphopheads", "indieheads",
        "popculturechat", "GenZ", "morbidquestions", "askreddit",
        "hiphopcirclejerk", "blackpeopletwitter", "letstalkmusic"
    ]
    # Visit subreddits in random order (in-place shuffle).
    np.random.shuffle(subreddits)

    keywords = [
        "song", "album", "music", "track", "listen", "release", "stream",
        "concert", "lyrics", "vocals", "fanbase", "chart", "award",
        "cancel", "controversy", "drama", "problematic", "backlash",
        "scandal", "beef", "apology", "offend", "allegation", "lawsuit",
        "accused", "critic", "criticize"
    ]

    def is_relevant(text):
        """Return True when text names the artist or contains any keyword."""
        if not text:
            return False
        lower = text.lower()
        if artist_name.lower() in lower:
            return True
        return any(keyword in lower for keyword in keywords)

    try:

        for sub in subreddits:
            if len(processed_ids) >= number_of_posts:
                break

            try:
                # PRAW listings are fetched lazily during iteration, so the
                # results are materialised here. That way a failing subreddit
                # is skipped by this handler instead of aborting the whole
                # scrape through the outer handler.
                search_results = list(reddit.subreddit(sub).search(
                    artist_name,
                    limit = 5,
                    sort = "new",
                    time_filter = time_filter
                ))
            except Exception:
                continue

            for submission in search_results:
                if submission.id in processed_ids:
                    continue
                processed_ids.add(submission.id)

                # Relevance looks at title plus self-text; polarity below is
                # computed from the title alone.
                title_text = (submission.title or "") + " " + (getattr(submission, "selftext", "") or "")
                if not is_relevant(title_text):
                    continue

                try:
                    polarity = TextBlob(submission.title).sentiment.polarity
                except Exception:
                    polarity = 0.0
                sentiments.append(polarity)

                if polarity > 0.3 and len(examples["positive"]) < 2:
                    examples["positive"].append(submission.title)
                elif polarity < -0.3 and len(examples["negative"]) < 2:
                    examples["negative"].append(submission.title)

                # limit = 0 drops the "load more comments" placeholders
                # without fetching them.
                submission.comments.replace_more(limit = 0)
                comment_count = 0

                for comment in submission.comments.list():
                    if comment_count >= 5:
                        break
                    body = getattr(comment, "body", "")
                    if not body or len(body.strip()) < 20:
                        continue
                    if body in seen_comments:
                        continue
                    if not is_relevant(body):
                        continue

                    seen_comments.add(body)
                    try:
                        cpol = TextBlob(body).sentiment.polarity
                    except Exception:
                        cpol = 0.0
                    sentiments.append(cpol)

                    if cpol > 0.3 and len(examples["positive"]) < 4:
                        examples["positive"].append(body[:200])
                    elif cpol < -0.3 and len(examples["negative"]) < 4:
                        examples["negative"].append(body[:200])

                    comment_count += 1
                    time.sleep(0.08)

                time.sleep(0.25)

    except Exception as e:
        print(f"Reddit scraping error: {e}")
        return np.nan, examples

    if not sentiments:
        return np.nan, examples

    # Wider spread of polarities means a more divisive discussion.
    std_sentiment = float(np.std(sentiments))
    controversy_score = min(std_sentiment * 2, 1.0)
    return round(controversy_score, 3), examples


def trend_detector(artist_name, timeframe = "today 12-m"):
    """Find the date on which search interest in an artist peaked.

    Queries Google Trends through pytrends for ``artist_name`` over
    ``timeframe`` and takes the index of the maximum interest value. A request
    whose error message contains "429" is retried after a random 10-30 second
    wait, for at most three attempts in total.

    Args:
        artist_name: Search term sent to Google Trends.
        timeframe: pytrends timeframe string.

    Returns:
        The peak date formatted as "YYYY-MM-DD", or ``None`` when there is no
        trend data, all attempts were rate-limited, or another error occurred.
        Errors are printed, never raised.
    """
    from random import uniform

    try:
        pytrends = TrendReq(hl = "en-US", tz = 360)

        # Google rate-limits these requests, hence the bounded retry loop.
        for _ in range(3):

            try:
                pytrends.build_payload([artist_name], cat = 0, timeframe = timeframe, geo = "", gprop = "")
                df = pytrends.interest_over_time()

                if df.empty or artist_name not in df.columns:
                    print(f"no trend data 4 {artist_name}")
                    return None

                spike_date = df[artist_name].idxmax().strftime("%Y-%m-%d")
                print(artist_name, spike_date)
                return spike_date

            except Exception as nested_e:
                if "429" not in str(nested_e):
                    # Not a rate limit: hand over to the outer handler with
                    # the original traceback intact.
                    raise
                wait_time = uniform(10, 30)
                print("goody ahh google bruh")
                time.sleep(wait_time)

        print(f"tried multiple times for {artist_name}, still doesnt work")
        return None

    except Exception as e:
        print(f"error getting {artist_name} as {e}")
        return None


def determine_controversy(artist_name, spike_date):
    """Label a search-interest spike as "controversy" or "hype".

    Runs ``get_artist_sentiment`` with ``time_filter="week"`` and labels the
    spike "controversy" only when the returned example texts contain at least
    one ``KEYWORDS`` entry AND the score exceeds 0.4. Everything else,
    including a NaN score, is labelled "hype".

    NOTE: the sentiment sample always covers Reddit's most recent week.
    ``spike_date`` is only validated and printed; it does not restrict which
    posts are searched.

    Args:
        artist_name: Artist whose discussion is sampled.
        spike_date: Spike date as a "YYYY-MM-DD" string.

    Returns:
        "controversy" or "hype".

    Raises:
        ValueError: If ``spike_date`` is not in "YYYY-MM-DD" format.
    """
    # Parse only to validate the format. The +/-3 day window the original
    # derived here was never used, so it is no longer computed.
    datetime.strptime(spike_date, "%Y-%m-%d")

    score, examples = get_artist_sentiment(artist_name, time_filter = "week")

    joined_text = " ".join(examples["negative"] + examples["positive"]).lower()

    # Plain substring matching: short entries such as "over" also match inside
    # longer words (e.g. "cover").
    has_keyword = any(word in joined_text for word in KEYWORDS)

    if has_keyword and score > 0.4:
        label = "controversy"
    else:
        label = "hype"

    print(f"{spike_date}'s trend for {artist_name} is {label.upper()}")
    return label



def compare_trends(artist_name):
    """Compare an artist's listener numbers before and after a controversy.

    Steps:
      1. ``trend_detector`` finds the date of peak search interest.
      2. ``determine_controversy`` labels it; anything other than
         "controversy" is skipped.
      3. ``historical_listener_data`` is fetched for 30 days either side of
         the date and split into rows strictly before and strictly after it,
         using the ``date`` column.
      4. The means of the ``value`` column on each side are compared.

    The two sentiment scores come from ``get_artist_sentiment`` with
    ``time_filter`` "month" (reported as ``before_score``) and "week"
    (reported as ``after_score``).

    Args:
        artist_name: Artist to analyse.

    Returns:
        A dict with keys ``artist``, ``controversy_date``, ``label``,
        ``before_listeners``, ``after_listeners``, ``before_score``,
        ``after_score``, ``difference`` and ``percent_change``, or ``None``
        when any step yields nothing usable. ``percent_change`` is NaN when
        the "before" mean is zero.
    """
    controversy_date = trend_detector(artist_name)
    if not controversy_date:
        print(f"no controversy found for {artist_name}")
        return None

    label = determine_controversy(artist_name, controversy_date)
    if label != "controversy":
        print("this is just hype, skipping")
        return None

    spike = datetime.strptime(controversy_date, "%Y-%m-%d")
    window = timedelta(days = 30)
    start_date = (spike - window).strftime("%Y-%m-%d")
    end_date = (spike + window).strftime("%Y-%m-%d")

    # Listener data is fetched before the sentiment scrapes: if it is missing
    # the artist is discarded anyway, so this avoids two slow Reddit passes.
    df = historical_listener_data(artist_name, start_date, end_date)
    if df is None or df.empty:
        print(f"could not find past listenr count for {artist_name}")
        return None

    # The date column is compared against the "YYYY-MM-DD" string as-is.
    before_df = df[df["date"] < controversy_date]
    after_df = df[df["date"] > controversy_date]

    if before_df.empty or after_df.empty:
        print(f"Not enough data for {artist_name}")
        return None

    before_score, _ = get_artist_sentiment(artist_name, time_filter = "month")
    after_score, _ = get_artist_sentiment(artist_name, time_filter = "week")

    before_listeners = before_df["value"].mean()
    after_listeners = after_df["value"].mean()
    difference = after_listeners - before_listeners

    # A zero baseline would give an infinite percentage that skews the later
    # group means and t-test; report NaN so dropna() removes it.
    if before_listeners:
        percent_change = (difference / before_listeners) * 100
    else:
        percent_change = np.nan

    return {
        "artist": artist_name,
        "controversy_date": controversy_date,
        "label": label,
        "before_listeners": before_listeners,
        "after_listeners": after_listeners,
        "before_score": before_score,
        "after_score": after_score,
        "difference": difference,
        "percent_change": percent_change,
    }


def batch_analysis(artists, save_path="artist_controversy_data.csv"):
    """Run ``compare_trends`` over many artists and persist the results.

    Artists already present in the CSV at ``save_path`` are skipped. New
    results are appended to the existing rows. When both the "hype" and
    "controversy" labels have at least two ``percent_change`` values, a
    Welch t-test is run and stored as a single "SUMMARY" row.

    Args:
        artists: Iterable of artist names.
        save_path: CSV file that is read (if it exists) and then overwritten.

    Returns:
        The combined DataFrame that was written, or ``None`` when no new
        results were produced (the file is left untouched in that case).
    """
    results = []

    if os.path.exists(save_path):
        existing_df = pd.read_csv(save_path)
        existing_artists = set(existing_df["artist"].dropna().unique())
        # Discard the aggregate row from an earlier run. It is recomputed
        # below; keeping it would add one more SUMMARY row on every run.
        existing_df = existing_df[existing_df["artist"] != "SUMMARY"]
    else:
        existing_df = None
        existing_artists = set()

    for artist in artists:
        if artist in existing_artists:
            continue

        try:
            result = compare_trends(artist)
            if result:
                result["timestamp"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                results.append(result)
            else:
                print(f"cannot use for  {artist}.")
        except Exception as e:
            print(f"error  {artist}: {e}")
            continue

        # Pause between artists to ease pressure on the rate-limited APIs.
        time.sleep(3)

    if not results:
        print("No new results .")
        return None

    new_df = pd.DataFrame(results)

    if existing_df is not None:
        df = pd.concat([existing_df, new_df], ignore_index = True)
    else:
        df = new_df

    if "label" in df.columns and "percent_change" in df.columns:
        hype = df[df["label"] == "hype"]["percent_change"].dropna()
        controversy = df[df["label"] == "controversy"]["percent_change"].dropna()

        if len(hype) >= 2 and len(controversy) >= 2:
            # equal_var = False selects Welch's t-test (unequal variances).
            t, p = ttest_ind(hype, controversy, equal_var = False)
            now = datetime.now()
            summary = pd.DataFrame([{
                "artist": "SUMMARY",
                "controversy_date": now.strftime("%Y-%m-%d"),
                "label": "aggregate",
                "before_listeners": np.nan,
                "after_listeners": np.nan,
                "before_score": np.nan,
                "after_score": np.nan,
                "difference": np.nan,
                "percent_change": np.nan,
                "avg_hype_change": hype.mean(),
                "avg_controversy_change": controversy.mean(),
                "t_statistic": t,
                "p_value": p,
                "timestamp": now.strftime("%Y-%m-%d %H:%M:%S")
            }])
            df = pd.concat([df, summary], ignore_index=True)
        else:
            print("need more data")
    else:
        print("missing colummn")

    df.to_csv(save_path, index=False)
    return df



def statistical_summary(csv_path = "artist_controversy_data.csv"):
    """Print listener-change statistics for the saved results.

    Reads the CSV, computes ``difference / before_listeners`` per row, prints
    the mean of that ratio per label, and, when both the "hype" and
    "controversy" groups have more than two usable values, prints a Welch
    t-test comparing them.

    Args:
        csv_path: CSV with ``label``, ``difference`` and ``before_listeners``
            columns.

    Returns:
        None. All output goes to stdout.
    """
    df = pd.read_csv(csv_path)
    df["difference_norm"] = df["difference"] / df["before_listeners"]

    print("\n avg  listnerer change (normallize by difference divided by before listeners)")
    print(df.groupby("label")["difference_norm"].mean())

    # The original compared against " controversy" (leading space), which
    # never matched, so the t-test could never run. NaN ratios are dropped so
    # they neither count towards the sample size nor turn the result into NaN.
    hype = df.loc[df["label"] == "hype", "difference_norm"].dropna()
    controversy = df.loc[df["label"] == "controversy", "difference_norm"].dropna()

    if len(hype) > 2 and len(controversy) > 2:
        # equal_var = False selects Welch's t-test (unequal variances).
        t, p = ttest_ind(hype, controversy, equal_var = False)
        print(f"\n T = {t:.3f}, p = {p:.5f}")
    else:
        print("need more data ")

def get_top_artists_lastfm(api_key, limit=100):
    """Fetch the names of Last.fm's current top-chart artists.

    Calls the ``chart.gettopartists`` method of the Last.fm web service and
    returns the artist names in random order.

    Args:
        api_key: Last.fm API key.
        limit: Value sent as the ``limit`` request parameter.

    Returns:
        A shuffled list of artist names, or an empty list when the request
        fails or the response lacks the expected ``artists.artist`` entries.
    """
    # HTTPS keeps the API key out of clear-text traffic.
    url = "https://ws.audioscrobbler.com/2.0/"
    parameters = {
        "method": "chart.gettopartists",
        "api_key": api_key,
        "format": "json",
        "limit": limit
    }

    try:
        # requests has no default timeout; bound the wait explicitly.
        response = requests.get(url, params = parameters, timeout = 15)
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # Network failure or non-JSON body: report it the same way as a
        # malformed response instead of crashing the caller.
        print(f"couldn't fetch top artists: {e}")
        return []

    if "artists" not in data or "artist" not in data["artists"]:
        print("couldn't fetch top artists")
        return []

    artist_names = [a["name"] for a in data["artists"]["artist"]]
    np.random.shuffle(artist_names)  # in-place shuffle
    return artist_names

def main():
    """Analyse Last.fm's top artists and print a statistical summary.

    Fetches up to 100 top artists, runs ``batch_analysis`` on them (which
    writes "artist_controversy_data.csv"), then prints
    ``statistical_summary`` for that file.
    """
    save_path = "artist_controversy_data.csv"

    api_key = keys["LASTFM_API_KEY"]
    artists = get_top_artists_lastfm(api_key, limit=100)

    print(f"Selected {len(artists)} artists for analysis.")
    batch_analysis(artists, save_path=save_path)

    # batch_analysis writes nothing when it produced no results, so on a first
    # run the file may not exist and reading it would raise FileNotFoundError.
    if os.path.exists(save_path):
        statistical_summary(save_path)
    else:
        print("no saved results to summarise")


if __name__ == "__main__":
    main()
                                                        



# KeyboardInterrupt:  (cell output residue left over from the notebook export)