In [None]:

#Version 3 (NO paid API Yet) (Lol waiting for club funding!)

import requests
import json
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import praw
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from textblob import TextBlob
import pytrends
from pytrends.request import TrendReq
from dotenv import load_dotenv
import os
from scipy.stats import ttest_ind
from datetime import datetime, timedelta
import warnings
import logging
import gspread
from gspread_dataframe import set_with_dataframe
from google.oauth2.service_account import Credentials
import random
# Shared Google Trends client used by trend_detector().
# NOTE: this rebinds the name `pytrends`, shadowing the `pytrends` module
# imported above; after this line `pytrends` is the TrendReq instance.
pytrends = TrendReq(hl="en-US", tz=360)
# Suppress FutureWarning messages for the rest of the run.
warnings.filterwarnings("ignore", category = FutureWarning)

# Load variables from a .env file into the process environment.
load_dotenv()

# Values read from environment variables (None when a variable is unset).
# In the visible file only `google_credentials` is used later (by
# export_to_google_sheets); the Spotify/Reddit clients further down are built
# from config.json instead of these variables.
google_credentials = os.getenv("GOOGLE_CREDENTIALS_PATH")
spotify_client_id = os.getenv("SPOTIFY_CLIENT_ID")
spotify_client_secret = os.getenv("SPOTIFY_CLIENT_SECRET")
reddit_client_id = os.getenv("REDDIT_CLIENT_ID")
reddit_client_secret = os.getenv("REDDIT_CLIENT_SECRET")

# Send INFO-and-above log records to run_log.txt with a timestamped format.
logging.basicConfig(
    filename = "run_log.txt",
    level = logging.INFO,
    format = "%(asctime)s - %(levelname)s - %(message)s" 
)
# Seed both the stdlib and NumPy random generators so shuffles, sleeps drawn
# from random.uniform, and bootstrap resamples are repeatable between runs.
random.seed(42)
np.random.seed(42)



def fetch_reddit_posts(subreddit_name, limit = 100, sort = "hot", reddit = None):
    """Collect basic metadata for posts from one subreddit.

    Args:
        subreddit_name: Name of the subreddit (without the ``r/`` prefix).
        limit: Maximum number of submissions to request from the listing.
        sort: One of "hot", "new", "top" or "rising"; any other value falls
            back to the "hot" listing.
        reddit: Reddit client object exposing ``subreddit(name)``. Required,
            despite the ``None`` default kept for signature compatibility.

    Returns:
        A pandas DataFrame with one row per submission and the columns
        post_id, title, score, num_comments, created_utc, upvote_ratio,
        selftext and url. Empty when nothing could be collected.

    Raises:
        ValueError: If ``reddit`` is None.
    """
    # Fail early with a clear message instead of an AttributeError on None.
    if reddit is None:
        raise ValueError(
            "fetch_reddit_posts requires a Reddit client passed via the 'reddit' argument"
        )

    print(f"searchying through r/{subreddit_name}")
    posts_data = []
    subreddit = reddit.subreddit(subreddit_name)

    # Map the requested sort order to the matching listing method.
    sort_map = {
        "hot": subreddit.hot,
        "new": subreddit.new,
        "top": subreddit.top,
        "rising": subreddit.rising
    }

    submissions = sort_map.get(sort, subreddit.hot)(limit = limit)

    for submission in submissions:
        try:
            posts_data.append({
                "post_id": submission.id,
                "title": submission.title,
                "score": submission.score,
                "num_comments": submission.num_comments,
                "created_utc": submission.created_utc,
                "upvote_ratio": submission.upvote_ratio,
                "selftext": submission.selftext,
                "url": submission.url
            })

        except Exception as e:
            # Best effort: skip the broken submission and pause briefly
            # before moving on to the next one.
            print(f"error w/ {submission.id}: {e}")
            time.sleep(2)

    df = pd.DataFrame(posts_data)
    print(f"collected{len(df)} posts")
    return df
    
def get_comment_for_post(post_id, reddit, max_comments = 50):
    """Return the text of usable top-level comments for one submission.

    Only the first ``max_comments`` entries of the submission's comment
    collection are inspected. Empty bodies, "[deleted]"/"[removed]"
    placeholders and comments shorter than three words are dropped. On any
    error the comments gathered so far are returned after a short pause.
    """
    placeholders = ("[deleted]", "[removed]")
    kept = []

    try:
        post = reddit.submission(id = post_id)
        # limit = 0 strips the "load more comments" stubs without fetching them.
        post.comments.replace_more(limit = 0)

        for item in post.comments[:max_comments]:
            text = item.body
            unusable = not text or text in placeholders
            if unusable or len(text.split()) < 3:
                continue
            kept.append(text)
    except Exception as err:
        print(f"error getting comments for {post_id} {err}")
        time.sleep(3)

    return kept
        
def analyze_sentiment(text):
    """Score ``text`` with TextBlob.

    Returns a ``(polarity, subjectivity)`` tuple, or ``(nan, nan)`` when
    TextBlob raises for any reason.
    """
    try:
        sentiment = TextBlob(text).sentiment
        result = (sentiment.polarity, sentiment.subjectivity)
    except Exception as e:
        print("sentiment error probabbly will never get this error though")
        result = (np.nan, np.nan)
    return result

def compute_controversy_score(upvote_ratio, num_comments):
    """Combine vote disagreement with comment volume into one score.

    The score is ``(1 - upvote_ratio) * log1p(num_comments)``; NaN is
    returned when the inputs cannot be used in that expression.
    """
    try:
        disagreement = 1 - upvote_ratio
        volume = np.log1p(num_comments)
        score = disagreement * volume
    except Exception:
        score = np.nan
    return score

def compile_data(reddit, subreddit_name, post_limit = 100, max_comments = 50):
    """Build a per-post table of metadata, comment sentiment and controversy.

    Args:
        reddit: Reddit client passed through to the fetch helpers.
        subreddit_name: Subreddit to sample.
        post_limit: Maximum number of posts to fetch.
        max_comments: Maximum number of top-level comments inspected per post.

    Returns:
        The posts DataFrame left-joined on ``post_id`` with the columns
        comments_text, sentiment_polarity, sentiment_subjectivity and
        controversy_score. If no posts were collected, the empty posts
        DataFrame is returned unchanged.
    """
    posts_df = fetch_reddit_posts(subreddit_name, limit = post_limit, reddit = reddit)

    # With no posts there is no "post_id" column on either side, so the merge
    # below would raise KeyError; return the empty frame instead.
    if posts_df.empty:
        print("no posts collected, nothing to analyze")
        return posts_df

    total_posts = len(posts_df)
    all_comments = []
    print("\ngetting comments and analyzing sentiment")
    for processed, (_, row) in enumerate(posts_df.iterrows(), start = 1):
        comments = get_comment_for_post(row["post_id"], reddit, max_comments = max_comments)
        joined_comments = " ".join(comments)
        polarity, subjectivity = analyze_sentiment(joined_comments)
        controversy = compute_controversy_score(row["upvote_ratio"], row["num_comments"])

        all_comments.append({
            "post_id": row["post_id"],
            "comments_text": joined_comments,
            "sentiment_polarity": polarity,
            "sentiment_subjectivity": subjectivity,
            "controversy_score": controversy
        })

        # Report real progress every 10 posts and once at the end.
        if processed % 10 == 0 or processed == total_posts:
            print(f"processed {processed}/{total_posts} posts")

        # Pause between posts to stay gentle on the Reddit API.
        time.sleep(1.5)

    comments_df = pd.DataFrame(all_comments)
    final_df = pd.merge(posts_df, comments_df, on = "post_id", how = "left")
    return final_df


def log_metadata(artist_name, n_posts, n_comments, duration):
    """Append one sampling record to ``sampling.log.csv``.

    The file is created when missing; otherwise it is read, the new row is
    appended, and the whole file is rewritten.
    """
    meta_path = "sampling.log.csv"
    record = {
        "artist": artist_name,
        "n_posts": n_posts,
        "n_comments": n_comments,
        "duration_sec": duration,
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    }
    frame = pd.DataFrame([record])

    if os.path.exists(meta_path):
        history = pd.read_csv(meta_path)
        frame = pd.concat([history, frame], ignore_index = True)

    frame.to_csv(meta_path, index = False)

def bootstrap_ci(data, n_resamples = 10000, ci = 95):
    """Percentile-bootstrap confidence interval for the mean of ``data``.

    Draws ``n_resamples`` resamples (with replacement, same size as the
    input) from NumPy's global random generator and returns the
    ``(lower, upper)`` percentiles of the resampled means for a ``ci``
    percent interval. Returns ``(nan, nan)`` for fewer than two values.
    """
    sample_size = len(data)
    if sample_size < 2:
        return (np.nan, np.nan)

    tail = (100 - ci) / 2
    draws = np.random.choice(data, (n_resamples, sample_size), replace = True)
    boot_means = draws.mean(axis = 1)
    return (np.percentile(boot_means, tail), np.percentile(boot_means, 100 - tail))



# Terms counted by determine_controversy() when deciding whether sampled
# example text looks like a controversy. Matching there is a plain substring
# test on lower-cased text, so short entries such as "over" or "kill" also
# match inside longer words (e.g. "cover", "skill").
KEYWORDS = [
    "drama", "cancelled", "racist", "assault", "lawsuit", "controversy", "apology",
    "accused", "kill", "murder", "over", "cooked", "horrible", "trash",
    "criticism", "disrespect", "problematic", "backlash", "allegation", "scandal"
]



# Load API credentials from config.json (relative path, so it is resolved
# against the current working directory). Runs at import time.
with open("config.json") as f:
    keys = json.load(f)

# Spotify client authenticated with the client-credentials flow.
# Not referenced by any function in the visible part of the file.
spotify = spotipy.Spotify(
    auth_manager = SpotifyClientCredentials(
        client_id = keys["SPOTIFY_CLIENT_ID"],
        client_secret = keys["SPOTIFY_CLIENT_SECRET"]
    )
)

# Module-level Reddit client; get_artist_sentiment() reads this global.
reddit = praw.Reddit(
    client_id = keys["REDDIT_CLIENT_ID"],
    client_secret = keys["REDDIT_CLIENT_SECRET"],
    user_agent = "music_sentiment_bot by u/Many-Lingonberry1688"
)

# Debug aid: prints the names of the loaded config entries (not their values).
print(keys.keys())

SOUNDCHARTS_KEY = keys["SOUNDCHARTS_API_KEY"]
BASE_URL = "https://api.soundcharts.com/api/v2" 

# Headers sent with every Soundcharts request.
# NOTE(review): confirm that Soundcharts accepts a Bearer token here; its API
# may expect dedicated app-id / api-key headers instead - verify against the
# official documentation.
headers = {
    "Authorization": f"Bearer {SOUNDCHARTS_KEY}"
}


# NOTE: the retry/backoff helpers below were written with AI assistance because the API kept failing or rate-limiting requests.
def get_artist_id(artist_name, max_retries=5):
    """Look up an artist's Soundcharts identifier by name.

    Sends a search request and returns the ``id`` of the first entry in the
    response's ``items`` list. HTTP 429 responses and any raised error are
    retried with a linearly growing wait (60 s and 10 s steps respectively).

    Args:
        artist_name: Search term sent as the ``q`` query parameter.
        max_retries: Total number of request attempts.

    Returns:
        The first match's id, or None when the search has no results or all
        attempts fail.
    """
    # NOTE(review): endpoint path and response shape are kept as written;
    # confirm both against the Soundcharts API documentation.
    url = f"{BASE_URL}/artist/search"
    parameters = {"q": artist_name}

    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            response = requests.get(url, headers=headers, params=parameters, timeout=10)
            if response.status_code == 429:
                if is_last_attempt:
                    # No retry will follow, so waiting would only delay the failure.
                    print(f"Rate limited when fetching ID for {artist_name}.")
                    break
                wait_time = 60 * (attempt + 1)
                print(f"Rate limited when fetching ID for {artist_name}, waiting {wait_time}s...")
                time.sleep(wait_time)
                continue
            response.raise_for_status()

            data = response.json()
            if "items" not in data or len(data["items"]) == 0:
                print(f"{artist_name} not found in Soundcharts.")
                return None

            return data["items"][0]["id"]

        except Exception as e:
            if is_last_attempt:
                print(f"Error getting artist ID for {artist_name}: {e}.")
                break
            wait_time = 10 * (attempt + 1)
            print(f"Error getting artist ID for {artist_name}: {e}. Retrying in {wait_time}s...")
            time.sleep(wait_time)

    print(f"Failed to get artist ID for {artist_name} after {max_retries} attempts.")
    return None


def historical_listener_data(artist_name, start_date, end_date, max_retries=5):
    """Fetch streaming-audience history for an artist from Soundcharts.

    Resolves the artist id via get_artist_id(), then requests the audience
    endpoint for the given window. HTTP 429 responses and any raised error
    are retried with a linearly growing wait (60 s and 10 s steps).

    Args:
        artist_name: Artist to look up.
        start_date: Value sent as the ``startDate`` query parameter.
        end_date: Value sent as the ``endDate`` query parameter.
        max_retries: Total number of request attempts for the audience call.

    Returns:
        A DataFrame built from the response's ``data`` list with added
        ``artist`` and ``retrieved_at`` columns, or None when the artist is
        unknown, no data is returned, or all attempts fail.
    """
    artist_id = get_artist_id(artist_name)
    if not artist_id:
        return None

    url = f"{BASE_URL}/artist/{artist_id}/audience/streaming"
    parameters = {"startDate": start_date, "endDate": end_date}

    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            response = requests.get(url, headers=headers, params=parameters, timeout=10)
            if response.status_code == 429:
                if is_last_attempt:
                    # No retry will follow, so waiting would only delay the failure.
                    print(f"Rate limited fetching data for {artist_name}.")
                    break
                wait_time = 60 * (attempt + 1)
                print(f"Rate limited fetching data for {artist_name}, waiting {wait_time}s...")
                time.sleep(wait_time)
                continue
            response.raise_for_status()

            data = response.json()
            if "data" not in data or len(data["data"]) == 0:
                print(f"No audience data for {artist_name}.")
                return None

            df = pd.DataFrame(data["data"])
            df["artist"] = artist_name
            df["retrieved_at"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            return df

        except Exception as e:
            if is_last_attempt:
                print(f"Error fetching data for {artist_name}: {e}.")
                break
            wait_time = 10 * (attempt + 1)
            print(f"Error fetching data for {artist_name}: {e}. Retrying in {wait_time}s...")
            time.sleep(wait_time)

    print(f"Failed to retrieve data for {artist_name} after {max_retries} attempts.")
    return None



                                                     
def get_artist_sentiment(artist_name, number_of_posts = 50, time_filter = "week"):
    """Sample Reddit posts/comments about an artist and score their spread.

    Searches a shuffled list of subreddits (up to 5 newest results each) for
    ``artist_name``, scores each relevant post title+body and up to three
    relevant top-level comments with TextBlob polarity, and derives a
    controversy score from the standard deviation of those polarities.

    A failure while reading one subreddit is reported and that subreddit is
    skipped, so data already gathered from other subreddits is kept.

    Args:
        artist_name: Search term; also used for the relevance check.
        number_of_posts: Upper bound on the number of posts scored.
        time_filter: Time window passed to the subreddit search.

    Returns:
        ``(score, examples)``. ``score`` is ``min(2 * std, 1.0)`` rounded to
        three decimals, or NaN when nothing was scored. ``examples`` always
        holds "positive" and "negative" text lists; when something was scored
        it also holds "n_posts", "n_comments" and "mean_sentiment".
    """
    start_time = time.time()
    sentiments = []
    examples = {"positive": [], "negative": []}
    processed_ids = set()
    seen_comments = set()
    n_posts = 0
    n_comments = 0

    subreddits = [
        "music", "popheads", "hiphopheads", "indieheads",
        "popculturechat", "GenZ", "morbidquestions", "askreddit",
        "hiphopcirclejerk", "blackpeopletwitter", "letstalkmusic"
    ]
    np.random.shuffle(subreddits)

    keywords = [
        "song", "album", "music", "track", "listen", "release",
        "concert", "lyrics", "vocals", "fanbase", "chart", "award",
        "cancel", "controversy", "drama", "problematic", "backlash",
        "scandal", "beef", "apology", "offend", "allegation", "lawsuit",
        "accused", "critic", "criticize"
    ]
    artist_lower = artist_name.lower()

    def is_relevant(text):
        # Relevant when the text names the artist or any music/controversy keyword.
        if not text:
            return False
        lower = text.lower()
        return artist_lower in lower or any(k in lower for k in keywords)

    for sub in subreddits:
        if n_posts >= number_of_posts:
            break

        # Isolate failures per subreddit: one private/banned/unreachable
        # subreddit must not throw away everything collected so far.
        try:
            for submission in reddit.subreddit(sub).search(
                artist_name, limit = 5, sort = "new", time_filter = time_filter
            ):
                # Enforce the cap inside the result loop as well.
                if n_posts >= number_of_posts:
                    break
                if submission.id in processed_ids:
                    continue
                processed_ids.add(submission.id)

                title_text = (submission.title or "") + " " + (getattr(submission, "selftext", "") or "")
                if not is_relevant(title_text):
                    continue

                try:
                    polarity = TextBlob(title_text).sentiment.polarity
                except Exception:
                    polarity = 0.0
                sentiments.append(polarity)
                n_posts += 1

                if polarity > 0.3 and len(examples["positive"]) < 2:
                    examples["positive"].append(submission.title)
                elif polarity < -0.3 and len(examples["negative"]) < 2:
                    examples["negative"].append(submission.title)

                # limit = 0 drops "load more comments" stubs without fetching them.
                submission.comments.replace_more(limit = 0)
                comment_count = 0

                for comment in submission.comments:
                    if comment_count >= 3:
                        break
                    body = getattr(comment, "body", "")
                    if not body or len(body.strip()) < 20 or body in seen_comments:
                        continue
                    if not is_relevant(body):
                        continue
                    seen_comments.add(body)

                    try:
                        cpol = TextBlob(body).sentiment.polarity
                    except Exception:
                        cpol = 0.0
                    sentiments.append(cpol)
                    n_comments += 1
                    if cpol > 0.3 and len(examples["positive"]) < 4:
                        examples["positive"].append(body[:200])
                    elif cpol < -0.3 and len(examples["negative"]) < 4:
                        examples["negative"].append(body[:200])
                    comment_count += 1
                    # Randomised pause between scored comments.
                    time.sleep(random.uniform(0.5, 1.5))
                time.sleep(0.1)
        except Exception as e:
            print(f"reddit api error: {e}")
            continue

    duration = round(time.time() - start_time, 2)
    log_metadata(artist_name, n_posts, n_comments, duration)

    if not sentiments:
        return np.nan, examples

    std_sentiment = float(np.std(sentiments))
    mean_sentiment = float(np.mean(sentiments))
    # Spread of opinions, doubled and capped at 1.0.
    controversy_score = min(std_sentiment * 2, 1.0)

    examples["n_posts"] = n_posts
    examples["n_comments"] = n_comments
    examples["mean_sentiment"] = mean_sentiment

    return round(controversy_score, 3), examples


def trend_detector(artist_name, timeframe = "today 12-m", max_retries = 5):
    """Find the date of peak Google Trends interest for an artist.

    Queries the module-level ``pytrends`` client and returns the most recent
    date on which interest equals its maximum over ``timeframe``. Failed
    requests are retried with a linearly growing wait (30 s steps).

    Args:
        artist_name: Search term.
        timeframe: Timeframe string passed to ``build_payload``.
        max_retries: Total number of attempts before giving up.

    Returns:
        The spike date as a ``YYYY-MM-DD`` string, or None when no data is
        available or every attempt fails.
    """
    for attempt in range(max_retries):
        try:
            print(f"detecting trend for {artist_name}")
            pytrends.build_payload([artist_name], cat = 0, timeframe = timeframe, geo ="", gprop ="")
            df = pytrends.interest_over_time()

            if df.empty:
                print(f"nothing found for {artist_name}, skipping")
                return None

            max_interest = df[artist_name].max()
            # If several dates share the maximum, take the latest one.
            spike_date = df[df[artist_name] == max_interest].index[-1]
            print(f" spike detected for {artist_name} on {spike_date.strftime('%Y-%m-%d')}")
            # Randomised cool-down before the caller issues the next query.
            time.sleep(random.uniform(15, 45))
            return spike_date.strftime("%Y-%m-%d")

        except Exception as e:
            print(f"error getting {artist_name} as {e}")
            if attempt == max_retries - 1:
                # No retry will follow, so waiting would only delay the failure.
                break
            wait_time = 30 * (attempt + 1)
            print(f"waiting {wait_time} before trying again probably blocked or sum")
            time.sleep(wait_time)

    print(f"failed to get data for {artist_name} even after multiple attempts")
    return None


def determine_controversy(artist_name, spike_date):
    """Label an artist's interest spike as "controversy" or "hype".

    Calls get_artist_sentiment() for the past week and applies threshold
    rules to its controversy score, mean sentiment, number of negative
    examples and the number of KEYWORDS found in the example texts.

    NOTE: ``spike_date`` is only used in the printed/logged summary; the
    sentiment sample always uses ``time_filter = "week"`` regardless of when
    the spike happened.

    Args:
        artist_name: Artist to classify.
        spike_date: Spike date string, reported in the output messages.

    Returns:
        "controversy" or "hype".
    """
    score, examples = get_artist_sentiment(artist_name, time_filter = "week")
    if not isinstance(examples, dict):
        examples = {}

    mean_sent = examples.get("mean_sentiment", 0.0)
    n_posts = examples.get("n_posts", 0)
    n_comments = examples.get("n_comments", 0)
    negative_list = examples.get("negative", [])
    positive_list = examples.get("positive", [])
    n_negative_examples = len(negative_list)

    # Count how many distinct keywords occur (as substrings) in the examples.
    joined_lower = " ".join(negative_list + positive_list).lower()
    keyword_count = sum(1 for k in KEYWORDS if k in joined_lower)

    SCORE_THRESH = 0.25
    KEYWORD_COUNT_THRESH = 2
    MEAN_SENT_THRESH = -0.12
    NEG_EXAMPLES_THRESH = 2

    # Comparisons with NaN are False, so a NaN score never counts as strong.
    score_is_number = isinstance(score, (int, float))
    is_strong_score = score_is_number and score >= SCORE_THRESH
    is_moderate_score = score_is_number and score >= (SCORE_THRESH * 0.6)
    is_keyword_heavy = (keyword_count >= KEYWORD_COUNT_THRESH)
    is_mean_negative = (isinstance(mean_sent, (int, float)) and mean_sent <= MEAN_SENT_THRESH)
    has_negative_examples = (n_negative_examples >= NEG_EXAMPLES_THRESH)

    if is_strong_score:
        label = "controversy"
    elif is_keyword_heavy:
        label = "controversy"
    elif is_mean_negative and (has_negative_examples or keyword_count >= 1):
        label = "controversy"
    elif has_negative_examples and is_moderate_score:
        label = "controversy"
    else:
        label = "hype"

    print(
        f" determine_controversy {artist_name} spike on {spike_date} "
        f"artist scores: score ={score} mean_sent={mean_sent:.3f} keywords={keyword_count} "
        f"artist examples: neg_examples = {n_negative_examples} n_posts ={n_posts} n_comments ={n_comments} => {label.upper()}"
    )

    logging.info(f"determine_controversy: {artist_name} {spike_date} label={label} score={score} "
                 f"mean_sent={mean_sent:.3f} keywords={keyword_count} neg_ex={n_negative_examples}")

    return label
    


def statistical_summary(csv_path = "artist_controversy_data.csv"):
    """Print a comparison of normalized listener change between labels.

    Reads the results CSV, computes ``difference / before_listeners`` per
    row, prints per-label descriptive statistics and, when both the "hype"
    and "controversy" groups have more than two usable values, a Welch
    t-test plus bootstrap confidence intervals of each group's mean.

    Args:
        csv_path: Path of the CSV; it must contain the columns "difference",
            "before_listeners" and "label".
    """
    df = pd.read_csv(csv_path)
    ratio = df["difference"] / df["before_listeners"]
    # A zero baseline yields +/-inf, which dropna() would keep and which
    # would turn the t-test and bootstrap results into NaN/inf.
    df["difference_norm"] = ratio.replace([np.inf, -np.inf], np.nan)

    print("\n normalized listening change by label:")
    print(df.groupby("label")["difference_norm"].describe(percentiles = [0.25, .5, .75]))

    hype = df[df["label"] == "hype"]["difference_norm"].dropna()
    controversy = df[df["label"] == "controversy"]["difference_norm"].dropna()

    if len(hype) > 2 and len(controversy) > 2:
        # equal_var = False selects Welch's t-test (unequal variances).
        t, p = ttest_ind(hype, controversy, equal_var = False)
        ci_hype = bootstrap_ci(hype)
        ci_cont = bootstrap_ci(controversy)
        print(f"t-test: T = {t:.3f}, p = {p:.5f}")
        print(f"Hype average plus or minus CI: {hype.mean():.3f} ({ci_hype[0]:.3f}, {ci_hype[1]:.3f})")
        print(f"Controversy average plus or minus CI: {controversy.mean():.3f} ({ci_cont[0]:.3f}, {ci_cont[1]:.3f})")
    else:
        print("lacking data")

def get_top_artists_lastfm(api_key, limit=100):
    """Return the names of Last.fm's current top-chart artists, shuffled.

    Args:
        api_key: Last.fm API key.
        limit: Number of artists requested from ``chart.gettopartists``.

    Returns:
        A list of artist names in random order (NumPy's global generator),
        or an empty list when the request fails or the response lacks the
        expected ``artists.artist`` structure.
    """
    # HTTPS keeps the API key out of clear-text traffic.
    url = "https://ws.audioscrobbler.com/2.0/"
    parameters = {
        "method": "chart.gettopartists",
        "api_key": api_key,
        "format": "json",
        "limit": limit
    }

    try:
        # The timeout prevents an unresponsive server from hanging the run.
        response = requests.get(url, params = parameters, timeout = 10)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"couldn't fetch top artists: {e}")
        return []

    if "artists" not in data or "artist" not in data["artists"]:
        print("couldn't fetch top artists")
        return []

    artist_names = [a["name"] for a in data["artists"]["artist"]]
    np.random.shuffle(artist_names)
    return artist_names

def antidupe_and_merge(new_df, save_path):
    """Merge new result rows into the CSV at ``save_path`` and save it.

    When the file already exists its rows are placed before ``new_df`` and
    rows sharing the same (artist, spike_date) pair are collapsed to the
    last occurrence, so new rows win. The combined frame is written back to
    ``save_path`` and returned.
    """
    if not os.path.exists(save_path):
        combined = new_df
    else:
        previous = pd.read_csv(save_path)
        stacked = pd.concat([previous, new_df], ignore_index = True)
        combined = stacked.drop_duplicates(subset=["artist", "spike_date"], keep = "last")

    combined.to_csv(save_path, index = False)

    print(f"information saved and removed duplicates to {save_path}")
    return combined

def export_to_google_sheets(df, sheet_name = "Music Economics Data"):
    """Append ``df`` below the existing rows of a Google Sheet.

    Authenticates with the service-account file named by the module-level
    ``google_credentials`` path, opens the first worksheet of the spreadsheet
    called ``sheet_name`` and writes ``df`` starting on the first row after
    the current content. Column headers are written only into an empty sheet.

    Args:
        df: DataFrame to write.
        sheet_name: Title of the target spreadsheet.

    Raises:
        ValueError: If no credentials path is configured.
    """
    json_path = google_credentials
    if not json_path:
        raise ValueError("GOOGLE_CREDENTIALS_PATH is not set; cannot export to Google Sheets")

    # Service-account credentials need explicit OAuth scopes; the Drive scope
    # is required to open a spreadsheet by its title.
    scopes = [
        "https://www.googleapis.com/auth/spreadsheets",
        "https://www.googleapis.com/auth/drive",
    ]
    creds = Credentials.from_service_account_file(json_path, scopes = scopes)
    client = gspread.authorize(creds)
    sheet = client.open(sheet_name).sheet1

    existing_records = len(sheet.get_all_values())
    # Rows are 1-indexed, so an empty sheet starts at row 1 and otherwise
    # writing begins directly below the last populated row.
    start_row = existing_records + 1

    set_with_dataframe(sheet, df, row = start_row, include_column_header = (existing_records == 0))
    print("changes applied to google sheets")




def main():
    """Run one sampling pass over up to ten artists and store the results.

    For each selected artist: detect the Google Trends spike date, label the
    spike as hype/controversy, fetch listener data for 30 days either side of
    the spike, compare mean listeners before and after, and sample Reddit
    sentiment. New rows are merged into the results CSV, summarized, and
    appended to the Google Sheet.
    """
    save_path = "artist_controversy_data.csv"
    api_key = keys["LASTFM_API_KEY"]

    if os.path.exists(save_path):
        existing_artists = set(pd.read_csv(save_path)["artist"])
    else:
        existing_artists = set()

    all_artists = get_top_artists_lastfm(api_key, limit = 100)
    # Seed artists are always candidates; chart artists only if not yet stored.
    new_artists = ["Kendrick Lamar", "d4vid", "Drake"]
    new_artists.extend(name for name in all_artists if name not in existing_artists)

    random.shuffle(new_artists)
    artists = new_artists[:10]

    all_results = []

    for artist in artists:

        spike_date = trend_detector(artist)
        if not spike_date:
            print(f"no trend found for {artist}, skipping")
            continue

        label = determine_controversy(artist, spike_date)
        if label == "hype":
            print(f"{artist} classified as hype")

        spike_dt = datetime.strptime(spike_date, "%Y-%m-%d")
        start_date = (spike_dt - timedelta(days=30)).strftime("%Y-%m-%d")
        end_date = (spike_dt + timedelta(days=30)).strftime("%Y-%m-%d")
        listener_df = historical_listener_data(artist, start_date, end_date)

        if listener_df is None or listener_df.empty:
            print(f"no listener data found for {artist}")
            continue

        date_col = "date" if "date" in listener_df.columns else "day"
        if date_col not in listener_df.columns or "value" not in listener_df.columns:
            print(f"unexpected listener data format for {artist}, skipping")
            continue

        # Dates are compared as strings against the YYYY-MM-DD spike date.
        before_mean = listener_df[listener_df[date_col] < spike_date]["value"].mean()
        after_mean = listener_df[listener_df[date_col] > spike_date]["value"].mean()

        if pd.isna(before_mean) or pd.isna(after_mean):
            print(f"incomplete listening data for {artist}, skipping")
            continue

        # A zero baseline would make percent_change infinite.
        if before_mean == 0:
            print(f"zero baseline listeners for {artist}, skipping")
            continue

        # Sample sentiment only once the listener data is known to be usable;
        # each call is slow and rate-limited.
        before_score, _ = get_artist_sentiment(artist, time_filter = "month")
        after_score, _ = get_artist_sentiment(artist, time_filter = "week")

        result = {
            "artist": artist,
            "spike_date": spike_date,
            "label": label,
            "before_listeners": before_mean,
            "after_listeners": after_mean,
            "before_score": before_score,
            "after_score": after_score,
            "difference": after_mean - before_mean,
            "percent_change": ((after_mean - before_mean) / before_mean) * 100,
            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }

        all_results.append(result)
        time.sleep(3)

    if not all_results:
        print("no new data from this. nothing to save or export.")
        return

    df = pd.DataFrame(all_results)
    antidupe_and_merge(df, save_path)
    statistical_summary(save_path)
    # The sheet export appends below existing rows, so send only this run's
    # rows; sending the full merged history would duplicate earlier rows.
    export_to_google_sheets(df)

    print(f"saved and exportedr {len(df)}")



# Run the pipeline only when this module is the entry point, not on import.
if __name__ == "__main__":
    main()
                                                        



dict_keys(['SPOTIFY_CLIENT_ID', 'SPOTIFY_CLIENT_SECRET', 'REDDIT_CLIENT_ID', 'REDDIT_CLIENT_SECRET', 'LASTFM_API_KEY', 'SOUNDCHARTS_API_KEY'])
detecting trend for Clairo
error getting Clairo as The request failed: Google returned a response with code 429
waiting 30 before trying again probably blocked or sum
detecting trend for Clairo
error getting Clairo as The request failed: Google returned a response with code 429
waiting 60 before trying again probably blocked or sum
detecting trend for Clairo
error getting Clairo as The request failed: Google returned a response with code 429
waiting 90 before trying again probably blocked or sum
detecting trend for Clairo
error getting Clairo as The request failed: Google returned a response with code 429
waiting 120 before trying again probably blocked or sum
detecting trend for Clairo
 spike detected for Clairo on 2025-04-13
 determine_controversy Clairo spike on 2025-04-13 artist scores: score =nan mean_sent=0.000 keywords=0 artist examples: 