In [5]:
import json
import matplotlib.pyplot as plt
import numpy as np

# Load the JSON data from a file.
# The encoding is given explicitly: JSON files are UTF-8, while open()
# otherwise falls back to the platform's locale encoding, which can fail on
# (or garble) non-ASCII track names.
with open('youtubeLogs.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Function to group logs by track name and collect counts over time
def group_logs_and_collect_counts(data):
    """Group log entries by track name into time-ordered count series.

    Args:
        data: Iterable of dicts. Each item is read for ``'name'`` (falling
            back to ``'Unknown'``) and for ``'youtubeLog'``, a dict that
            should carry ``'logTime'`` and ``'primaryCounts'``.

    Returns:
        A dict mapping each track name to a list of
        ``(logTime, primaryCounts)`` tuples sorted by ``logTime`` ascending.
        Items whose log is missing, is not a dict, or lacks either key are
        skipped; a name with no usable log does not appear in the result.
    """
    series_by_name = {}
    for item in data:
        log = item.get('youtubeLog')
        # A missing or null log cannot contribute a data point; skipping it
        # here avoids a KeyError/TypeError later when sorting by 'logTime'.
        if not isinstance(log, dict):
            continue
        try:
            point = (log['logTime'], log['primaryCounts'])
        except KeyError:
            continue
        series_by_name.setdefault(item.get('name', 'Unknown'), []).append(point)

    # list.sort is stable, so records sharing a logTime keep their input order.
    for series in series_by_name.values():
        series.sort(key=lambda point: point[0])

    return series_by_name

# Function to remove records where count on day n+1 is smaller than on day n
def remove_invalid_records(counts_by_name):
    """Drop every record that is followed by a strictly smaller count.

    Args:
        counts_by_name: Dict mapping a track name to a list of
            ``(log_time, count)`` pairs.

    Returns:
        A new dict with the same keys. In each series a record is kept only
        when the record right after it has a count greater than or equal to
        its own; the final record of a non-empty series is always kept.
        Each record is compared with its original neighbour in a single
        pass, so the filtered series is not guaranteed to be non-decreasing.
    """
    result = {}
    for name, series in counts_by_name.items():
        kept = [
            (stamp, value)
            for (stamp, value), (_, following) in zip(series, series[1:])
            if following >= value
        ]
        # The last record has no successor to be compared with, so it is
        # carried over unchanged (nothing is added for an empty series).
        kept.extend(series[-1:])
        result[name] = kept

    return result

# Function to calculate gradient and report unusual changes
def analyze_gradients(counts_by_name, sigma_multiplier=6):
    """Flag records whose count growth rate is far above the track's norm.

    For each track with at least two records, the gradient of the count with
    respect to log time is computed with ``np.gradient``. A record is
    reported as unusual when its gradient exceeds
    ``mean(gradients) + sigma_multiplier * std(gradients)`` for that track.

    Args:
        counts_by_name: Dict mapping a track name to a list of
            ``(log_time, count)`` pairs ordered by ``log_time``.
        sigma_multiplier: Number of standard deviations above the mean
            gradient a record must exceed to be flagged. Defaults to 6.

    Returns:
        A ``(unusual_changes, track_probabilities)`` tuple.
        ``unusual_changes`` maps a track name to a list of
        ``(log_time, count, gradient)`` tuples for its flagged records;
        ``track_probabilities`` maps the same names to the fraction of the
        track's records that were flagged. Tracks without any flagged record
        (or with fewer than two records) appear in neither dict.

    Raises:
        ValueError: If a track's log times are not strictly increasing.
    """
    unusual_changes = {}
    track_probabilities = {}

    for name, counts in counts_by_name.items():
        if len(counts) < 2:
            # Not enough data to calculate gradients
            continue

        log_times = np.array([log_time for log_time, _ in counts])
        log_counts = np.array([count for _, count in counts])

        # Repeated timestamps must be rejected as well as decreasing ones:
        # a zero time step makes np.gradient divide by zero, and the
        # resulting inf/nan values would corrupt the mean and std below.
        if not np.all(np.diff(log_times) > 0):
            raise ValueError(f"Log times for track '{name}' are not strictly increasing.")

        gradients = np.gradient(log_counts, log_times)
        threshold = np.mean(gradients) + sigma_multiplier * np.std(gradients)

        unusual_points = [
            (log_times[i], log_counts[i], gradients[i])
            for i in np.flatnonzero(gradients > threshold)
        ]

        if unusual_points:
            unusual_changes[name] = unusual_points
            track_probabilities[name] = len(unusual_points) / len(counts)

    return unusual_changes, track_probabilities

# Function to plot stream counts and highlight unusual points for tracks with unusual changes
def plot_stream_counts_with_unusual_points(cleaned_counts_by_name, unusual_changes):
    """Save one line chart per flagged track with its unusual points marked.

    Args:
        cleaned_counts_by_name: Dict mapping a track name to a list of
            ``(log_time, count)`` pairs; must contain every name present in
            ``unusual_changes``.
        unusual_changes: Dict mapping a track name to a list of
            ``(log_time, count, gradient)`` tuples to highlight.

    Each track is written to ``youtube<N>.jpg`` in the current working
    directory, where N counts from 1 in the iteration order of
    ``unusual_changes``. Nothing is returned.
    """
    chart_number = 0
    for name, flagged in unusual_changes.items():
        chart_number += 1
        series = cleaned_counts_by_name[name]

        fig, ax = plt.subplots(figsize=(10, 5))

        # Full count history as a line
        ax.plot(
            [stamp for stamp, _ in series],
            [value for _, value in series],
            label='Stream Counts',
            color='blue',
        )

        # Flagged records drawn on top as red markers
        ax.scatter(
            [point[0] for point in flagged],
            [point[1] for point in flagged],
            color='red',
            label='Unusual Points',
        )

        ax.set_title(f'Stream Counts for {name}')
        ax.set_xlabel('Log Time')
        ax.set_ylabel('Stream Count')
        ax.legend()

        # One JPG per track; the figure is closed afterwards to free memory
        fig.savefig(f'youtube{chart_number}.jpg', format='jpg')
        plt.close(fig)

# Build, for every track name, a logTime-sorted list of (logTime, primaryCounts)
# pairs from the loaded JSON data
counts_by_name = group_logs_and_collect_counts(data)

# Clean the data: drop each record whose following record has a smaller count
cleaned_counts_by_name = remove_invalid_records(counts_by_name)

# Compute count-vs-time gradients on the cleaned series and collect, per track,
# the records whose gradient exceeds the track's threshold. Only tracks with
# at least one such record appear in either result.
unusual_changes, track_probabilities = analyze_gradients(cleaned_counts_by_name)

# Print one line per flagged track. The value labelled "fake stream
# probability" is the fraction of the track's records that were flagged.
for name, probability in track_probabilities.items():
    print(f"Track '{name}' has a fake stream probability of {probability:.2f}")

# Save a chart (youtube<N>.jpg) for each flagged track with its unusual
# records highlighted
plot_stream_counts_with_unusual_points(cleaned_counts_by_name, unusual_changes)


Track 'Back Door' has a fake stream probability of 0.02
Track 'Capital T' has a fake stream probability of 0.02
Track 'Deal' has a fake stream probability of 0.01
Track 'Gom o Goor' has a fake stream probability of 0.01
Track 'Hichvaght' has a fake stream probability of 0.01
Track 'LOVINOMORE' has a fake stream probability of 0.02
Track 'Maria' has a fake stream probability of 0.02
Track 'Merch' has a fake stream probability of 0.02
Track 'Nutella' has a fake stream probability of 0.01
Track 'Run' has a fake stream probability of 0.01
Track 'Tamoome' has a fake stream probability of 0.02
