# Script to match a Spotify song with a YouTube video

* Data needs to be in CSV format
* The `artist` column holds the list of all artists contributing to that song, written as a bracketed, comma-separated list
* In this example, the matched video is used to look up its YouTube view count

In [None]:
from __future__ import print_function    # (at top of module)
import warnings
#warnings.filterwarnings('always')
from spotipy.oauth2 import SpotifyClientCredentials
import json
import spotipy
import time
import sys
import csv
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
% matplotlib inline
plt.rcParams['figure.figsize'] = [10, 10]
from matplotlib.pyplot import figure
import math
import seaborn as sns
import config
import requests
import urllib

# Path of the input CSV. The cells below read the columns `song_id`,
# `song_title` and `artist` from it.
fileName = "google_cleaned_data.csv"

In [None]:
# Load the input CSV and keep only the first row seen for each `song_id`.
data = pd.read_csv(fileName, encoding='utf-8').drop_duplicates(
    subset=['song_id'],
    keep='first',
)

# Optional extra clean-up steps, disabled by default:
#   - de-duplicate on the song title as well
#data = data.drop_duplicates(subset=['song_title'], keep='first')
#   - drop rows with popularity below 50 or fewer than 1,000,000 streams
#indexNames = data[ (data['popularity'] < 50) | (data['total_no_streams'] < 1000000) ].index
#data.drop(indexNames , inplace=True)

print("Number of entries in original data after cleaning: " + str(data.shape[0]))

In [None]:
# Match every row of `data` to a YouTube video and store the video's title and
# view count in two new columns: `youtube_view_count`, `youtube_video_title`.
#
# For each row:
#   1. Search YouTube for "<song_title> <artists>" (top 5 results by relevance).
#   2. Fetch the view counts of all candidates with a single videos.list call.
#   3. Keep the candidate with the most views whose title contains the song's
#      core title (see `_title_core`).
#
# A failed or empty lookup is reported with a WARNING line and recorded as
# view count 0 / empty title, so one bad response does not abort the whole run
# and discard the results collected so far.
# TODO: Still some matching problems - how to get the most relevant video with
#       the most views from official sources?

YOUTUBE_SEARCH_URL = "https://www.googleapis.com/youtube/v3/search"
YOUTUBE_VIDEOS_URL = "https://www.googleapis.com/youtube/v3/videos"
# Arbitrary upper bound so a stalled connection cannot hang the loop forever.
REQUEST_TIMEOUT_SECONDS = 30


def _fetch_items(url, params):
    """Return the 'items' list of a YouTube Data API response, or [] on failure.

    Network errors, non-JSON bodies and responses without an 'items' key
    (e.g. API error payloads) are reported on stdout and yield an empty list.
    """
    try:
        payload = requests.get(url, params=params,
                               timeout=REQUEST_TIMEOUT_SECONDS).json()
    except (requests.RequestException, ValueError) as err:
        print("WARNING: request to " + url + " failed: " + repr(err))
        return []
    if not isinstance(payload, dict) or 'items' not in payload:
        details = payload.get('error') if isinstance(payload, dict) else payload
        print("WARNING: no 'items' in response from " + url + ": " + repr(details))
        return []
    return payload['items']


def _search_videos(query):
    """Return (video_id, video_title) pairs for the top 5 hits, most relevant first."""
    params = {"part": "snippet",
              "maxResults": 5,
              "order": "relevance",
              "pageToken": "",
              "q": query,
              "key": config.youtube_client_secret,
              "type": "video",
              }
    hits = []
    for item in _fetch_items(YOUTUBE_SEARCH_URL, params):
        video_id = item.get('id', {}).get('videoId')
        video_title = item.get('snippet', {}).get('title')
        # Skip malformed entries instead of raising KeyError.
        if video_id and video_title is not None:
            hits.append((video_id, video_title))
    return hits


def _view_counts(video_ids):
    """Return {video_id: view_count} for the given ids using one API request.

    Ids for which the API returns no entry or no 'viewCount' are omitted.
    """
    if not video_ids:
        return {}
    params = {"part": "statistics",
              # videos.list accepts a comma-separated list of video ids.
              "id": ",".join(video_ids),
              "key": config.youtube_client_secret,
              }
    counts = {}
    for item in _fetch_items(YOUTUBE_VIDEOS_URL, params):
        raw_count = item.get('statistics', {}).get('viewCount')
        if raw_count is not None:
            counts[item.get('id')] = int(raw_count)
    return counts


def _title_core(song_title):
    """Return the lower-cased part of a Spotify title used for matching.

    Everything from the first '(' or '-' onwards is dropped, e.g.
    "Eastside (feat. Elton John)" -> "eastside". If that leaves nothing
    (title starts with '(' or '-'), the whole title is used instead, because
    an empty string would be "contained" in every video title.
    """
    core = song_title.split('(', 1)[0].split('-', 1)[0].rstrip().lower()
    return core if core else song_title.strip().lower()


def _best_match(song_title, artists):
    """Return (view_count, video_title) of the best match, or (0, "") if none.

    The wanted video is the most-viewed version of the song, but it must be
    the same song. E.g. for "Ariana Grande - thank u, next" with results
        1. "Ariana Grande - thank u, next" (1 million views)
        2. "Ariana Grande - breathin"      (5 million views)
    the second one has more views but is rejected because "thank u, next" is
    not part of its title.
    """
    # Query = song title + artists, without commas and list brackets.
    query = (song_title.replace(',', "") + " "
             + artists.replace('[', "").replace(']', "").replace(',', ""))
    hits = _search_videos(query)
    counts = _view_counts([hit[0] for hit in hits])
    needle = _title_core(song_title)

    best_count, best_title = 0, ""
    # Iterate in search order so that ties keep the more relevant video.
    for video_id, video_title in hits:
        count = counts.get(video_id)
        if count is not None and count > best_count and needle in video_title.lower():
            best_count, best_title = count, video_title
    return best_count, best_title


youtube_view_counts = []
youtube_video_titles = []
total_data_length = len(data.index)

# For each row in the dataframe
for cur_index, (row_label, row) in enumerate(data.iterrows(), 1):

    # Progress indicator
    print(cur_index, "/", total_data_length)

    song_title = row['song_title']
    artists = row['artist']

    if pd.isnull(song_title):
        # A missing title cannot be searched for or matched.
        max_viewcount, max_viewcount_title = 0, ""
    else:
        if pd.isnull(artists):
            artists = ""
        max_viewcount, max_viewcount_title = _best_match(song_title, artists)

    # Keep the counts in a list
    youtube_view_counts.append(int(max_viewcount))

    # Keep titles in a list
    # Commas in the title would interfere with the CSV format so we get rid of
    # them, also [] because they are list notation
    youtube_video_titles.append(
        max_viewcount_title.replace(",", "").replace('[', '(').replace(']', ')'))

# Add the two new columns to the data
data['youtube_view_count'] = youtube_view_counts
data['youtube_video_title'] = youtube_video_titles

print()
print("Done getting data.")

In [None]:
# Write the table, including the `youtube_view_count` and `youtube_video_title`
# columns, to a UTF-8 CSV with a header row and without the index column.
data.to_csv(
    "data_try_algo_nn.csv",
    index=False,
    header=True,
    encoding="utf-8",
)