**Spark notebook**

This notebook will only work in a Jupyter session running on `mathmadslinux2p`.

You can start your own Jupyter session on `mathmadslinux2p` and open this notebook in Chrome on the MADS Windows server by

1. Login to the MADS Windows server using https://mathportal.canterbury.ac.nz/.
2. Download or copy this notebook to your home directory.
3. Open powershell and run `ssh mathmadslinux2p`.
4. Run `start_pyspark_notebook` or `/opt/anaconda3/bin/jupyter-notebook --ip 132.181.129.68 --port $((8000 + $((RANDOM % 999))))`.
5. Copy / paste the url provided in the shell window into Chrome on the MADS Windows server.
6. Open the notebook from the Jupyter root directory (which is your home directory).
7. Run `start_spark()` to start a spark session in the notebook.
8. Run `stop_spark()` before closing the notebook or kill your spark application by hand using the link in the Spark UI.

In [1]:
# Run this cell to import pyspark and to define start_spark() and stop_spark()

import findspark

findspark.init()

import getpass
import pandas
import pyspark
import random
import re

from IPython.display import display, HTML
from pyspark import SparkContext
from pyspark.sql import SparkSession


# Functions used below

def username():
    """Get username with any domain information removed.
    """

    return re.sub('@.*', '', getpass.getuser())


def dict_to_html(d):
    """Convert a Python dictionary into a two column table for display.
    """

    html = []

    html.append(f'<table width="100%" style="width:100%; font-family: monospace;">')
    for k, v in d.items():
        html.append(f'<tr><td style="text-align:left;">{k}</td><td>{v}</td></tr>')
    html.append(f'</table>')

    return ''.join(html)


def show_as_html(df, n=20):
    """Leverage existing pandas jupyter integration to show a spark dataframe as html.
    
    Args:
        n (int): number of rows to show (default: 20)
    """

    display(df.limit(n).toPandas())

    
def display_spark():
    """Display the status of the active Spark session if one is currently running.
    """
    
    if 'spark' in globals() and 'sc' in globals():

        name = sc.getConf().get("spark.app.name")
        
        html = [
            f'<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:green">active</span></b>, look for <code>{name}</code> under the running applications section in the Spark UI.</p>',
            f'<ul>',
            f'<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>',
            f'<li><a href="{sc.uiWebUrl}" target="_blank">Spark Application UI</a></li>',
            f'</ul>',
            f'<p><b>Config</b></p>',
            dict_to_html(dict(sc.getConf().getAll())),
            f'<p><b>Notes</b></p>',
            f'<ul>',
            f'<li>The spark session <code>spark</code> and spark context <code>sc</code> global variables have been defined by <code>start_spark()</code>.</li>',
            f'<li>Please run <code>stop_spark()</code> before closing the notebook or restarting the kernel or kill <code>{name}</code> by hand using the link in the Spark UI.</li>',
            f'</ul>',
        ]
        display(HTML(''.join(html)))
        
    else:
        
        html = [
            f'<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:red">stopped</span></b>, confirm that <code>{username() + " (jupyter)"}</code> is under the completed applications section in the Spark UI.</p>',
            f'<ul>',
            f'<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>',
            f'</ul>',
        ]
        display(HTML(''.join(html)))


# Functions to start and stop spark

def start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1):
    """Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).
    
    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (float): worker memory (default: 1)
        master_memory (float): master memory (default: 1)
    """

    global spark
    global sc

    user = username()
    
    cores = executor_instances * executor_cores
    partitions = cores * 4
    port = 4000 + random.randint(1, 999)

    spark = (
        SparkSession.builder
        .master("spark://masternode2:7077")
        .config("spark.driver.extraJavaOptions", f"-Dderby.system.home=/tmp/{user}/spark/")
        .config("spark.dynamicAllocation.enabled", "false")
        .config("spark.executor.instances", str(executor_instances))
        .config("spark.executor.cores", str(executor_cores))
        .config("spark.cores.max", str(cores))
        .config("spark.executor.memory", f"{worker_memory}g")
        .config("spark.driver.memory", f"{master_memory}g")
        .config("spark.driver.maxResultSize", "0")
        .config("spark.sql.shuffle.partitions", str(partitions))
        .config("spark.ui.port", str(port))
        .appName(user + " (jupyter)")
        .getOrCreate()
    )
    sc = SparkContext.getOrCreate()
    
    display_spark()

    
def stop_spark():
    """Stop the active Spark session and delete globals for SparkSession (spark) and SparkContext (sc).
    """

    global spark
    global sc

    if 'spark' in globals() and 'sc' in globals():

        spark.stop()

        del spark
        del sc

    display_spark()


# Make css changes to improve spark output readability

html = [
    '<style>',
    'pre { white-space: pre !important; }',
    'table.dataframe td { white-space: nowrap !important; }',
    'table.dataframe thead th:first-child, table.dataframe tbody th { display: none; }',
    '</style>',
]
display(HTML(''.join(html)))


# Print function docstrings

help(start_spark)
help(stop_spark)
help(display_spark)
help(show_as_html)

Help on function start_spark in module __main__:

start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1)
    Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).
    
    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (float): worker memory (default: 1)
        master_memory (float): master memory (default: 1)

Help on function stop_spark in module __main__:

stop_spark()
    Stop the active Spark session and delete globals for SparkSession (spark) and SparkContext (sc).

Help on function display_spark in module __main__:

display_spark()
    Display the status of the active Spark session if one is currently running.

Help on function show_as_html in module __main__:

show_as_html(df, n=20)
    Leverage existing pandas jupyter integration to show a spark dataframe as html.
    
    Args:
        n 

In [84]:
# Run this cell to start a spark session in this notebook

start_spark(executor_instances=4, executor_cores=2, worker_memory=4, master_memory=4)

0,1
spark.dynamicAllocation.enabled,false
spark.executor.instances,4
spark.driver.memory,4g
spark.driver.extraJavaOptions,-Dderby.system.home=/tmp/ndu31/spark/
spark.executor.memory,4g
spark.app.id,app-20221031202254-0314
spark.master,spark://masternode2:7077
spark.executor.id,driver
spark.sql.warehouse.dir,file:/users/home/ndu31/spark-warehouse
spark.executor.cores,2


In [3]:
# Write your imports and code here or insert cells below

from pyspark.sql import functions as F
from pyspark.sql.types import *

In [None]:
#Question 1

In [28]:
!hdfs dfs -ls -R -h /data/msd/

drwxr-xr-x   - jsw93 supergroup          0 2021-09-29 10:35 /data/msd/audio
drwxr-xr-x   - jsw93 supergroup          0 2021-09-29 10:35 /data/msd/audio/attributes
-rw-r--r--   8 jsw93 supergroup      1.0 K 2021-09-29 10:35 /data/msd/audio/attributes/msd-jmir-area-of-moments-all-v1.0.attributes.csv
-rw-r--r--   8 jsw93 supergroup        671 2021-09-29 10:35 /data/msd/audio/attributes/msd-jmir-lpc-all-v1.0.attributes.csv
-rw-r--r--   8 jsw93 supergroup        484 2021-09-29 10:35 /data/msd/audio/attributes/msd-jmir-methods-of-moments-all-v1.0.attributes.csv
-rw-r--r--   8 jsw93 supergroup        898 2021-09-29 10:35 /data/msd/audio/attributes/msd-jmir-mfcc-all-v1.0.attributes.csv
-rw-r--r--   8 jsw93 supergroup        777 2021-09-29 10:35 /data/msd/audio/attributes/msd-jmir-spectral-all-all-v1.0.attributes.csv
-rw-r--r--   8 jsw93 supergroup        777 2021-09-29 10:35 /data/msd/audio/attributes/msd-jmir-spectral-derivatives-all-all-v1.0.attributes.csv
-rw-r--r--   8 jsw93 supergroup    

-rw-r--r--   8 jsw93 supergroup     80.5 M 2021-09-29 10:34 /data/msd/audio/features/msd-ssd-v1.0.csv/part-00000.csv.gz
-rw-r--r--   8 jsw93 supergroup     80.6 M 2021-09-29 10:34 /data/msd/audio/features/msd-ssd-v1.0.csv/part-00001.csv.gz
-rw-r--r--   8 jsw93 supergroup     80.5 M 2021-09-29 10:34 /data/msd/audio/features/msd-ssd-v1.0.csv/part-00002.csv.gz
-rw-r--r--   8 jsw93 supergroup     80.5 M 2021-09-29 10:34 /data/msd/audio/features/msd-ssd-v1.0.csv/part-00003.csv.gz
-rw-r--r--   8 jsw93 supergroup     80.5 M 2021-09-29 10:34 /data/msd/audio/features/msd-ssd-v1.0.csv/part-00004.csv.gz
-rw-r--r--   8 jsw93 supergroup     80.6 M 2021-09-29 10:34 /data/msd/audio/features/msd-ssd-v1.0.csv/part-00005.csv.gz
-rw-r--r--   8 jsw93 supergroup     80.5 M 2021-09-29 10:34 /data/msd/audio/features/msd-ssd-v1.0.csv/part-00006.csv.gz
-rw-r--r--   8 jsw93 supergroup     76.8 M 2021-09-29 10:34 /data/msd/audio/features/msd-ssd-v1.0.csv/part-00007.csv.gz
drwxr-xr-x   - jsw93 supergroup 

In [70]:
!hdfs dfs -du -h -v /data/msd/audio/features/

SIZE     DISK_SPACE_CONSUMED_WITH_ALL_REPLICAS  FULL_PATH_NAME
65.5 M   524.2 M                                /data/msd/audio/features/msd-jmir-area-of-moments-all-v1.0.csv
53.1 M   424.6 M                                /data/msd/audio/features/msd-jmir-lpc-all-v1.0.csv
35.8 M   286.5 M                                /data/msd/audio/features/msd-jmir-methods-of-moments-all-v1.0.csv
70.8 M   566.1 M                                /data/msd/audio/features/msd-jmir-mfcc-all-v1.0.csv
51.1 M   408.9 M                                /data/msd/audio/features/msd-jmir-spectral-all-all-v1.0.csv
51.1 M   408.9 M                                /data/msd/audio/features/msd-jmir-spectral-derivatives-all-all-v1.0.csv
412.2 M  3.2 G                                  /data/msd/audio/features/msd-marsyas-timbral-v1.0.csv
1.3 G    10.3 G                                 /data/msd/audio/features/msd-mvd-v1.0.csv
240.3 M  1.9 G                                  /data/msd/audio/features/msd-rh-v1.0

In [4]:
!hdfs dfs -ls /data/msd

Found 4 items
drwxr-xr-x   - jsw93 supergroup          0 2021-09-29 10:35 /data/msd/audio
drwxr-xr-x   - jsw93 supergroup          0 2021-09-29 10:35 /data/msd/genre
drwxr-xr-x   - jsw93 supergroup          0 2021-09-29 10:28 /data/msd/main
drwxr-xr-x   - jsw93 supergroup          0 2021-09-29 10:35 /data/msd/tasteprofile


In [5]:
!hdfs dfs -du -h -v /data/

SIZE     DISK_SPACE_CONSUMED_WITH_ALL_REPLICAS  FULL_PATH_NAME
279.9 M  2.2 G                                  /data/crime
143.8 M  1.1 G                                  /data/fraud
15.8 G   126.5 G                                /data/ghcnd
1.9 K    7.5 K                                  /data/helloworld
1.2 G    9.4 G                                  /data/ml
12.9 G   103.5 G                                /data/msd
3.7 M    29.3 M                                 /data/openflights
19.1 M   152.9 M                                /data/shakespeare
0        0                                      /data/temp


In [6]:
!hdfs dfs -du -h -v /data/msd

SIZE     DISK_SPACE_CONSUMED_WITH_ALL_REPLICAS  FULL_PATH_NAME
12.3 G   98.1 G                                 /data/msd/audio
30.1 M   241.0 M                                /data/msd/genre
174.4 M  1.4 G                                  /data/msd/main
490.4 M  3.8 G                                  /data/msd/tasteprofile


In [10]:
!hdfs dfs -du -h -v /data/msd/audio/attributes

SIZE    DISK_SPACE_CONSUMED_WITH_ALL_REPLICAS  FULL_PATH_NAME
1.0 K   8.2 K                                  /data/msd/audio/attributes/msd-jmir-area-of-moments-all-v1.0.attributes.csv
671     5.2 K                                  /data/msd/audio/attributes/msd-jmir-lpc-all-v1.0.attributes.csv
484     3.8 K                                  /data/msd/audio/attributes/msd-jmir-methods-of-moments-all-v1.0.attributes.csv
898     7.0 K                                  /data/msd/audio/attributes/msd-jmir-mfcc-all-v1.0.attributes.csv
777     6.1 K                                  /data/msd/audio/attributes/msd-jmir-spectral-all-all-v1.0.attributes.csv
777     6.1 K                                  /data/msd/audio/attributes/msd-jmir-spectral-derivatives-all-all-v1.0.attributes.csv
12.0 K  96.2 K                                 /data/msd/audio/attributes/msd-marsyas-timbral-v1.0.attributes.csv
9.8 K   78.0 K                                 /data/msd/audio/attributes/msd-mvd-v1.0.attri

In [18]:
!hdfs dfs -cat /data/msd/audio/attributes/msd-jmir-area-of-moments-all-v1.0.attributes.csv | head -n3
!hdfs dfs -cat /data/msd/audio/attributes/msd-jmir-lpc-all-v1.0.attributes.csv | head -n3
!hdfs dfs -cat /data/msd/audio/attributes/msd-jmir-methods-of-moments-all-v1.0.attributes.csv | head -n3
!hdfs dfs -cat /data/msd/audio/attributes/msd-jmir-mfcc-all-v1.0.attributes.csv | head -n3
!hdfs dfs -cat /data/msd/audio/attributes/msd-jmir-spectral-all-all-v1.0.attributes.csv | head -n3
!hdfs dfs -cat /data/msd/audio/attributes/msd-jmir-spectral-derivatives-all-all-v1.0.attributes.csv | head -n3
!hdfs dfs -cat /data/msd/audio/attributes/msd-marsyas-timbral-v1.0.attributes.csv | head -n3
!hdfs dfs -cat /data/msd/audio/attributes/msd-mvd-v1.0.attributes.csv | head -n3
!hdfs dfs -cat /data/msd/audio/attributes/msd-rh-v1.0.attributes.csv | head -n3
!hdfs dfs -cat /data/msd/audio/attributes/msd-rp-v1.0.attributes.csv | head -n3
!hdfs dfs -cat /data/msd/audio/attributes/msd-ssd-v1.0.attributes.csv | head -n3
!hdfs dfs -cat /data/msd/audio/attributes/msd-trh-v1.0.attributes.csv | head -n3
!hdfs dfs -cat /data/msd/audio/attributes/msd-tssd-v1.0.attributes.csv | head -n3

Area_Method_of_Moments_Overall_Standard_Deviation_1,real
Area_Method_of_Moments_Overall_Standard_Deviation_2,real
Area_Method_of_Moments_Overall_Standard_Deviation_3,real
LPC_Overall_Standard_Deviation_1,real
LPC_Overall_Standard_Deviation_2,real
LPC_Overall_Standard_Deviation_3,real
Method_of_Moments_Overall_Standard_Deviation_1,real
Method_of_Moments_Overall_Standard_Deviation_2,real
Method_of_Moments_Overall_Standard_Deviation_3,real
MFCC_Overall_Standard_Deviation_1,real
MFCC_Overall_Standard_Deviation_2,real
MFCC_Overall_Standard_Deviation_3,real
Spectral_Centroid_Overall_Standard_Deviation_1,real
Spectral_Rolloff_Point_Overall_Standard_Deviation_1,real
Spectral_Flux_Overall_Standard_Deviation_1,real
Spectral_Centroid_Overall_Standard_Deviation_1,real
Spectral_Rolloff_Point_Overall_Standard_Deviation_1,real
Spectral_Flux_Overall_Standard_Deviation_1,real
Mean_Acc5_Mean_Mem20_ZeroCrossings_HopSize512_WinSize512_Sum_AudioCh0,real
Mean_Acc5_Mean_Mem20_Centroid_Power_powerFFT_WinHammi

In [26]:
!hdfs dfs -du -h -v /data/msd/audio/features/msd-jmir-area-of-moments-all-v1.0.csv

SIZE   DISK_SPACE_CONSUMED_WITH_ALL_REPLICAS  FULL_PATH_NAME
8.2 M  65.9 M                                 /data/msd/audio/features/msd-jmir-area-of-moments-all-v1.0.csv/part-00000.csv.gz
8.2 M  65.9 M                                 /data/msd/audio/features/msd-jmir-area-of-moments-all-v1.0.csv/part-00001.csv.gz
8.2 M  65.9 M                                 /data/msd/audio/features/msd-jmir-area-of-moments-all-v1.0.csv/part-00002.csv.gz
8.2 M  65.9 M                                 /data/msd/audio/features/msd-jmir-area-of-moments-all-v1.0.csv/part-00003.csv.gz
8.2 M  65.9 M                                 /data/msd/audio/features/msd-jmir-area-of-moments-all-v1.0.csv/part-00004.csv.gz
8.2 M  65.9 M                                 /data/msd/audio/features/msd-jmir-area-of-moments-all-v1.0.csv/part-00005.csv.gz
8.2 M  65.9 M                                 /data/msd/audio/features/msd-jmir-area-of-moments-all-v1.0.csv/part-00006.csv.gz
7.9 M  63.0 M                             

In [73]:
!hdfs dfs -cat /data/msd/audio/features/msd-mvd-v1.0.csv/part-00000.csv.gz |zcat | head -n10

% created mar/april 2012 by Vienna University of Technology (htpp://www.ifs.tuwien.ac.at/mir)
% feature vector version 1.0
% EXTRACTOR Matlab rp_extract v 0.6411 by tml
0.335626,0.293309,0.269617,0.274498,0.27573,0.136495,0.172581,0.138023,0.122016,0.117078,0.14321,0.122968,0.120988,0.119766,0.088223,0.104574,0.097663,0.096552,0.094822,0.145181,0.13806,0.221093,0.212537,0.103136,0.09823,0.09391,0.092932,0.102117,0.081064,0.089218,0.091501,0.082726,0.068686,0.10342,0.074859,0.071769,0.083853,0.083103,0.072527,0.083324,0.082416,0.093413,0.101421,0.178391,0.311757,0.154964,0.086146,0.088218,0.080622,0.058603,0.072397,0.077172,0.064931,0.070569,0.077536,0.074917,0.067045,0.060431,0.066273,0.057028,0.0844,0.057521,0.034129,0.052506,0.03966,0.007516,0.015773,0.007397,0.006701,0.008612,0.009319,0.005393,0.007201,0.007596,0.003088,0.006249,0.004489,0.004243,0.003426,0.00684,0.007613,0.036646,0.026974,0.005504,0.003797,0.004725,0.004632,0.004081,0.003407,0.003504,0.003481,0.00212,0.002255,0.

In [44]:
!hdfs dfs -cat /data/msd/audio/statistics/sample_properties.csv.gz | zcat | head -n3

track_id,title,artist_name,duration,7digita_Id,sample_bitrate,sample_length,sample_rate,sample_mode,sample_version,filesize
TRMMMYQ128F932D901,"Silent Night","Faster Pussy cat",252.05506,7032331,128,60.1935770567,22050,1,2,960887
TRMMMKD128F425225D,"Tanssi vaan",Karkkiautomaatti,156.55138,1514808,64,30.2244270016,22050,1,2,242038

gzip: stdout: Broken pipe
cat: Unable to write to output stream.


In [38]:
!hdfs dfs -du -h -v /data/msd/genre
!hdfs dfs -cat /data/msd/genre/msd-MAGD-genreAssignment.tsv | head -n3
!hdfs dfs -cat /data/msd/genre/msd-MASD-styleAssignment.tsv | head -n3
!hdfs dfs -cat /data/msd/genre/msd-topMAGD-genreAssignment.tsv | head -n3

SIZE    DISK_SPACE_CONSUMED_WITH_ALL_REPLICAS  FULL_PATH_NAME
11.1 M  88.7 M                                 /data/msd/genre/msd-MAGD-genreAssignment.tsv
8.4 M   67.3 M                                 /data/msd/genre/msd-MASD-styleAssignment.tsv
10.6 M  85.0 M                                 /data/msd/genre/msd-topMAGD-genreAssignment.tsv
TRAAAAK128F9318786	Pop_Rock
TRAAAAV128F421A322	Pop_Rock
TRAAAAW128F429D538	Rap
cat: Unable to write to output stream.
TRAAAAK128F9318786	Metal_Alternative
TRAAAAV128F421A322	Punk
TRAAAAW128F429D538	Hip_Hop_Rap
cat: Unable to write to output stream.
TRAAAAK128F9318786	Pop_Rock
TRAAAAV128F421A322	Pop_Rock
TRAAAAW128F429D538	Rap
cat: Unable to write to output stream.


In [40]:
!hdfs dfs -cat /data/msd/main/summary/analysis.csv.gz | zcat | head -n3

analysis_sample_rate,audio_md5,danceability,duration,end_of_fade_in,energy,idx_bars_confidence,idx_bars_start,idx_beats_confidence,idx_beats_start,idx_sections_confidence,idx_sections_start,idx_segments_confidence,idx_segments_loudness_max,idx_segments_loudness_max_time,idx_segments_loudness_start,idx_segments_pitches,idx_segments_start,idx_segments_timbre,idx_tatums_confidence,idx_tatums_start,key,key_confidence,loudness,mode,mode_confidence,start_of_fade_out,tempo,time_signature,time_signature_confidence,track_id
22050,aee9820911781c734e7694c5432990ca,0.0,252.05506,2.049,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0.777,-4.829,0,0.688,236.635,87.002,4,0.94,TRMMMYQ128F932D901
22050,ed222d07c83bac7689d52753610a513a,0.0,156.55138,0.258,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0.808,-10.555,1,0.355,148.66,150.778,1,0.0,TRMMMKD128F425225D

gzip: stdout: Broken pipe
cat: Unable to write to output stream.


In [41]:
!hdfs dfs -cat /data/msd/main/summary/metadata.csv.gz | zcat | head -n3


analyzer_version,artist_7digitalid,artist_familiarity,artist_hotttnesss,artist_id,artist_latitude,artist_location,artist_longitude,artist_mbid,artist_name,artist_playmeid,genre,idx_artist_terms,idx_similar_artists,release,release_7digitalid,song_hotttnesss,song_id,title,track_7digitalid
,4069,0.6498221002008776,0.3940318927141434,ARYZTJS1187B98C555,,,,357ff05d-848a-44cf-b608-cb34b5701ae5,Faster Pussy cat,44895,,0,0,Monster Ballads X-Mas,633681,0.5428987432910862,SOQMMHC12AB0180CB8,Silent Night,7032331
,113480,0.4396039666767154,0.3569921077564064,ARMVN3U1187FB3A1EB,,,,8d7ef530-a6fd-4f8f-b2e2-74aec765e0f9,Karkkiautomaatti,-1,,0,0,Karkuteillä,145266,0.2998774882739778,SOVFVAK12A8C1350D9,Tanssi vaan,1514808

gzip: stdout: Broken pipe
cat: Unable to write to output stream.


In [81]:
!hdfs dfs -cat /data/msd/tasteprofile/mismatches/sid_matches_manually_accepted.txt | head -n10

!hdfs dfs -cat /data/msd/tasteprofile/mismatches/sid_mismatches.txt | head -n3 

9d8
< ERROR: <SOFQHZM12A8C142342 TRMWMFG128F92FFEF2> Josipa Lisac  -  razloga  !=  Lisac Josipa  -  1000 razloga
19d17
< ERROR: <SODXUTF12AB018A3DA TRMWPCD12903CCE5ED> Lutan Fyah  -  Nuh Matter the Crisis Feat. Midnite  !=  Midnite  -  Nah Matter the Crisis
29d26
< ERROR: <SOASCRF12A8C1372E6 TRMHIPJ128F426A2E2> Gaetano Donizetti  -  L'Elisir d'Amore: Act Two: Come sen va contento!  !=  Gianandrea Gavazzeni_ Orchestra E Coro Del Maggio Musicale Fiorentino_ Carlo Bergonzi_ Renata Scotto  -  L'Elisir D'Amore_ Act 2: Come Sen Va Contento (Adina) (Donizetti)
33d29
< ERROR: <SOITDUN12A58A7AACA TRMHXGK128F42446AB> C.J. Chenier  -  Ay, Ai Ai  !=  Clifton Chenier  -  Ay_ Ai Ai
52d47
< ERROR: <SOLZXUM12AB018BE39 TRMRSOF12903CCF516> 許志安  -  男人最痛  !=  Andy Hui  -  Nan Ren Zui Tong
cat: Unable to write to output stream.
ERROR: <SOUMNSI12AB0182807 TRMMGKQ128F9325E10> Digital Underground  -  The Way We Swing  !=  Linkwood  -  Whats up with the Underground
ERROR: <SOCMRBE12AB018C546 TRMMREB12903CEB1B1

In [48]:
!hdfs dfs -du -h -v /data/msd/tasteprofile/triplets.tsv/


SIZE    DISK_SPACE_CONSUMED_WITH_ALL_REPLICAS  FULL_PATH_NAME
61.1 M  488.4 M                                /data/msd/tasteprofile/triplets.tsv/part-00000.tsv.gz
61.1 M  488.6 M                                /data/msd/tasteprofile/triplets.tsv/part-00001.tsv.gz
61.1 M  488.9 M                                /data/msd/tasteprofile/triplets.tsv/part-00002.tsv.gz
61.1 M  489.1 M                                /data/msd/tasteprofile/triplets.tsv/part-00003.tsv.gz
61.0 M  488.3 M                                /data/msd/tasteprofile/triplets.tsv/part-00004.tsv.gz
61.1 M  488.7 M                                /data/msd/tasteprofile/triplets.tsv/part-00005.tsv.gz
61.1 M  488.8 M                                /data/msd/tasteprofile/triplets.tsv/part-00006.tsv.gz
60.8 M  486.7 M                                /data/msd/tasteprofile/triplets.tsv/part-00007.tsv.gz


In [52]:
!hdfs dfs -cat /data/msd/audio/attributes/* | wc -l

3929


In [51]:
!hdfs dfs -cat /data/msd/audio/attributes/msd-jmir-area-of-moments-all-v1.0.attributes.csv | wc -l
!hdfs dfs -cat /data/msd/audio/attributes/msd-jmir-lpc-all-v1.0.attributes.csv | wc -l
!hdfs dfs -cat /data/msd/audio/attributes/msd-jmir-methods-of-moments-all-v1.0.attributes.csv | wc -l
!hdfs dfs -cat /data/msd/audio/attributes/msd-jmir-mfcc-all-v1.0.attributes.csv | wc -l
!hdfs dfs -cat /data/msd/audio/attributes/msd-jmir-spectral-all-all-v1.0.attributes.csv | wc -l
!hdfs dfs -cat /data/msd/audio/attributes/msd-jmir-spectral-derivatives-all-all-v1.0.attributes.csv | wc -l
!hdfs dfs -cat /data/msd/audio/attributes/msd-marsyas-timbral-v1.0.attributes.csv | wc -l
!hdfs dfs -cat /data/msd/audio/attributes/msd-mvd-v1.0.attributes.csv | wc -l
!hdfs dfs -cat /data/msd/audio/attributes/msd-rh-v1.0.attributes.csv | wc -l
!hdfs dfs -cat /data/msd/audio/attributes/msd-rp-v1.0.attributes.csv | wc -l
!hdfs dfs -cat /data/msd/audio/attributes/msd-ssd-v1.0.attributes.csv | wc -l
!hdfs dfs -cat /data/msd/audio/attributes/msd-trh-v1.0.attributes.csv | wc -l
!hdfs dfs -cat /data/msd/audio/attributes/msd-tssd-v1.0.attributes.csv | wc -l

21
21
11
27
17
17
125
421
61
1441
169
421
1177


In [87]:

!hdfs dfs -cat /data/msd/audio/features/msd-jmir-area-of-moments-all-v1.0.csv/* | zcat | wc -l
!hdfs dfs -cat /data/msd/audio/features/msd-jmir-lpc-all-v1.0.csv/* | zcat | wc -l
!hdfs dfs -cat /data/msd/audio/features/msd-jmir-methods-of-moments-all-v1.0.csv/* | zcat | wc -l
!hdfs dfs -cat /data/msd/audio/features/msd-jmir-mfcc-all-v1.0.csv/* | zcat | wc -l
!hdfs dfs -cat /data/msd/audio/features/msd-jmir-spectral-all-all-v1.0.csv/* | zcat | wc -l
!hdfs dfs -cat /data/msd/audio/features/msd-jmir-spectral-derivatives-all-all-v1.0.csv/* | zcat | wc -l
!hdfs dfs -cat /data/msd/audio/features/msd-marsyas-timbral-v1.0.csv/* | zcat | wc -l
!hdfs dfs -cat /data/msd/audio/features/msd-mvd-v1.0.csv/* | zcat | wc -l
!hdfs dfs -cat /data/msd/audio/features/msd-rh-v1.0.csv/* | zcat | wc -l
!hdfs dfs -cat /data/msd/audio/features/msd-rp-v1.0.csv/* | zcat | wc -l
!hdfs dfs -cat /data/msd/audio/features/msd-ssd-v1.0.csv/* | zcat | wc -l
!hdfs dfs -cat /data/msd/audio/features/msd-trh-v1.0.csv/* | zcat | wc -l
!hdfs dfs -cat /data/msd/audio/features/msd-tssd-v1.0.csv/* | zcat | wc -l


994623
994623
994623
994623
994623
994623
995001
994188
994188
994188
994188
994188
994188


In [61]:
!hdfs dfs -cat /data/msd/audio/statistics/sample_properties.csv.gz | zcat | wc -l

992866


In [63]:
!hdfs dfs -cat /data/msd/genre/msd-MAGD-genreAssignment.tsv | wc -l
!hdfs dfs -cat /data/msd/genre/msd-MASD-styleAssignment.tsv | wc -l
!hdfs dfs -cat /data/msd/genre/msd-topMAGD-genreAssignment.tsv | wc -l

422714
273936
406427


In [67]:
!hdfs dfs -cat /data/msd/main/summary/analysis.csv.gz | zcat | wc -l

!hdfs dfs -cat /data/msd/main/summary/metadata.csv.gz | zcat | wc -l

1000001
1000001


In [68]:
!hdfs dfs -cat /data/msd/tasteprofile/mismatches/sid_matches_manually_accepted.txt | wc -l


!hdfs dfs -cat /data/msd/tasteprofile/mismatches/sid_mismatches.txt | wc -l


!hdfs dfs -cat /data/msd/tasteprofile/triplets.tsv/* | zcat | wc -l

938
19094
48373586


In [88]:
# Run this cell before closing the notebook or kill your spark application by hand using the link in the Spark UI

stop_spark()