# Musicians - Hard

In [1]:
import findspark
import pandas as pd
findspark.init()

SVR = '192.168.31.31'
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import Window

sc = (SparkSession.builder.appName('app16-3') 
      .master(f'spark://{SVR}:7077') 
      .config('spark.sql.warehouse.dir', f'hdfs://{SVR}:9000/user/hive/warehouse') 
      .config('spark.cores.max', '4') 
      .config('spark.executor.instances', '1') 
      .config('spark.executor.cores', '2') 
      .config('spark.executor.memory', '10g') 
      .enableHiveSupport().getOrCreate())

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
band = sc.read.table('sqlzoo.band')
composer = sc.read.table('sqlzoo.composer')
composition = sc.read.table('sqlzoo.composition')
concert = sc.read.table('sqlzoo.concert')
has_composed = sc.read.table('sqlzoo.has_composed')
musician = sc.read.table('sqlzoo.musician')
performance = sc.read.table('sqlzoo.performance')
performer = sc.read.table('sqlzoo.performer')
place = sc.read.table('sqlzoo.place')
plays_in = sc.read.table('sqlzoo.plays_in')

## 11.
**List the name and town of birth of any performer born in the same city as James First.**

In [3]:
(musician.filter(musician['m_name'] != 'James First')
 .join(musician.filter(musician['m_name']=='James First')
       .join(place, on=(musician['born_in']==place['place_no']))
       .select('born_in', 'place_town'),
       on='born_in')
 .select('m_name', 'place_town')
 .orderBy('m_name')
 .toPandas())

                                                                                

Unnamed: 0,m_name,place_town
0,Alan Fluff,London
1,Andy Jones,London
2,Theo Mengel,London


## 12.
**Create a list showing for EVERY musician born in Britain the number of compositions and the number of instruments played.**

In [4]:
t = (musician
     .join(place.filter(place['place_country'].isin('England', 'Scotland')),
           on=(musician['born_in']==place['place_no']), how='left')
     .join(composer, on=(musician['m_no']==composer['comp_is']), how='left')
     .join(has_composed, on=(composer['comp_no']==has_composed['cmpr_no']), how='left')
     .join(composition, on=(has_composed['cmpn_no']==composition['c_no']), how='left')
     .groupBy('m_no', 'm_name')
     .agg(count('c_no').alias('n_composition')))
p = (musician.join(performer, on=(musician['m_no']==performer['perf_is']), how='left')
     .select('m_no', 'instrument')
     .distinct()
     .groupBy('m_no')
     .agg(count('instrument').alias('n_instrument')))
t.join(p, on='m_no').orderBy('m_name').toPandas()

Unnamed: 0,m_no,m_name,n_composition,n_instrument
0,14,Alan Fluff,0,2
1,19,Andy Jones,1,0
2,12,Davis Heavan,0,3
3,18,Elsie James,0,3
4,1,Fred Bloggs,2,0
5,17,Freda Miles,2,0
6,4,Harriet Smithson,0,2
7,8,Harry Forte,2,3
8,3,Helen Smyth,1,1
9,5,James First,4,1


## 13.
**Give the band name, conductor and contact of the bands performing at the most recent concert in the Royal Albert Hall.**

In [5]:
rah = (concert.filter(concert['concert_venue']=='Royal Albert Hall')
       .orderBy(col('con_date').desc())
       .limit(1))
(band
 .join(musician.withColumnRenamed('m_name', 'contact')
       .alias('mus1'), 
       on=(band['band_contact']==col('mus1.m_no')))
 .join(performance, on=(band['band_no']==performance['gave']))
 .join(musician.withColumnRenamed('m_name', 'conductor')
       .alias('mus2'),
       on=(performance['conducted_by']==col('mus2.m_no')))
 .join(rah, on=(performance['performed_in']==rah['concert_no']))
 .select('band_name', 'contact', 'conductor')
 .toPandas())

Unnamed: 0,band_name,contact,conductor
0,Somebody Loves this,Theo Mengel,Alan Fluff


## 14.
**Give a list of musicians associated with Glasgow. Include the name of the musician and the nature of the association - one or more of 'LIVES_IN', 'BORN_IN', 'PERFORMED_IN' AND 'IN_BAND_IN'.**

In [6]:
t1 = (musician.join(place, on=(musician['born_in']==place['place_no']))
      .select('m_name', 'place_town')
      .withColumn('assoc', lit('BORN_IN')))
t2 = (musician.join(place, on=(musician['living_in']==place['place_no']))
      .select('m_name', 'place_town')
      .withColumn('assoc', lit('LIVES_IN')))
t3 = (musician.join(performer, on=(musician['m_no']=='perf_is'))
      .join(plays_in, on=(performer['perf_no']==plays_in['player']))
      .join(performance, on=(plays_in['band_id']==performance['gave']))
      .join(concert, on=(performance['performed_in']==concert['concert_no']))
      .join(place, on=(concert['concert_in']==place['place_no']))
      .select('m_name', 'place_town')
      .withColumn('assoc', lit('PERFORMED_IN')))
t4 = (musician.join(plays_in, on=(musician['m_no']==plays_in['player']))
      .join(band, on=(plays_in['band_id']==band['band_no']))
      .join(place, on=(band['band_home']==place['place_no']))
      .select('m_name', 'place_town')
      .withColumn('assoc', lit('IN_BAND_IN')))
(t1.union(t2).union(t3).union(t4)
 .filter(col('place_town')=='Glasgow')
 .select('m_name', 'assoc')
 .distinct()
 .toPandas())

Unnamed: 0,m_name,assoc
0,Louise Simpson,BORN_IN
1,Steven Chaytors,BORN_IN
2,Lovely Time,BORN_IN
3,James Steeple,LIVES_IN
4,Andy Jones,LIVES_IN
5,Louise Simpson,LIVES_IN
6,Lovely Time,IN_BAND_IN
7,Freda Miles,IN_BAND_IN
8,Elsie James,IN_BAND_IN
9,Tony Smythe,IN_BAND_IN


## 15.
**Jeff Dawn plays in a band with someone who plays in a band with Sue Little. Who is it and what are the bands?**

In [7]:
sue_band = (musician.filter(musician['m_name']=='Sue Little')
            .join(performer, on=(musician['m_no']==performer['perf_is']))
            .join(plays_in, on=(performer['perf_no']==plays_in['player']))
            .join(band, on=(plays_in['band_id']==band['band_no']))
            .select('band_id', 'band_name'))
sue_coplayers = (musician
                 .join(performer, on=(musician['m_no']==performer['perf_is']))
                 .join(plays_in, on=(performer['perf_no']==plays_in['player']))
                 .join(sue_band, on='band_id'))
jeff_band = (musician.filter(musician['m_name']=='Jeff Dawn')
             .join(performer, on=(musician['m_no']==performer['perf_is']))
             .join(plays_in, on=(performer['perf_no']==plays_in['player']))
             .join(band, on=(plays_in['band_id']==band['band_no']))
             .select('band_id', 'band_name'))
jeff_coplayers = (musician
                 .join(performer, on=(musician['m_no']==performer['perf_is']))
                 .join(plays_in, on=(performer['perf_no']==plays_in['player']))
                 .join(jeff_band, on='band_id'))
(sue_coplayers.select(
    'm_no', col('band_name').alias('Sue\'s band'), 'm_name')
 .join(jeff_coplayers
       .select('m_no', col('band_name').alias('Jeff\'s band')), on='m_no')
 .select('m_name', 'Sue\'s band', 'Jeff\'s band')
 .toPandas())

Unnamed: 0,m_name,Sue's band,Jeff's band
0,John Smith,BBSO,AASO


In [8]:
sc.stop()