# Musicians - Medium

In [1]:
# Notebook setup: imports, cluster address, and the shared SparkSession (`sc`).
# findspark.init() is deliberately called before the pyspark imports below so
# that the pyspark package is importable in this kernel.
import findspark
import pandas as pd
findspark.init()

SVR = '192.168.31.31'
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import Window

# Spark settings, applied to the builder in the order listed.
spark_options = {
    'spark.sql.warehouse.dir': f'hdfs://{SVR}:9000/user/hive/warehouse',
    'spark.cores.max': '4',
    'spark.executor.instances': '1',
    'spark.executor.cores': '2',
    'spark.executor.memory': '10g',
}

session_builder = SparkSession.builder.appName('app16-2').master(f'spark://{SVR}:7077')
for option_name, option_value in spark_options.items():
    session_builder = session_builder.config(option_name, option_value)

# NOTE: `sc` is a SparkSession (not a SparkContext); every later cell uses this name.
sc = session_builder.enableHiveSupport().getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
def read_hive_table(qualified_name):
    """Return the Hive table `qualified_name` (e.g. 'sqlzoo.band') as a Spark DataFrame."""
    return sc.read.table(qualified_name)


# People and places: musician.living_in -> place.place_no.
musician = read_hive_table('sqlzoo.musician')
place = read_hive_table('sqlzoo.place')

# Playing: performer.perf_is -> musician.m_no; plays_in links performer.perf_no
# (plays_in.player) to band.band_no (plays_in.band_id).
performer = read_hive_table('sqlzoo.performer')
plays_in = read_hive_table('sqlzoo.plays_in')
band = read_hive_table('sqlzoo.band')

# Composing: composer.comp_is -> musician.m_no; has_composed links
# composer.comp_no (cmpr_no) to composition.c_no (cmpn_no).
composer = read_hive_table('sqlzoo.composer')
has_composed = read_hive_table('sqlzoo.has_composed')
composition = read_hive_table('sqlzoo.composition')

# Performing: performance.gave -> band.band_no, performance.performed ->
# composition.c_no, performance.conducted_by -> musician.m_no.
performance = read_hive_table('sqlzoo.performance')
concert = read_hive_table('sqlzoo.concert')

## 6.
**List the names, dates of birth and the instrument played of living musicians who play an instrument which Theo also plays.**

In [3]:
# Q6: living musicians (other than Theo) who play an instrument Theo also plays.

# Instruments played by any musician whose name starts with 'Theo'.
# De-duplicated so the join below cannot multiply rows when the same instrument
# appears on more than one of Theo's performer rows.
theo_instruments = (
    musician.filter(musician['m_name'].like('Theo%'))
    .join(performer, on=(musician['m_no'] == performer['perf_is']))
    .select('instrument')
    .distinct()
)

(musician.filter((musician['died'].isNull()) &
                 (~musician['m_name'].like('Theo%')))
 .join(performer, on=(musician['m_no'] == col('perf_is')))
 .join(theo_instruments, on='instrument')
 # Keep m_no while de-duplicating so two different musicians who happen to
 # share a name and birth date are not merged; a musician with several
 # performer rows for the same instrument is still listed only once.
 .select('m_no', 'm_name', 'born', 'instrument')
 .distinct()
 .select('m_name', 'born', 'instrument')
 # Secondary sort key makes the order of one musician's instruments deterministic.
 .orderBy('m_name', 'instrument')
 .toPandas())

                                                                                

Unnamed: 0,m_name,born,instrument
0,Harry Forte,1951-02-28,drums
1,Harry Forte,1951-02-28,violin
2,James First,1965-06-10,violin
3,Jeff Dawn,1945-12-12,violin
4,John Smith,1950-03-03,violin


## 7.
**List the name and the number of players for each band whose number of players is greater than the average number of players per band.**

In [4]:
# Q7: bands whose head-count exceeds the average head-count over all bands.

# One row per (band, musician): a musician holding several performer rows in
# the same band is counted once.
band_members = (
    band.join(plays_in, on=(band['band_no'] == plays_in['band_id']))
    .join(performer, on=(plays_in['player'] == performer['perf_no']))
    .select('band_name', 'perf_is')
    .distinct()
)

band_sizes = band_members.groupBy('band_name').agg(count('perf_is').alias('n_mbr'))

# Partitioning by a constant puts every band in one window, so avg() is the
# overall average repeated on each row.
all_bands_window = Window.partitionBy(lit(0))

(band_sizes
 .withColumn('avg_mbr', avg('n_mbr').over(all_bands_window))
 .where(col('n_mbr') > col('avg_mbr'))
 .select('band_name', 'n_mbr')
 .toPandas())

Unnamed: 0,band_name,n_mbr
0,ROP,7
1,AASO,7
2,Oh well,5


## 8.
**List the names of musicians who both conduct and compose and live in Britain.**

In [5]:
# Q8: musicians who compose, have conducted a performance, and live in Britain.

# NOTE(review): "Britain" is encoded here as England or Scotland only; Wales is
# not listed - presumably the place table has no Welsh entries, confirm.
british_places = place.filter(place['place_country'].isin('England', 'Scotland'))

# Each inner join acts as an existence filter; the fan-out it causes is removed
# by the distinct() on the final projection.
composing_musicians = musician.join(
    composer, on=(musician['m_no'] == composer['comp_is']))
british_composers = composing_musicians.join(
    british_places, on=(musician['living_in'] == place['place_no']))
british_composer_conductors = british_composers.join(
    performance, on=(musician['m_no'] == performance['conducted_by']))

(british_composer_conductors
 .select('m_name')
 .distinct()
 .orderBy('m_name')
 .toPandas())

Unnamed: 0,m_name
0,Fred Bloggs
1,Jeff Dawn
2,Phil Hot
3,Rose Spring
4,Tony Smythe


## 9.
**Show the least commonly played instrument and the number of musicians who play it.**

In [6]:
# Q9: the least commonly played instrument and how many musicians play it.
#
# The joins restrict the data to performers who play in a band that has given
# at least one performance. They also fan out: each performer row is repeated
# once per band membership and per performance of that band. Counting rows
# (count('perf_no')) would therefore report performer x performance
# combinations, not musicians, so the distinct musician ids (perf_is) are
# counted instead.
(performer.join(plays_in, on=(performer['perf_no'] == plays_in['player']))
 .join(performance, on=(plays_in['band_id'] == performance['gave']))
 .groupBy('instrument')
 .agg(countDistinct('perf_is').alias('n_musician'))
 # Several instruments can share the minimum; the secondary key makes the row
 # picked by limit(1) deterministic (alphabetically first instrument wins).
 .orderBy('n_musician', 'instrument')
 .limit(1)
 .toPandas())

Unnamed: 0,instrument,n_musician
0,clarinet,1


## 10.
**List the bands that have played music composed by Sue Little; give the title of the composition in each case.**

In [7]:
# Q10: bands that have performed a composition by Sue Little, with its title.

sue_little = musician.filter(musician['m_name'] == 'Sue Little')

# Compositions credited to Sue Little:
# composition -> has_composed -> composer -> musician.
sue_little_compositions = (
    composition
    .join(has_composed, on=(composition['c_no'] == has_composed['cmpn_no']))
    .join(composer, on=(has_composed['cmpr_no'] == composer['comp_no']))
    .join(sue_little, on=(composer['comp_is'] == musician['m_no']))
    .select('c_no', 'c_title')
)

# Attach every performance of those compositions and the band that gave it.
(sue_little_compositions
 .join(performance, on=(col('c_no') == performance['performed']))
 .join(band, on=(performance['gave'] == band['band_no']))
 .select('band_name', 'c_title')
 .orderBy('band_name')
 .toPandas())

Unnamed: 0,band_name,c_title
0,BBSO,Slow Song
1,BBSO,Slow Symphony Blowing
2,Somebody Loves this,Slow Symphony Blowing
3,Swinging strings,Slow Song
4,The left Overs,Slow Song


In [8]:
# Stop the SparkSession (`sc`) built in the setup cell now that all queries have run.
sc.stop()