# SUM and COUNT

In [1]:
import findspark
import pandas as pd
findspark.init()

from pyspark.sql import SparkSession

# Host used for both the Spark master URL and the HDFS warehouse URL below.
SVR = '192.168.31.31'

# Configuration entries applied to the session builder, in this order.
spark_options = {
    'spark.sql.warehouse.dir': f'hdfs://{SVR}:9000/user/hive/warehouse',
    'spark.cores.max': '4',
    'spark.executor.instances': '1',
    'spark.executor.cores': '2',
    'spark.executor.memory': '10g',
}

session_builder = SparkSession.builder.appName('app05').master(f'spark://{SVR}:7077')
for option, setting in spark_options.items():
    session_builder = session_builder.config(option, setting)

# NOTE: despite its name, `sc` is a Hive-enabled SparkSession (not a SparkContext);
# the remaining cells of the notebook refer to it by this name.
sc = session_builder.enableHiveSupport().getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


## 1. Total world population

Show the total population of the world.

```
world(name, continent, area, population, gdp)
```

In [2]:
# Load the Hive table `sqlzoo.world` as a DataFrame; every exercise below queries it.
world = sc.table('sqlzoo.world')

In [3]:
from pyspark.sql import functions as F

# Aggregate over the whole table; the resulting column is named `sum(population)`.
world.agg(F.sum('population')).toPandas()

                                                                                

Unnamed: 0,sum(population)
0,7615649000.0


## 2. List of continents

List all the continents - just once each.

In [4]:
# Project the continent column and keep each value once.
(world
 .select('continent')
 .distinct()
 .toPandas())

Unnamed: 0,continent
0,Europe
1,Eurasia
2,Africa
3,North America
4,Caribbean
5,South America
6,Oceania
7,Asia
8,"Federated States of,Oceania"


## 3. GDP of Africa

Give the total GDP of Africa.

In [5]:
# Restrict to African countries, then total their GDP.
(world
 .where(world.continent == 'Africa')
 .agg({'gdp': 'sum'})
 .toPandas())

Unnamed: 0,sum(gdp)
0,1964824000000.0


## 4. Count the big countries

How many countries have an **area** of at least 1000000?

In [6]:
# "At least 1000000" is an inclusive bound, so the comparison must be >= (not >);
# a strict comparison would drop any country whose area is exactly 1,000,000.
world.filter(world['area'] >= 1e6).count()

29

## 5. Baltic states population

What is the total **population** of the three Baltic states ('Estonia', 'Latvia', 'Lithuania')?

In [7]:
# Keep only the three Baltic states, then total their population.
(world
 .where(world['name'].isin('Estonia', 'Latvia', 'Lithuania'))
 .agg({'population': 'sum'})
 .toPandas())

Unnamed: 0,sum(population)
0,6028631.0


## 6. Counting the countries of each continent

For each **continent** show the **continent** and number of countries.

> _Using GROUP BY and HAVING_   
> You may want to look at these examples: [Using GROUP BY and HAVING](https://sqlzoo.net/wiki/Using_GROUP_BY_and_HAVING.).

In [8]:
# One row per continent with the number of countries it contains.
(world
 .groupby('continent')
 .count()
 .toPandas())

Unnamed: 0,continent,count
0,Europe,44
1,Eurasia,2
2,Africa,53
3,North America,11
4,Caribbean,11
5,South America,13
6,"Federated States of,Oceania",1
7,Oceania,13
8,Asia,47


## 7. Counting big countries in each continent

For each **continent** show the **continent** and number of countries with populations of at least 10 million.

In [9]:
# Keep countries with a population of at least 10 million (inclusive),
# then count how many remain in each continent.
(world
 .where(world.population >= 1e7)
 .select('continent', 'name')
 .groupby('continent')
 .count()
 .toPandas())

Unnamed: 0,continent,count
0,Europe,15
1,Eurasia,1
2,Africa,31
3,North America,4
4,South America,8
5,Asia,28
6,Caribbean,3
7,Oceania,1


## 8. Counting big continents

List the continents that **have** a total population of at least 100 million.

In [10]:
# Total population per continent; the aggregate column is named `sum(population)`.
continent_population = world.groupBy('continent').agg({'population': 'sum'})

# Keep the continents whose total reaches 100 million (inclusive) and show only their names.
(continent_population
 .where(continent_population['sum(population)'] >= 1e8)
 .select('continent')
 .toPandas())

Unnamed: 0,continent
0,Europe
1,Eurasia
2,Africa
3,North America
4,South America
5,Asia


In [11]:
# Stop the Spark session created in the first cell now that all queries have run.
sc.stop()