In [1]:
# Run before the first `pyspark` import: findspark locates the Spark
# installation and adds its Python libraries to sys.path so that `pyspark`
# can be imported by the cells below.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Settings applied to the session: driver/executor memory and cores, plus the
# number of partitions used when shuffling data for joins and aggregations.
spark_settings = [
    ("spark.driver.memory", "512m"),
    ("spark.driver.cores", "1"),
    ("spark.executor.memory", "512m"),
    ("spark.executor.cores", "1"),
    ("spark.sql.shuffle.partitions", "2"),
]

session_builder = SparkSession.builder.appName("pyspark-intro")
for setting_key, setting_value in spark_settings:
    # Builder.config returns the builder itself, so this is equivalent to chaining.
    session_builder = session_builder.config(setting_key, setting_value)

# Reuse an already-running session if there is one, otherwise start a new one.
spark = session_builder.getOrCreate()

In [3]:
# Display the version of Spark this session is running on.
spark.version

'2.4.5'

### Create Spark DataFrame

In [4]:
# Load the symbols CSV, taking column names from the file's first line.
# No schema is supplied, so every column comes back as a string
# (see the printSchema output further down).
df = spark.read.option("header", True).csv("/dataset/yahoo-symbols-201709.csv")

In [5]:
# Count the rows in the DataFrame (the header line is not a data row).
df.count()

106328

In [6]:
# Print each column's name, type and nullability as a tree.
df.printSchema()

root
 |-- Ticker: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Exchange: string (nullable = true)
 |-- Category Name: string (nullable = true)
 |-- Country: string (nullable = true)



In [7]:
# Preview the first 20 rows; long cell values are cut off with "...".
# These are the defaults of show(), spelled out here for the reader.
df.show(n=20, truncate=True)

+------+--------------------+--------+--------------------+-------+
|Ticker|                Name|Exchange|       Category Name|Country|
+------+--------------------+--------+--------------------+-------+
|  OEDV|Osage Exploration...|     PNK|                null|    USA|
|  AAPL|          Apple Inc.|     NMS|Electronic Equipment|    USA|
|   BAC|Bank of America C...|     NYQ|  Money Center Banks|    USA|
|  AMZN|    Amazon.com, Inc.|     NMS|Catalog & Mail Or...|    USA|
|     T|           AT&T Inc.|     NYQ|Telecom Services ...|    USA|
|  GOOG|       Alphabet Inc.|     NMS|Internet Informat...|    USA|
|    MO|  Altria Group, Inc.|     NYQ|          Cigarettes|    USA|
|   DAL|Delta Air Lines, ...|     NYQ|      Major Airlines|    USA|
|    AA|   Alcoa Corporation|     NYQ|            Aluminum|    USA|
|   AXP|American Express ...|     NYQ|     Credit Services|    USA|
|    DD|E. I. du Pont de ...|     NYQ|Agricultural Chem...|    USA|
|  BABA|Alibaba Group Hol...|     NYQ|Specialty 

### DataFrame operations
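Show the top 20 categories by number of stocks. Symbols that have no category are grouped under `null`, which is why `null` appears as the largest group.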

In [8]:
# Number of rows per "Category Name"; rows without a category end up
# together in a single null group.
cats = df.groupBy("Category Name").count()

# Largest groups first, limited to 20 rows, with full (untruncated) category names.
cats.orderBy("count", ascending=False).show(n=20, truncate=False)

+-------------------------------+-----+
|Category Name                  |count|
+-------------------------------+-----+
|null                           |85600|
|Industrial Metals & Minerals   |1292 |
|Biotechnology                  |667  |
|Diversified Machinery          |608  |
|Asset Management               |602  |
|Money Center Banks             |536  |
|Independent Oil & Gas          |515  |
|Gold                           |451  |
|Business Services              |422  |
|Technical & System Software    |374  |
|Wireless Communications        |374  |
|General Contractors            |337  |
|Information Technology Services|319  |
|Diversified Utilities          |312  |
|Property Management            |289  |
|Communication Equipment        |276  |
|Real Estate Development        |269  |
|Auto Parts                     |260  |
|Diversified Electronics        |251  |
|Medical Appliances & Equipment |250  |
+-------------------------------+-----+
only showing top 20 rows



### Stop Driver Program
Stop the Spark session to release the resources it holds on the Spark cluster.

In [9]:
# Shut down the session's underlying SparkContext; `spark` and `df` can no
# longer be used after this cell runs.
spark.stop()