In [10]:
// Load the sector listing CSV from HDFS.
//   header      - the first line of the file holds the column names
//   inferSchema - let Spark derive the column types from the data
//   delimiter   - field separator; the key must be spelled "delimiter" (or "sep"),
//                 because the CSV reader silently ignores option keys it does not know
val sectorDf = spark.read
                    .format("csv")
                    .option("header", true)
                    .option("inferSchema", true)
                    .option("delimiter", ",")
                    .load("hdfs://localhost:9000/stocks/sectors")

// Inspect the inferred schema and preview the first two rows
sectorDf.printSchema()
sectorDf.show(2)

root
 |-- Company Name: string (nullable = true)
 |-- Industry: string (nullable = true)
 |-- Symbol: string (nullable = true)
 |-- Series: string (nullable = true)
 |-- ISIN Code: string (nullable = true)

+------------------+------------------+----------+------+------------+
|      Company Name|          Industry|    Symbol|Series|   ISIN Code|
+------------------+------------------+----------+------+------------+
|    Axis Bank Ltd.|FINANCIAL SERVICES|  AXISBANK|    EQ|INE238A01034|
|Bajaj Finance Ltd.|FINANCIAL SERVICES|BAJFINANCE|    EQ|INE296A01024|
+------------------+------------------+----------+------+------------+
only showing top 2 rows



sectorDf: org.apache.spark.sql.DataFrame = [Company Name: string, Industry: string ... 3 more fields]


In [3]:
// create a schema for dataframe using scala
import org.apache.spark.sql.types.{StringType, StructType, DoubleType, 
                                   IntegerType, LongType, StructField }

// Explicit schema for the sector CSV: five string columns, every one nullable.
// The fields are listed in the same order as the columns of the file.
val SectorSchema = StructType(
  List("CompanyName", "Industry", "Symbol", "Series", "ISIN")
    .map(fieldName => StructField(fieldName, StringType, nullable = true))
)

import org.apache.spark.sql.types.{StringType, StructType, DoubleType, IntegerType, LongType, StructField}
SectorSchema: org.apache.spark.sql.types.StructType = StructType(StructField(CompanyName,StringType,true), StructField(Industry,StringType,true), StructField(Symbol,StringType,true), StructField(Series,StringType,true), StructField(ISIN,StringType,true))


In [4]:
// Load the sector CSV again, this time applying the hand-written SectorSchema
// instead of inferring the types.
//   header    - the first line of the file is a header row, not data
//   delimiter - field separator; the key must be spelled "delimiter" (or "sep"),
//               because the CSV reader silently ignores option keys it does not know
val sectorDf = spark.read
                    .format("csv")
                    .option("header", true)
                    .option("delimiter", ",")
                    .schema(SectorSchema)
                    .load("hdfs://localhost:9000/stocks/sectors")

// Confirm the column names now come from SectorSchema and preview two rows
sectorDf.printSchema()
sectorDf.show(2)

root
 |-- CompanyName: string (nullable = true)
 |-- Industry: string (nullable = true)
 |-- Symbol: string (nullable = true)
 |-- Series: string (nullable = true)
 |-- ISIN: string (nullable = true)

+------------------+------------------+----------+------+------------+
|       CompanyName|          Industry|    Symbol|Series|        ISIN|
+------------------+------------------+----------+------+------------+
|    Axis Bank Ltd.|FINANCIAL SERVICES|  AXISBANK|    EQ|INE238A01034|
|Bajaj Finance Ltd.|FINANCIAL SERVICES|BAJFINANCE|    EQ|INE296A01024|
+------------------+------------------+----------+------+------------+
only showing top 2 rows



sectorDf: org.apache.spark.sql.DataFrame = [CompanyName: string, Industry: string ... 3 more fields]


In [5]:
// Column names of sectorDf, returned as an Array[String]
sectorDf.columns

res3: Array[String] = Array(CompanyName, Industry, Symbol, Series, ISIN)


In [6]:
// Number of rows in sectorDf, returned as a Long
sectorDf.count()

res4: Long = 200


In [8]:
// Keep only the Industry and Symbol columns, addressing them as Column objects
val df = sectorDf.select(sectorDf("Industry"), sectorDf("Symbol"))
// Print the projected schema, then the first five rows
df.printSchema()
df.show(numRows = 5)

root
 |-- Industry: string (nullable = true)
 |-- Symbol: string (nullable = true)

+------------------+----------+
|          Industry|    Symbol|
+------------------+----------+
|FINANCIAL SERVICES|  AXISBANK|
|FINANCIAL SERVICES|BAJFINANCE|
|FINANCIAL SERVICES|BAJAJFINSV|
|FINANCIAL SERVICES|  CHOLAFIN|
|FINANCIAL SERVICES|   HDFCAMC|
+------------------+----------+
only showing top 5 rows



df: org.apache.spark.sql.DataFrame = [Industry: string, Symbol: string]


In [15]:
// Unique industries in ascending order.
// A trailing "..." in the printed table means show() has truncated that cell value.
sectorDf
  .select("Industry")
  .distinct()
  .orderBy("Industry")
  .show()

+--------------------+
|            Industry|
+--------------------+
|          AUTOMOBILE|
|        CONSTRUCTION|
|      CONSUMER GOODS|
|  FINANCIAL SERVICES|
| HEALTHCARE SERVICES|
|INDUSTRIAL MANUFA...|
|                  IT|
|MEDIA ENTERTAINME...|
|              METALS|
|           OIL & GAS|
|              PHARMA|
+--------------------+



In [14]:
// Same listing with truncation switched off, so every Industry value is printed in full
sectorDf
  .select("Industry")
  .distinct()
  .orderBy("Industry")
  .show(truncate = false)

+---------------------------------+
|Industry                         |
+---------------------------------+
|AUTOMOBILE                       |
|CONSTRUCTION                     |
|CONSUMER GOODS                   |
|FINANCIAL SERVICES               |
|HEALTHCARE SERVICES              |
|INDUSTRIAL MANUFACTURING         |
|IT                               |
|MEDIA ENTERTAINMENT & PUBLICATION|
|METALS                           |
|OIL & GAS                        |
|PHARMA                           |
+---------------------------------+



In [18]:
import org.apache.spark.sql.functions.{col, desc}
// sectorDf.col("Industry") yields a Column taken from this DataFrame
// (the shorthand sectorDf("Industry") refers to the same column).
// desc(...) requests descending sort order.
sectorDf
  .select(sectorDf.col("Industry"))
  .distinct()
  .orderBy(desc("Industry"))
  .show()

+--------------------+
|            Industry|
+--------------------+
|              PHARMA|
|           OIL & GAS|
|              METALS|
|MEDIA ENTERTAINME...|
|                  IT|
|INDUSTRIAL MANUFA...|
| HEALTHCARE SERVICES|
|  FINANCIAL SERVICES|
|      CONSUMER GOODS|
|        CONSTRUCTION|
|          AUTOMOBILE|
+--------------------+



import org.apache.spark.sql.functions.{col, desc}


In [19]:
import org.apache.spark.sql.functions.{col, desc}
// Descending order again, this time naming the column with the free-standing col() function
sectorDf
  .select(col("Industry"))
  .distinct()
  .orderBy(col("Industry").desc)
  .show()

+--------------------+
|            Industry|
+--------------------+
|              PHARMA|
|           OIL & GAS|
|              METALS|
|MEDIA ENTERTAINME...|
|                  IT|
|INDUSTRIAL MANUFA...|
| HEALTHCARE SERVICES|
|  FINANCIAL SERVICES|
|      CONSUMER GOODS|
|        CONSTRUCTION|
|          AUTOMOBILE|
+--------------------+



import org.apache.spark.sql.functions.{col, desc}


In [20]:
import org.apache.spark.sql.functions.{col, desc}
// Descending order once more, using the $"..." column syntax.
// $ is not built-in Scala syntax: it is a string interpolator supplied by Spark's
// SQL implicits (spark.implicits._, presumably pre-imported by this notebook kernel)
// that turns the quoted name into a Column.
sectorDf
  .select($"Industry")
  .distinct()
  .orderBy($"Industry".desc)
  .show()

+--------------------+
|            Industry|
+--------------------+
|              PHARMA|
|           OIL & GAS|
|              METALS|
|MEDIA ENTERTAINME...|
|                  IT|
|INDUSTRIAL MANUFA...|
| HEALTHCARE SERVICES|
|  FINANCIAL SERVICES|
|      CONSUMER GOODS|
|        CONSTRUCTION|
|          AUTOMOBILE|
+--------------------+



import org.apache.spark.sql.functions.{col, desc}
