In [1]:
from pyspark.sql import (Row, SparkSession)
from pyspark.sql.functions import col, asc, desc

In [28]:
def parse_line(line: str):
    """Convert one pipe-delimited record into a ``Row``.

    The expected layout is ``name|country|email|compensation``. Whitespace
    around each field is stripped; any fields after the fourth are ignored.

    Args:
        line: A single line of the income text file.

    Returns:
        A ``Row`` with string fields ``name``, ``country`` and ``email`` and
        an integer ``compensation``.

    Raises:
        ValueError: If the line has fewer than four fields, or if the
            compensation field cannot be converted to ``int``.
    """
    # Deliberately no print() calls: this function is invoked once per record
    # from inside an RDD map, so debug output would be emitted for every line.
    fields = [field.strip() for field in line.split('|')]
    if len(fields) < 4:
        # Surface the offending record instead of a bare IndexError.
        raise ValueError(
            f"expected 4 '|'-separated fields, got {len(fields)}: {line!r}")
    name, country, email, compensation = fields[:4]
    return Row(
        name=name,
        country=country,
        email=email,
        compensation=int(compensation))

In [29]:
# Configure the application name, then reuse an existing session or create one.
session_builder = SparkSession.builder.appName("SparkSQL")
spark = session_builder.getOrCreate()

# Read the raw income file through the session's SparkContext.
lines = spark.sparkContext.textFile("file:///home/jovyan/work/sample/income.txt")

In [30]:
# Run parse_line over every element of `lines`, producing one Row per record.
income_data = lines.map(parse_line)

In [33]:
# Sanity check: fetch a single parsed record and print it.
first_rows = income_data.take(1)
print(first_rows)

[Row(name='Kam Long', country='Dominica', email='VinThomas@example.taobao', compensation=137611)]


In [34]:
# Build a DataFrame from the parsed Row objects; no explicit schema is passed.
schema_income = spark.createDataFrame(income_data)

In [35]:
# Register the DataFrame under the name "income" so the spark.sql queries
# below can refer to it as a table.
schema_income.createOrReplaceTempView("income")

In [39]:
# Select the records whose compensation lies in the inclusive range
# 70000..100000, querying the "income" temp view.
medium_income_query = """select * from income where compensation >=70000
and compensation <= 100000"""
medium_income_df = spark.sql(medium_income_query)

In [42]:
# Print a text-table preview of the filtered rows (long values are truncated
# with "..." in the rendered output).
medium_income_df.show()

+------------------+--------------------+--------------------+------------+
|              name|             country|               email|compensation|
+------------------+--------------------+--------------------+------------+
|  Willian Cummings|             Senegal|    areus@test.canon|       77369|
|      Clarita Gill|             Ecuador| tomaslau@test.games|       86986|
| Walter Washington|          Kazakhstan|mbilderbach@examp...|       91072|
|       Lexie Banks|                Mali|unterdreht@test.date|       97933|
|        Luise Hunt|               Kenya|adellecharles@tes...|       96175|
|     Sebrina Walsh|         Puerto Rico|andrewcohen@examp...|       99276|
|      Josiah Lyons|              Malawi|nandini_m@test.ry...|       91768|
|      Temeka Grant|              Israel|terryxlife@test.g...|       71642|
|  Narcisa Saunders|Palestinian Terri...|raquelwilson@exam...|       77287|
|      Lisbeth Lane|          Azerbaijan|coreyweb@test.coffee|       82473|
|       Evan

In [43]:
# Number of records per country, largest groups first (DataFrame API).
(
    schema_income
    .groupBy("country")
    .count()
    .orderBy(desc("count"))
    .show()
)

+--------------------+-----+
|             country|count|
+--------------------+-----+
|           Australia|   10|
|           Singapore|    9|
|             Ecuador|    9|
|            Dominica|    9|
|          Madagascar|    9|
|           Nicaragua|    9|
|              Kuwait|    9|
|               Congo|    9|
|            Thailand|    9|
|             Senegal|    8|
|Sao Tome and Prin...|    8|
|Virgin Islands, B...|    8|
|              Zambia|    8|
|  Dominican Republic|    8|
|                Mali|    8|
|             Belgium|    7|
|Palestinian Terri...|    7|
|             Lesotho|    7|
|         Isle of Man|    7|
|             Bolivia|    7|
+--------------------+-----+
only showing top 20 rows



In [48]:
# Number of records per country, largest groups first, expressed as SQL
# against the "income" temp view.
country_counts_query = """select country, count(*) as count from income
group by country order by count(*) desc"""
country_counts_df = spark.sql(country_counts_query)
country_counts_df.show()

+--------------------+-----+
|             country|count|
+--------------------+-----+
|           Australia|   10|
|           Singapore|    9|
|             Ecuador|    9|
|            Dominica|    9|
|          Madagascar|    9|
|           Nicaragua|    9|
|              Kuwait|    9|
|               Congo|    9|
|            Thailand|    9|
|             Senegal|    8|
|Sao Tome and Prin...|    8|
|Virgin Islands, B...|    8|
|              Zambia|    8|
|  Dominican Republic|    8|
|                Mali|    8|
|             Belgium|    7|
|Palestinian Terri...|    7|
|             Lesotho|    7|
|         Isle of Man|    7|
|             Bolivia|    7|
+--------------------+-----+
only showing top 20 rows

