In [2]:
from pyspark.sql import(
    functions,
    Row,
    SparkSession
)

In [3]:
spark = SparkSession.builder.appName('df_wordcount').getOrCreate()

In [4]:
df = spark.createDataFrame([
        Row(a=1,
            intlist=[1,2,3],
            mapfield={"a":"b"}
        )])

In [5]:
df.show()

+---+---------+--------+
|  a|  intlist|mapfield|
+---+---------+--------+
|  1|[1, 2, 3]|{a -> b}|
+---+---------+--------+



In [12]:
df.select(functions.explode(df.intlist).alias("anInt")).show()

+-----+
|anInt|
+-----+
|    1|
|    2|
|    3|
+-----+



In [6]:
df.select(functions.explode(df.intlist).alias("anInt")).collect()

[Row(anInt=1), Row(anInt=2), Row(anInt=3)]

In [13]:
df = spark.createDataFrame([
    Row(word = 'hello world and pyspark')])

In [16]:
df.select(functions.split(df.word,' ').alias("word")).show()

+--------------------+
|                word|
+--------------------+
|[hello, world, an...|
+--------------------+



In [17]:
df.select(functions.split(df.word,' ').alias("word")).collect()

[Row(word=['hello', 'world', 'and', 'pyspark'])]

In [18]:
csv_file_path = "file:///home/jovyan/work/sample/lorem_ipsum.txt"

In [20]:
df = spark.read.text(csv_file_path)

In [22]:
df.show()

+--------------------+
|               value|
+--------------------+
|Lorem ipsum dolor...|
|                    |
|Orci eu lobortis ...|
|                    |
|Vulputate enim nu...|
|                    |
|Sit amet nulla fa...|
|                    |
|Nibh cras pulvina...|
|                    |
|Arcu felis bibend...|
|                    |
|Vestibulum sed ar...|
|                    |
|Sit amet tellus c...|
|                    |
|Augue mauris augu...|
|                    |
|Pellentesque mass...|
|                    |
+--------------------+
only showing top 20 rows



In [24]:
df.select(functions.split(df.value, ' ').alias("word")).show()

+--------------------+
|                word|
+--------------------+
|[Lorem, ipsum, do...|
|                  []|
|[Orci, eu, lobort...|
|                  []|
|[Vulputate, enim,...|
|                  []|
|[Sit, amet, nulla...|
|                  []|
|[Nibh, cras, pulv...|
|                  []|
|[Arcu, felis, bib...|
|                  []|
|[Vestibulum, sed,...|
|                  []|
|[Sit, amet, tellu...|
|                  []|
|[Augue, mauris, a...|
|                  []|
|[Pellentesque, ma...|
|                  []|
+--------------------+
only showing top 20 rows



In [28]:
word = df.select(
    functions.explode(
    functions.split(df.value, ' ')) \
    .alias("word"))

In [29]:
word.show()

+-----------+
|       word|
+-----------+
|      Lorem|
|      ipsum|
|      dolor|
|        sit|
|      amet,|
|consectetur|
| adipiscing|
|      elit,|
|        sed|
|         do|
|    eiusmod|
|     tempor|
| incididunt|
|         ut|
|     labore|
|         et|
|     dolore|
|      magna|
|    aliqua.|
|         Et|
+-----------+
only showing top 20 rows



In [44]:
word_counts = word.groupby("word").count().orderBy(functions.col("count").desc())

In [45]:
word_counts.show()

+------------+-----+
|        word|count|
+------------+-----+
|         sed|  194|
|          in|  164|
|        amet|  149|
|         sit|  147|
|          ut|  140|
|        eget|  131|
|          id|  127|
|          at|  120|
|       vitae|  118|
|          et|  117|
|        nunc|  113|
|          eu|  108|
|         non|  102|
|            |   99|
|          ac|   97|
|      tellus|   97|
|        diam|   95|
|     viverra|   95|
|        enim|   93|
|pellentesque|   93|
+------------+-----+
only showing top 20 rows

