In [1]:
# Bootstrap Spark for this kernel. Keep this cell ahead of the pyspark imports
# in the next cell: findspark.init() is what makes `pyspark` importable here.
import findspark
findspark.init()

In [2]:
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import Row, functions, types

In [3]:
# Reuse the already-running SparkContext when this cell is executed again:
# calling the SparkContext constructor a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once".
# Note: the app name only takes effect when a new context is actually created.
sc = SparkContext.getOrCreate(SparkConf().setAppName("data-frame"))
sql_context = SQLContext(sc)

In [4]:
# Read the source text as an RDD of strings, one element per line of the file.
rdd_file = sc.textFile(name='../data/98-0.txt')

In [5]:
# Single-column schema: every row holds one line of text in a string column
# named 'line'.
schema = types.StructType(
    fields=[types.StructField(name='line', dataType=types.StringType())]
)

# Wrap each raw string in a Row so it matches the schema above.
rdd_rows = rdd_file.map(lambda raw_line: Row(line=raw_line))

df_lines = sql_context.createDataFrame(rdd_rows, schema=schema)
df_lines.show(n=5)

+--------------------+
|                line|
+--------------------+
|The Project Guten...|
|                    |
|This eBook is for...|
|almost no restric...|
|re-use it under t...|
+--------------------+
only showing top 5 rows



In [6]:
def split_line(line):
    """Split a line of text into its whitespace-separated words.

    Args:
        line: The text to split, or None (a null value in the 'line' column).

    Returns:
        A list of the words in `line`, in order. Empty, blank and None inputs
        all yield an empty list.
    """
    # This function is wrapped as a Spark UDF and the 'line' column is nullable;
    # a null arrives here as None, and None.split() would raise AttributeError
    # and fail the whole job.
    if line is None:
        return []
    # str.split() with no separator splits on runs of any whitespace and never
    # produces empty strings.
    return line.split()


# Expose split_line to Spark SQL as a UDF whose result type is declared as an
# array of strings.
udf_split_line = functions.udf(
    f=split_line,
    returnType=types.ArrayType(elementType=types.StringType()),
)

In [7]:
# Run the split UDF over every line: one row in, one array of words out,
# stored in a column named 'words'.
df_split = df_lines.select(
    udf_split_line(df_lines['line']).alias('words')
)
df_split.show(n=5)

+--------------------+
|               words|
+--------------------+
|[The, Project, Gu...|
|                  []|
|[This, eBook, is,...|
|[almost, no, rest...|
|[re-use, it, unde...|
+--------------------+
only showing top 5 rows



In [8]:
# Flatten the word arrays: explode() emits one output row per array element,
# so rows with an empty array contribute nothing.
df_explode = df_split.select(
    functions.explode(df_split['words']).alias('word')
)
df_explode.show(n=5)

+---------+
|     word|
+---------+
|      The|
|  Project|
|Gutenberg|
|    EBook|
|       of|
+---------+
only showing top 5 rows



In [9]:
# Count the occurrences of each distinct word and list the most frequent first.
df_grouped = (
    df_explode
    .groupBy('word')
    .count()  # adds a 'count' column holding the number of rows per word
    .sort('count', ascending=False)
)
df_grouped.show(n=5)

+----+-----+
|word|count|
+----+-----+
| the| 7514|
| and| 4745|
|  of| 4065|
|  to| 3458|
|   a| 2825|
+----+-----+
only showing top 5 rows

