In [68]:
# Build a single-column ('word') DataFrame of sample words; each row is a 1-tuple.
wordsDF = sqlContext.createDataFrame(
    [('cat',), ('elephant',), ('rat',), ('rat',), ('cat',)],
    schema=['word'])
wordsDF.show()          # print the rows as a text table
print(type(wordsDF))    # print the Python class of the object
wordsDF.printSchema()   # print column names, types and nullability

+--------+
|    word|
+--------+
|     cat|
|elephant|
|     rat|
|     rat|
|     cat|
+--------+

<class 'pyspark.sql.dataframe.DataFrame'>
root
 |-- word: string (nullable = true)



In [69]:
# Print the rows of wordsDF as a text table.
wordsDF.show()

+--------+
|    word|
+--------+
|     cat|
|elephant|
|     rat|
|     rat|
|     cat|
+--------+



In [70]:
from pyspark.sql.functions import lit, concat
# Append the literal 's' to every word; the alias keeps the column name 'word'.
pluralDF = wordsDF.select(
    concat(wordsDF['word'], lit('s')).alias('word'))
pluralDF.show()

+---------+
|     word|
+---------+
|     cats|
|elephants|
|     rats|
|     rats|
|     cats|
+---------+



In [71]:
from pyspark.sql.functions import length
# Character length of each pluralised word, one output row per input row.
pluralLengthsDF = pluralDF.select(length(pluralDF['word']))
pluralLengthsDF.show()

+------------+
|length(word)|
+------------+
|           4|
|           9|
|           4|
|           4|
|           4|
+------------+



In [72]:
# Group identical words together and count the rows in each group.
wordCountsDF = wordsDF.groupBy('word').count()
wordCountsDF.show()

+--------+-----+
|    word|count|
+--------+-----+
|     rat|    2|
|     cat|    2|
|elephant|    1|
+--------+-----+



In [73]:
# wordCountsDF holds one row per distinct word, so its row count is the
# number of unique words.
uniqueWordsCount = wordCountsDF.count()
print(uniqueWordsCount)

3


In [83]:
# Average number of occurrences per distinct word.
# The 'count' column is named explicitly: a bare mean() averages every numeric
# column, which would make the positional [0] lookup depend on column order.
averageCount = wordCountsDF.groupBy().mean('count').head()[0]
print(averageCount)

1.66666666667


In [75]:
def wordCount(wordListDF):
    """Count how many times each distinct word occurs.

    Args:
        wordListDF: DataFrame that has a 'word' column.

    Returns:
        The result of grouping ``wordListDF`` by 'word' and counting the rows
        in each group.
    """
    return wordListDF.groupBy('word').count()
# Try the helper on the small sample DataFrame and print the per-word counts.
wordCount(wordsDF).show()

+--------+-----+
|    word|count|
+--------+-----+
|     rat|    2|
|     cat|    2|
|elephant|    1|
+--------+-----+



In [76]:
from pyspark.sql.functions import regexp_replace, trim, col, lower
def removePunctuation(column):
    """Return a cleaned version of a text column, aliased as 'sentence'.

    The text is lower-cased, every character that is not a digit, a
    lower-case ASCII letter or whitespace is deleted, and the result is
    passed through ``trim``.

    Args:
        column: Column expression holding the text to clean.

    Returns:
        The cleaned column expression, named 'sentence'.
    """
    lowered = lower(column)
    # Raw string so the regex engine receives backslash + 's' (whitespace class);
    # in a plain literal that is an invalid escape sequence which newer Python
    # versions warn about.
    withoutPunctuation = regexp_replace(lowered, r'[^0-9a-z\s]', '')
    return trim(withoutPunctuation).alias('sentence')

# Sample sentences containing punctuation, an underscore and stray whitespace.
sentenceDF = sqlContext.createDataFrame(
    [('Hi, you!',),
     (' No under_score',),
     (' *      Remove punctuation and spaces   *   ',)],
    ['sentence'])
sentenceDF.show(truncate=False)
# The same rows after running them through removePunctuation.
sentenceDF.select(removePunctuation(col('sentence'))).show(truncate=False)

+--------------------------------------------+
|sentence                                    |
+--------------------------------------------+
|Hi, you!                                    |
| No under_score                             |
| *      Remove punctuation and spaces   *   |
+--------------------------------------------+

+-----------------------------+
|sentence                     |
+-----------------------------+
|hi you                       |
|no underscore                |
|remove punctuation and spaces|
+-----------------------------+



In [77]:
# Read the text file line by line (column 'value') and clean every line.
fileName = '100.txt.utf-8'
shakespeareDF = (sqlContext.read
                 .text(fileName)
                 .select(removePunctuation(col('value'))))
shakespeareDF.show(n=15, truncate=False)

+-------------------------------------------------+
|sentence                                         |
+-------------------------------------------------+
|1609                                             |
|                                                 |
|the sonnets                                      |
|                                                 |
|by william shakespeare                           |
|                                                 |
|                                                 |
|                                                 |
|1                                                |
|from fairest creatures we desire increase        |
|that thereby beautys rose might never die        |
|but as the riper should by time decease          |
|his tender heir might bear his memory            |
|but thou contracted to thine own bright eyes     |
|feedst thy lights flame with selfsubstantial fuel|
+-------------------------------------------------+
only showing top 15 rows

In [79]:
from pyspark.sql.functions import split, explode
# One row per word: split each cleaned line on single spaces, explode the
# resulting array into rows, and drop the empty strings.
shakeWordsDF = (
    shakespeareDF
    .select(explode(split(col('sentence'), ' ')).alias('word'))
    .filter(col('word') != '')
)
shakeWordsDF.show()
# Total number of (non-empty) words.
shakeWordsDFCount = shakeWordsDF.count()
print(shakeWordsDFCount)

+-----------+
|       word|
+-----------+
|       1609|
|        the|
|    sonnets|
|         by|
|    william|
|shakespeare|
|          1|
|       from|
|    fairest|
|  creatures|
|         we|
|     desire|
|   increase|
|       that|
|    thereby|
|    beautys|
|       rose|
|      might|
|      never|
|        die|
+-----------+
only showing top 20 rows

902553


In [80]:
from pyspark.sql.functions import desc
# Per-word frequencies for the whole text, displayed most frequent first.
# Only the displayed view is sorted; topWordsAndCountsDF itself is not.
topWordsAndCountsDF = wordCount(shakeWordsDF)
topWordsAndCountsDF.orderBy(desc('count')).show(truncate=False)

+----+-----+
|word|count|
+----+-----+
|the |27776|
|and |26755|
|i   |20681|
|to  |19235|
|of  |18246|
|a   |14653|
|you |13685|
|my  |12481|
|that|11123|
|in  |11018|
|is  |9601 |
|not |8736 |
|for |8239 |
|with|8037 |
|me  |7769 |
|it  |7690 |
|be  |7096 |
|your|6882 |
|this|6868 |
|his |6857 |
+----+-----+
only showing top 20 rows



In [81]:
# Print the first rows of topWordsAndCountsDF with no ordering applied here.
topWordsAndCountsDF.show()

+----------+-----+
|      word|count|
+----------+-----+
|       art|  915|
|      some| 1338|
|     those|  546|
|     still|  552|
|  painters|    1|
|      hope|  355|
|    travel|   33|
|     cures|    8|
|    ransom|   53|
|     spoil|   25|
|   tresses|    3|
|       few|   66|
| forgetful|    5|
|    harder|   11|
|  tripping|    6|
| soundness|    1|
|    waters|   27|
|occidental|    1|
|    marrow|    4|
|    distil|    4|
+----------+-----+
only showing top 20 rows

