In [115]:
import findspark
findspark.init()

In [116]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import RegexTokenizer, NGram
from pyspark.sql.functions import split, explode

In [117]:
spark = SparkSession \
    .builder \
    .appName("assignment1 pyspark") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [118]:
df=spark.read.text('kjvdat.txt')

In [119]:
df.show(20)

+--------------------+
|               value|
+--------------------+
|Gen|1|1| In the b...|
|Gen|1|2| And the ...|
|Gen|1|3| And God ...|
|Gen|1|4| And God ...|
|Gen|1|5| And God ...|
|Gen|1|6| And God ...|
|Gen|1|7| And God ...|
|Gen|1|8| And God ...|
|Gen|1|9| And God ...|
|Gen|1|10| And God...|
|Gen|1|11| And God...|
|Gen|1|12| And the...|
|Gen|1|13| And the...|
|Gen|1|14| And God...|
|Gen|1|15| And let...|
|Gen|1|16| And God...|
|Gen|1|17| And God...|
|Gen|1|18| And to ...|
|Gen|1|19| And the...|
|Gen|1|20| And God...|
+--------------------+
only showing top 20 rows



In [120]:
df1=df.withColumn('sentence',split(df['value'],"\|")[3])

In [121]:
df1.show(10)

+--------------------+--------------------+
|               value|            sentence|
+--------------------+--------------------+
|Gen|1|1| In the b...| In the beginning...|
|Gen|1|2| And the ...| And the earth wa...|
|Gen|1|3| And God ...| And God said, Le...|
|Gen|1|4| And God ...| And God saw the ...|
|Gen|1|5| And God ...| And God called t...|
|Gen|1|6| And God ...| And God said, Le...|
|Gen|1|7| And God ...| And God made the...|
|Gen|1|8| And God ...| And God called t...|
|Gen|1|9| And God ...| And God said, Le...|
|Gen|1|10| And God...| And God called t...|
+--------------------+--------------------+
only showing top 10 rows



In [122]:
tokenizer = RegexTokenizer(inputCol="sentence", outputCol="words",pattern='\\W')
tokenized = tokenizer.transform(df1)

In [123]:
tokenized.show(10)

+--------------------+--------------------+--------------------+
|               value|            sentence|               words|
+--------------------+--------------------+--------------------+
|Gen|1|1| In the b...| In the beginning...|[in, the, beginni...|
|Gen|1|2| And the ...| And the earth wa...|[and, the, earth,...|
|Gen|1|3| And God ...| And God said, Le...|[and, god, said, ...|
|Gen|1|4| And God ...| And God saw the ...|[and, god, saw, t...|
|Gen|1|5| And God ...| And God called t...|[and, god, called...|
|Gen|1|6| And God ...| And God said, Le...|[and, god, said, ...|
|Gen|1|7| And God ...| And God made the...|[and, god, made, ...|
|Gen|1|8| And God ...| And God called t...|[and, god, called...|
|Gen|1|9| And God ...| And God said, Le...|[and, god, said, ...|
|Gen|1|10| And God...| And God called t...|[and, god, called...|
+--------------------+--------------------+--------------------+
only showing top 10 rows



In [124]:
words_list=tokenized.select(explode(tokenized['words']))

In [125]:
words_list.show(10)

+---------+
|      col|
+---------+
|       in|
|      the|
|beginning|
|      god|
|  created|
|      the|
|   heaven|
|      and|
|      the|
|    earth|
+---------+
only showing top 10 rows



In [126]:
wf_pairs=words_list.groupby('col').count()

In [127]:
wf_pairs.show(10)

+---------+-----+
|      col|count|
+---------+-----+
|   waters|  287|
|      art|  494|
|    those|  465|
| ashkenaz|    1|
|    serug|    5|
|    still|  101|
|     some|  232|
|     hazo|    1|
|  pitcher|   12|
|destitute|    8|
+---------+-----+
only showing top 10 rows



In [128]:
wf_pairs.sort("col").show(10)

+---------+-----+
|      col|count|
+---------+-----+
|        a| 8177|
|    aaron|  350|
|aaronites|    2|
|  abaddon|    1|
|  abagtha|    1|
|    abana|    1|
|   abarim|    4|
|    abase|    4|
|   abased|    4|
|  abasing|    1|
+---------+-----+
only showing top 10 rows



In [129]:
wf_pairs.sort("count", ascending=False).show(10)

+-----+-----+
|  col|count|
+-----+-----+
|  the|63924|
|  and|51696|
|   of|34617|
|   to|13562|
| that|12912|
|   in|12667|
|   he|10420|
|shall| 9838|
| unto| 8997|
|  for| 8971|
+-----+-----+
only showing top 10 rows



In [130]:
ngram = NGram(n=2, inputCol="words", outputCol="ngrams")
ngramDataFrame = ngram.transform(tokenized)

In [131]:
ngramDataFrame.show(10)

+--------------------+--------------------+--------------------+--------------------+
|               value|            sentence|               words|              ngrams|
+--------------------+--------------------+--------------------+--------------------+
|Gen|1|1| In the b...| In the beginning...|[in, the, beginni...|[in the, the begi...|
|Gen|1|2| And the ...| And the earth wa...|[and, the, earth,...|[and the, the ear...|
|Gen|1|3| And God ...| And God said, Le...|[and, god, said, ...|[and god, god sai...|
|Gen|1|4| And God ...| And God saw the ...|[and, god, saw, t...|[and god, god saw...|
|Gen|1|5| And God ...| And God called t...|[and, god, called...|[and god, god cal...|
|Gen|1|6| And God ...| And God said, Le...|[and, god, said, ...|[and god, god sai...|
|Gen|1|7| And God ...| And God made the...|[and, god, made, ...|[and god, god mad...|
|Gen|1|8| And God ...| And God called t...|[and, god, called...|[and god, god cal...|
|Gen|1|9| And God ...| And God said, Le...|[and, god, 

In [132]:
ngram_list=ngramDataFrame.select(explode(ngramDataFrame['ngrams']))

In [133]:
ngram_list.show(10)

+-------------+
|          col|
+-------------+
|       in the|
|the beginning|
|beginning god|
|  god created|
|  created the|
|   the heaven|
|   heaven and|
|      and the|
|    the earth|
|      and the|
+-------------+
only showing top 10 rows



In [134]:
ngram_list_with_first_word=ngram_list.withColumn('first_word',split(ngram_list['col']," ")[0])

In [135]:
ngram_list_with_first_word.show(10)

+-------------+----------+
|          col|first_word|
+-------------+----------+
|       in the|        in|
|the beginning|       the|
|beginning god| beginning|
|  god created|       god|
|  created the|   created|
|   the heaven|       the|
|   heaven and|    heaven|
|      and the|       and|
|    the earth|       the|
|      and the|       and|
+-------------+----------+
only showing top 10 rows



In [136]:
ngram_list_count=ngram_list_with_first_word.groupby('first_word','col').count().sort('first_word')
ngram_list_count=ngram_list_count.withColumnRenamed("col", "words_pair")

In [137]:
ngram_list_count.show(10)

+----------+----------+-----+
|first_word|words_pair|count|
+----------+----------+-----+
|         a|   a tenth|    6|
|         a|a cheerful|    2|
|         a|  a prayer|    2|
|         a|    a gift|   14|
|         a|   a tower|    7|
|         a| a special|    1|
|         a|   a young|   46|
|         a|a minstrel|    1|
|         a|    a wild|   10|
|         a|   a white|    7|
+----------+----------+-----+
only showing top 10 rows



In [138]:
first_word_count=ngram_list_with_first_word.groupby('first_word').count().sort('first_word')
first_word_count=first_word_count.withColumnRenamed("count", "first_word_count")

In [139]:
first_word_count.show(10)

+----------+----------------+
|first_word|first_word_count|
+----------+----------------+
|         a|            8177|
|     aaron|             338|
| aaronites|               2|
|   abaddon|               1|
|   abagtha|               1|
|     abana|               1|
|    abarim|               4|
|     abase|               3|
|    abased|               4|
|   abasing|               1|
+----------+----------------+
only showing top 10 rows



In [140]:
final_df=first_word_count.join(ngram_list_count,'first_word','inner')

In [141]:
final_df.show(10)

+----------+----------------+----------+-----+
|first_word|first_word_count|words_pair|count|
+----------+----------------+----------+-----+
|         a|            8177|    a wild|   10|
|         a|            8177|   a white|    7|
|         a|            8177| a special|    1|
|         a|            8177|a minstrel|    1|
|         a|            8177|  a mother|    3|
|         a|            8177|   a tenth|    6|
|         a|            8177|a cheerful|    2|
|         a|            8177|  a prayer|    2|
|         a|            8177|   a young|   46|
|         a|            8177|   a tower|    7|
+----------+----------------+----------+-----+
only showing top 10 rows



In [142]:
final_df.withColumn('prob',final_df['count']/final_df['first_word_count']).sort('words_pair').show(20)

+----------+----------------+-------------+-----+--------------------+
|first_word|first_word_count|   words_pair|count|                prob|
+----------+----------------+-------------+-----+--------------------+
|         a|            8177|    a babbler|    1|1.222942399412987...|
|         a|            8177|       a babe|    1|1.222942399412987...|
|         a|            8177| a backbiting|    1|1.222942399412987...|
|         a|            8177|a backsliding|    1|1.222942399412987...|
|         a|            8177|        a bad|    2|2.445884798825975...|
|         a|            8177|        a bag|    3|3.668827198238963E-4|
|         a|            8177|    a balance|    1|1.222942399412987...|
|         a|            8177|       a ball|    1|1.222942399412987...|
|         a|            8177|       a band|    8|9.783539195303901E-4|
|         a|            8177|       a bank|    3|3.668827198238963E-4|
|         a|            8177|     a banner|    2|2.445884798825975...|
|     

In [143]:
spark.stop()