In [None]:
# Environment provisioning via shell commands (IPython `!` magics).
# Refresh the apt package index and install a headless Java 8 JDK; stdout is
# discarded (`> /dev/null`) and `-qq` keeps apt quiet.
!apt-get update -qq > /dev/null
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Download the Spark 2.4.8 (pre-built for Hadoop 2.7) archive into the current
# working directory and unpack it there.
!wget -q https://archive.apache.org/dist/spark/spark-2.4.8/spark-2.4.8-bin-hadoop2.7.tgz
!tar xf spark-2.4.8-bin-hadoop2.7.tgz
# Install findspark, which is imported and initialised later in this cell.
!pip install -q findspark

import os

# Tell the tooling where the JDK and the unpacked Spark distribution live.
# These must be set before findspark.init() is called below.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-2.4.8-bin-hadoop2.7",
})

import findspark
findspark.init()

# pyspark is imported only after findspark.init() has run.
from pyspark import SparkConf, SparkContext

# Local-mode configuration using all available cores ("local[*]").
spark_conf = SparkConf().setAppName("YourTest").setMaster("local[*]")

# Reuse an already-running context if there is one, otherwise start a new one.
sc = SparkContext.getOrCreate(spark_conf)

In [None]:
from simple_tokenize import simple_tokenize

# Returns the count of distinct tokens in the `Shakespeare.txt` dataset
def count_distinct_tokens():
  """Count the distinct tokens in `Shakespeare.txt` (Question 2).

  Every line of the file is tokenized with `simple_tokenize`; the tokens of
  all lines are pooled, de-duplicated, and counted.

  Returns:
    int: number of distinct tokens across the whole file.
  """
  corpus = sc.textFile('Shakespeare.txt')
  # One flat RDD of tokens, reduced to its unique members.
  unique_tokens = corpus.flatMap(simple_tokenize).distinct()
  return unique_tokens.count()

count_distinct_tokens()

In [None]:
from simple_tokenize import simple_tokenize
import itertools

# Returns the count of distinct pairs in the `Shakespeare.txt` dataset
def count_distinct_pairs():
  """Count the distinct ordered token pairs in `Shakespeare.txt` (Question 3).

  For each line, the unique tokens on that line are paired up in both orders
  (`itertools.permutations(..., 2)`), so (a, b) and (b, a) are counted as
  separate pairs and a token is never paired with itself. Pairs are then
  de-duplicated across all lines.

  Returns:
    int: number of distinct ordered pairs of tokens that share a line.
  """
  def line_pairs(line):
    # De-duplicate within the line first so repeated words add no extra pairs.
    unique_on_line = list(set(simple_tokenize(line)))
    return itertools.permutations(unique_on_line, 2)

  corpus = sc.textFile('Shakespeare.txt')
  distinct_pairs = corpus.flatMap(line_pairs).distinct()
  return distinct_pairs.count()

In [None]:
from simple_tokenize import simple_tokenize

# Returns a list of the top 50 (probability, count, token) tuples, ordered by descending probability
def top_50_tokens_probabilities():
  """Return the 50 most probable tokens in `Shakespeare.txt` (Question 4).

  A token's count is the number of lines it appears on (a token repeated
  within one line is counted once for that line); its probability is that
  count divided by the total number of lines in the file.

  Returns:
    list: up to 50 `(probability, count, token)` tuples, sorted by
    probability in descending order.
  """
  corpus = sc.textFile('Shakespeare.txt')
  n_lines = corpus.count()

  def unique_tokens(line):
    # Each token contributes at most once per line.
    return list(set(simple_tokenize(line)))

  def to_row(token_count):
    # (token, count) -> (probability, count, token)
    token, n = token_count
    return (n / n_lines, n, token)

  ranked = corpus.flatMap(unique_tokens)\
                 .map(lambda token: (token, 1))\
                 .reduceByKey(lambda a, b: a + b)\
                 .map(to_row)\
                 .sortBy(lambda row: row[0], False)

  return ranked.take(50)

In [None]:
from simple_tokenize import simple_tokenize
from math import log
import itertools

# Returns a list of tuples with the following format:
# ((token1, token2), pmi, co-occurrence_count, token1_count, token2_count)
def PMI(threshold):
  """Compute pointwise mutual information for co-occurring token pairs (Question 5).

  Counts are line-based: a token's count is the number of lines containing it,
  and a pair's co-occurrence count is the number of lines containing both
  tokens. Probabilities are those counts divided by the total number of lines.
  PMI is `log(p(x, y) / (p(x) * p(y)))` using the natural logarithm.
  Pairs are ordered, so both (a, b) and (b, a) are reported.

  Args:
    threshold: minimum co-occurrence count a pair needs in order to be kept.

  Returns:
    list: tuples of the form
    `((token1, token2), pmi, co-occurrence_count, token1_count, token2_count)`.
  """
  corpus = sc.textFile('Shakespeare.txt')
  n_lines = corpus.count()

  def unique_tokens(line):
    # Each token contributes at most once per line.
    return list(set(simple_tokenize(line)))

  def with_probability(token_count):
    # (token, count) -> (token, (count, p(token)))
    token, n = token_count
    return (token, (n, n / n_lines))

  def with_joint_probability(pair_count):
    # ((token1, token2), cooc) -> ((token1, token2), cooc, p(x, y))
    pair, cooc = pair_count
    return (pair, cooc, cooc / n_lines)

  def key_by_first(rec):
    # Re-key on token1 so token1's statistics can be joined in.
    (tok1, tok2), cooc, p_xy = rec
    return (tok1, (tok2, cooc, p_xy))

  def key_by_second(rec):
    # Input:  (token1, ((token2, cooc, p(x, y)), (token1_count, p(x))))
    # Output: (token2, (token1, cooc, p(x, y), token1_count, p(x)))
    tok1, ((tok2, cooc, p_xy), (n1, p_x)) = rec
    return (tok2, (tok1, cooc, p_xy, n1, p_x))

  def to_result(rec):
    # Input: (token2, ((token1, cooc, p(x, y), token1_count, p(x)), (token2_count, p(y))))
    tok2, ((tok1, cooc, p_xy, n1, p_x), (n2, p_y)) = rec
    return ((tok1, tok2), log(p_xy / (p_x * p_y)), cooc, n1, n2)

  # (token, (count, p(token))) for every token in the corpus.
  token_stats = corpus.flatMap(unique_tokens)\
                      .map(lambda token: (token, 1))\
                      .reduceByKey(lambda a, b: a + b)\
                      .map(with_probability)

  # Ordered pairs of distinct tokens sharing a line, counted and thresholded.
  frequent_pairs = corpus.flatMap(lambda line: itertools.permutations(unique_tokens(line), 2))\
                         .map(lambda pair: (pair, 1))\
                         .reduceByKey(lambda a, b: a + b)\
                         .filter(lambda pair_count: pair_count[1] >= threshold)

  # Attach token1's statistics, then token2's, then flatten to the result shape.
  pmi_records = frequent_pairs.map(with_joint_probability)\
                              .map(key_by_first)\
                              .join(token_stats)\
                              .map(key_by_second)\
                              .join(token_stats)\
                              .map(to_result)

  return pmi_records.collect()

In [None]:
from simple_tokenize import simple_tokenize
from math import log

# Returns a list of samp_size tuples with the following format:
# (token, [ list_of_cooccurring_tokens ])
# where list_of_cooccurring_tokens is of the form
# [((token1, token2), pmi, cooc_count, token1_count, token2_count), ...]
def PMI_one_token(threshold, samp_size, seed=None):
  """Sample tokens together with the PMI of every pair they lead (Question 6).

  Counts are line-based: a token's count is the number of lines containing it,
  and a pair's co-occurrence count is the number of lines containing both
  tokens. Probabilities are those counts divided by the total number of lines.
  PMI is `log(p(x, y) / (p(x) * p(y)))` using the natural logarithm.
  Pairs are ordered; the records are grouped by their first token.

  Args:
    threshold: minimum co-occurrence count a pair needs in order to be kept.
    samp_size: number of tokens to sample (without replacement).
    seed: optional seed forwarded to `takeSample` so a sample can be
      reproduced; the default `None` keeps the original unseeded behaviour.

  Returns:
    list: `(token, [ list_of_cooccurring_tokens ])` tuples, where each entry of
    the inner list has the form
    `((token1, token2), pmi, cooc_count, token1_count, token2_count)`.
  """
  # Imported here because this cell does not import itertools itself; without
  # this the function only works if an earlier cell already imported it.
  import itertools

  lines = sc.textFile('Shakespeare.txt')
  lines_count = lines.count()

  def unique_tokens(line):
    # Each token contributes at most once per line.
    return list(set(simple_tokenize(line)))

  def with_probability(token_count):
    # (token, count) -> (token, (count, p(token)))
    token, n = token_count
    return (token, (n, n / lines_count))

  def with_joint_probability(pair_count):
    # ((token1, token2), cooc) -> ((token1, token2), cooc, p(x, y))
    pair, cooc = pair_count
    return (pair, cooc, cooc / lines_count)

  def key_by_first(rec):
    # Re-key on token1 so token1's statistics can be joined in.
    (tok1, tok2), cooc, p_xy = rec
    return (tok1, (tok2, cooc, p_xy))

  def key_by_second(rec):
    # Input:  (token1, ((token2, cooc, p(x, y)), (token1_count, p(x))))
    # Output: (token2, (token1, cooc, p(x, y), token1_count, p(x)))
    tok1, ((tok2, cooc, p_xy), (n1, p_x)) = rec
    return (tok2, (tok1, cooc, p_xy, n1, p_x))

  def to_result(rec):
    # Input: (token2, ((token1, cooc, p(x, y), token1_count, p(x)), (token2_count, p(y))))
    tok2, ((tok1, cooc, p_xy, n1, p_x), (n2, p_y)) = rec
    return ((tok1, tok2), log(p_xy / (p_x * p_y)), cooc, n1, n2)

  # (token, (count, p(token))) for every token in the corpus.
  tokens_count_prob = lines.flatMap(unique_tokens)\
                           .map(lambda word: (word, 1))\
                           .reduceByKey(lambda x, y: x + y)\
                           .map(with_probability)

  # ((token1, token2), pmi, co-occurrence_count, token1_count, token2_count)
  pmi_pair = lines.flatMap(lambda line: itertools.permutations(unique_tokens(line), 2))\
                  .map(lambda pair: (pair, 1))\
                  .reduceByKey(lambda x, y: x + y)\
                  .filter(lambda pair_count: pair_count[1] >= threshold)\
                  .map(with_joint_probability)\
                  .map(key_by_first)\
                  .join(tokens_count_prob)\
                  .map(key_by_second)\
                  .join(tokens_count_prob)\
                  .map(to_result)

  # Group the records under their first token. Every record is a non-empty
  # 5-tuple, so the grouped iterable is simply materialised as a list.
  pmi_by_token = pmi_pair.map(lambda rec: (rec[0][0], rec))\
                         .groupByKey()\
                         .mapValues(list)

  return pmi_by_token.takeSample(False, samp_size, seed)