In [None]:
# Notebook environment setup: install Java 8, unpack a local Spark 2.4.8
# distribution, and start a local SparkContext bound to the global name `sc`.
# The `!` lines are IPython shell escapes, so this cell only runs in a notebook.
!apt-get update -qq > /dev/null
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# NOTE(review): the tarball is extracted but never downloaded in this cell --
# presumably it has been uploaded to the working directory beforehand; confirm.
!tar xf spark-2.4.8-bin-hadoop2.7.tgz
!pip install -q findspark

import os
# Point the process at the JDK and the unpacked Spark directory; these are set
# before findspark.init() is called below.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# NOTE(review): assumes the tarball was extracted under /content (the usual
# Colab working directory) -- confirm if run elsewhere.
os.environ["SPARK_HOME"] = "/content/spark-2.4.8-bin-hadoop2.7"

import findspark
findspark.init()

from pyspark import SparkContext
# local[*] runs Spark in-process. The functions defined in later cells read
# this `sc` as a global.
sc = SparkContext(appName="YourTest", master="local[*]")

In [None]:
from spamminess import spamminess
from math import exp

def sequential_SGD(model, training_dataset='spam.train.group_x.txt', delta = 0.002):
    """Run one sequential pass of logistic-regression SGD over a training file.

    Each line of ``training_dataset`` is split on whitespace. The second token
    is the label (``'spam'`` or ``'ham'``) and every following token is an
    integer feature id; the first token is not used here (presumably a
    document id).

    Args:
        model: dict mapping feature id -> weight. It is updated in place.
        training_dataset: path of the training file on the local filesystem.
        delta: learning rate applied to every weight update.

    Returns:
        The same ``model`` dict that was passed in, after the updates.
    """
    with open(training_dataset) as training_file:
        for line in training_file:
            doc = line.split()
            if len(doc) < 2:
                # Blank or truncated line: there is no label to learn from.
                continue

            label = doc[1]
            features = [int(x) for x in doc[2:]]

            score = spamminess(features, model)
            try:
                prob = 1.0 / (1 + exp(-score))
            except OverflowError:
                # exp() overflows for very negative scores; the sigmoid
                # limit there is 0.
                prob = 0.0

            # The step depends only on the label and the prediction, so it is
            # computed once per document instead of once per feature.
            if label == 'spam':
                step = (1.0 - prob) * delta
            elif label == 'ham':
                step = -prob * delta
            else:
                # Unknown labels contribute no update.
                continue

            # A feature id that occurs several times in the document is
            # updated once per occurrence.
            for feature in features:
                model[feature] = model.get(feature, 0.0) + step

    return model


In [None]:
from spamminess import spamminess
from math import exp
import shutil, os

def spark_SGD(training_dataset='spam.train.group_x.txt', output_model='models/group_x_model', delta = 0.002):
    """Train a spam model with SGD inside Spark and save it as text.

    The training file is read with the global SparkContext ``sc``, each line is
    whitespace-split and its first token dropped, and the data is coalesced to
    one partition so that a single weight dict sees every document in turn.
    The resulting ``(feature_id, weight)`` pairs are written with
    ``saveAsTextFile``.

    Args:
        training_dataset: path of the training file; each line is
            ``<ignored token> <label> <feature id> ...`` with label ``'spam'``
            or ``'ham'``.
        output_model: output directory; an existing local directory of that
            name is deleted first.
        delta: learning rate applied to every weight update.
    """
    if os.path.isdir(output_model):
        shutil.rmtree(output_model) # Remove the previous model to create a new one

    # Drop the leading token and keep [label, feature, feature, ...].
    # coalesce(1) puts all documents in one partition, so train_partition
    # below runs once over the whole dataset.
    train = sc.textFile(training_dataset)\
              .map(lambda line: line.split()[1:])\
              .coalesce(1)

    def train_partition(docs):
        """Run SGD over one partition and yield its weight dict."""
        weights = {}

        for doc in docs:
            if not doc:
                # Blank or truncated line: there is no label to learn from.
                continue

            label = doc[0]
            features = [int(x) for x in doc[1:]]

            score = spamminess(features, weights)
            try:
                prob = 1.0 / (1 + exp(-score))
            except OverflowError:
                # exp() overflows for very negative scores; the sigmoid
                # limit there is 0.
                prob = 0.0

            # Use the caller-supplied learning rate (previously hard-coded).
            if label == 'spam':
                step = (1.0 - prob) * delta
            elif label == 'ham':
                step = -prob * delta
            else:
                # Unknown labels contribute no update.
                continue

            for feature in features:
                weights[feature] = weights.get(feature, 0.0) + step

        yield weights

    # Flatten the per-partition dict into (feature_id, weight) records.
    final = train.mapPartitions(train_partition)\
                 .flatMap(lambda weights: list(weights.items()))

    final.saveAsTextFile(output_model)


In [None]:
from spamminess import spamminess
from math import exp
import shutil, os, random
import random

def spark_shuffled_SGD(training_dataset='spam.train.group_x.txt', output_model='models/group_x_model', delta = 0.002):
    """Train a spam model with SGD in Spark after randomly shuffling the data.

    Works like a single-partition SGD pass, except that the documents are put
    in random order first: each document is tagged with a random key, the RDD
    is sorted by that key, and the keys are dropped. The resulting
    ``(feature_id, weight)`` pairs are written with ``saveAsTextFile``.

    Args:
        training_dataset: path of the training file; each line is
            ``<ignored token> <label> <feature id> ...`` with label ``'spam'``
            or ``'ham'``.
        output_model: output directory; an existing local directory of that
            name is deleted first.
        delta: learning rate applied to every weight update.
    """
    if os.path.isdir(output_model):
        shutil.rmtree(output_model) # Remove the previous model to create a new one

    # Drop the leading token and keep [label, feature, feature, ...].
    train = sc.textFile(training_dataset)\
              .map(lambda line: line.split()[1:])

    # Sorting by an independent random key yields a random permutation without
    # counting the dataset first or creating one partition per document, and
    # it does not depend on how the input happens to be partitioned.
    # coalesce(1) afterwards puts every document in one partition so a single
    # weight dict sees them all.
    shuffled_train = train.map(lambda doc: (random.random(), doc))\
                          .sortByKey()\
                          .values()\
                          .coalesce(1)

    def train_partition(docs):
        """Run SGD over one partition and yield its weight dict."""
        weights = {}

        for doc in docs:
            if not doc:
                # Blank or truncated line: there is no label to learn from.
                continue

            label = doc[0]
            features = [int(x) for x in doc[1:]]

            score = spamminess(features, weights)
            try:
                prob = 1.0 / (1 + exp(-score))
            except OverflowError:
                # exp() overflows for very negative scores; the sigmoid
                # limit there is 0.
                prob = 0.0

            # Use the caller-supplied learning rate (previously hard-coded).
            if label == 'spam':
                step = (1.0 - prob) * delta
            elif label == 'ham':
                step = -prob * delta
            else:
                # Unknown labels contribute no update.
                continue

            for feature in features:
                weights[feature] = weights.get(feature, 0.0) + step

        yield weights

    # Flatten the per-partition dict into (feature_id, weight) records.
    final = shuffled_train.mapPartitions(train_partition)\
                          .flatMap(lambda weights: list(weights.items()))

    final.saveAsTextFile(output_model)

In [None]:
from spamminess import spamminess
import shutil, os

def spark_classify(input_model='models/group_x_model', test_dataset='spam.test.qrels.txt', results_path='results/test_qrels'):
    """Classify every line of a test file with a saved model and save the results.

    The model is read from the ``part-*`` text files in ``input_model``, one
    ``(feature_id, weight)`` tuple literal per line. Each test line is
    whitespace-split; the tokens from the third onward are integer feature ids
    and are scored with ``spamminess``. One tuple
    ``(token0, token1, score, prediction)`` is written per line, where
    ``prediction`` is ``'spam'`` when the score is greater than 0 and ``'ham'``
    otherwise.

    Args:
        input_model: local directory holding the saved model part files.
        test_dataset: path of the test file, read with the global ``sc``.
        results_path: output directory; an existing local directory of that
            name is deleted first.

    Raises:
        FileNotFoundError: if ``input_model`` contains no ``part-*`` file.
    """
    # Local imports keep this cell self-contained.
    import ast
    import glob

    if os.path.isdir(results_path):
        shutil.rmtree(results_path) # Remove the previous results

    # Read every part file, not only part-00000, so a model saved from more
    # than one partition is loaded completely.
    part_files = sorted(glob.glob(os.path.join(input_model, 'part-*')))
    if not part_files:
        raise FileNotFoundError(
            'no model part files found in {}'.format(input_model))

    model = {}
    for part_file in part_files:
        with open(part_file) as model_file:
            for line in model_file:
                line = line.strip()
                if not line:
                    continue
                # literal_eval only accepts Python literals, so a tampered
                # model file cannot execute code the way eval() would.
                feature, weight = ast.literal_eval(line)
                model[feature] = weight

    def classify(line):
        """Score one test line and append the predicted label."""
        tokens = line.split()
        score = spamminess([int(i) for i in tokens[2:]], model)
        prediction = 'spam' if score > 0 else 'ham'
        return tuple(tokens[:2] + [score, prediction])

    result = sc.textFile(test_dataset).map(classify)

    result.saveAsTextFile(results_path)