# Partitioning by the ID column!

In [1]:
import pyspark 
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType
    
dataset = spark.read.csv("C:/Users/nikos/Desktop/partitiondata.csv", header=False).toDF("Id", "Text", "Latitude", "Longitude")

data = dataset.repartition(10,'Id')

print("Number of partitions: {}".format(data.rdd.getNumPartitions()))

example_partitions = data.rdd.glom().collect()
for i, l in enumerate(example_partitions): 
    print ("partition #{} length: {}".format(i, len(l)))


Number of partitions: 10
partition #0 length: 2184
partition #1 length: 0
partition #2 length: 1
partition #3 length: 0
partition #4 length: 0
partition #5 length: 4654
partition #6 length: 0
partition #7 length: 0
partition #8 length: 3161
partition #9 length: 0


# Partitioning by a column with random numbers!


In [2]:
from pyspark.sql.functions import udf, array
import numpy as np
from pyspark.sql.functions import concat, lit

randint = np.random.randint(0, 10, dataset.count())
from pyspark.ml.linalg import Vectors

import pandas as pd
from pyspark.sql.functions import monotonically_increasing_id
data = dataset.withColumn("IncreasingId", monotonically_increasing_id())

pdf = pd.DataFrame(randint, columns=['Rand'])
pdf.insert(0, 'IncreasingId', range(0, 0 + len(pdf)))

randomdata = spark.createDataFrame(pdf)


sqlContext.registerDataFrameAsTable(randomdata, 'randomdata')
sqlContext.registerDataFrameAsTable(data, 'data')

df2 = sqlContext.sql("SELECT data.Id, data.Text, data.Latitude, data.Longitude, randomdata.Rand FROM data INNER JOIN randomdata ON data.IncreasingId=randomdata.IncreasingId")


random_partition = df2.repartition(10,'Rand')

print("Number of partitions: {}".format(random_partition.rdd.getNumPartitions()))
#print("Partitions structure: {}".format(random_partition.rdd.glom().collect()))

partitions = random_partition.rdd.glom().collect()
for i, l in enumerate(partitions): 
    print ("partition #{} length: {}".format(i, len(l)))

#write data by partition in disk    
#random_partition.write.partitionBy('Rand').format("csv").save('C:/Users/nikos/Desktop/partitions')

Number of partitions: 10
partition #0 length: 1030
partition #1 length: 1030
partition #2 length: 0
partition #3 length: 0
partition #4 length: 0
partition #5 length: 1973
partition #6 length: 993
partition #7 length: 996
partition #8 length: 2039
partition #9 length: 1939


# Custom partitioning each number to the corresponding partition!

In [9]:
from pyspark.sql.functions import col, struct, spark_partition_id

mapping = {k: i for i, k in enumerate(
    df2.select("Rand").distinct().rdd.flatMap(lambda x: x).collect()
)}

result = (df2
    .select("Rand", struct([c for c in df2.columns]))
    .rdd.partitionBy(len(mapping), lambda k: mapping[k])
    .values()
    .toDF(df2.schema))

#result.withColumn("actual_partition_id", spark_partition_id()).show(10000)

print("Number of partitions: {}".format(result.rdd.getNumPartitions()))
#print("Partitions structure: {}".format(random_partition.rdd.glom().collect()))

partitions = result.rdd.glom().collect()
for i, l in enumerate(partitions): 
    print ("partition #{} length: {}".format(i, len(l)))


Number of partitions: 10
partition #0 length: 1012
partition #1 length: 958
partition #2 length: 955
partition #3 length: 1007
partition #4 length: 995
partition #5 length: 1008
partition #6 length: 1019
partition #7 length: 980
partition #8 length: 1051
partition #9 length: 1015


# Custom partitioning each number to the corresponding partition with duplicates!

In [16]:
from scipy.interpolate import griddata
from numpy.random import uniform
import math
import matplotlib.pyplot as plt
from pyspark.mllib.linalg import DenseVector
from pyspark.ml.feature import StringIndexer,VectorAssembler
from pyspark.sql.functions import udf
from pyspark.ml.linalg import VectorUDT
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.ml.linalg import SparseVector, DenseVector
from pyspark.sql.types import ArrayType
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType

randint = np.random.randint(0, 10, dataset.count())
random = np.random.randint(0, 10, dataset.count())
merged = np.column_stack((randint,random))

pandasRandom = pd.DataFrame(randint, columns=['Rand'])

pandasRandom.insert(0, 'IncreasingId', range(0, 0 + len(pandasRandom)))

df = spark.createDataFrame(pandasRandom)

rdd = spark.sparkContext.parallelize(merged).map(lambda x: (x[0].tolist(), DenseVector(x[0:]))).toDF(["user_id", "features"])

firstElement=udf(lambda v:float(v[0]),FloatType())

res = sqlContext.createDataFrame(sc.emptyRDD(), rdd.schema)
res1 = rdd.select(firstElement("features"))

secondElement=udf(lambda v:float(v[1]),FloatType())

res2 = rdd.select(secondElement("features"))
rdd2 = res1.union(res2)

mapping = {k: i for i, k in enumerate(
    rdd2.select("<lambda>(features)").distinct().rdd.flatMap(lambda x: x).collect())}

result = (rdd2
    .select("<lambda>(features)", struct([c for c in rdd2.columns]))
    .rdd.partitionBy(len(mapping), lambda k: mapping[k])
    .values()
    .toDF(rdd2.schema))    

print("Number of partitions: {}".format(result.rdd.getNumPartitions()))
#print("Partitions structure: {}".format(random_partition.rdd.glom().collect()))

partitions = result.rdd.glom().collect()
for i, l in enumerate(partitions): 
    print ("partition #{} length: {}".format(i, len(l)))
    

Number of partitions: 10
partition #0 length: 2052
partition #1 length: 1952
partition #2 length: 1979
partition #3 length: 1928
partition #4 length: 2006
partition #5 length: 2036
partition #6 length: 2002
partition #7 length: 1997
partition #8 length: 2024
partition #9 length: 2024


In [7]:
import numpy as np
from scipy.interpolate import griddata

x, y = np.mgrid[-180:180:100j, -90:90:100j]
lat = np.arange(-140, 30, 0.5)
lon = np.arange(-150, 20, 0.5)
position = np.array((lat,lon)).T

ids = np.zeros(len(position))

for posdx, pos in enumerate(position): 
    
    for idx, i in enumerate(x[:,0]):
        if pos[0]<i:
            
            for jdx, j in enumerate(y[0]):
                #print(j)
                if pos[1]<j:
                    ids[posdx] = idx*len(x) + jdx
                    break
            
            break
print(ids)
print(len(x))
#print(position)
            
        
    
    
    



1200.0
100


# Partitioning by grid

In [6]:
import pyspark 
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType
import numpy as np
from scipy.interpolate import griddata
from pyspark.sql.functions import col, struct, spark_partition_id
import math
import itertools
import pandas as pd
from pyspark.sql.functions import monotonically_increasing_id
from itertools import chain

spark = SparkSession.builder.getOrCreate()

#read the csv file and fix the headers
dataset = spark.read.csv("C:/Users/nikos/Desktop/partitiondata.csv", header=False).toDF("Id", "Text", "Latitude", "Longitude")

#retain the latitude, longitude, replace the characters and convert to float type 
latitude = dataset.rdd.map(lambda x: float(x[2].replace('"(', ''))).collect()
longitude = dataset.rdd.map(lambda x: float(x[3].replace(')"', ''))).collect()

#make the coordinates of x and y axis, from the minimum to the maximum value, and split each axis to 11 values
#thus we have a 10x10 grid
x, y = np.mgrid[min(longitude):max(longitude):11j, min(latitude):max(latitude):11j]

#the id of each cell of the grid
ids = np.zeros(len(longitude))

#chech each point of the dataset
for k in range(0,len(longitude)): 
    
    #check where longitude is less or equal to x, for each x of the grid 
    for idx, i in enumerate(x[1:,0]):
        if longitude[k] <= i:
            
            #check where latitude is less or equal to y, for each y of the grid 
            for jdx, j in enumerate(y[0,1:]):
                if latitude[k] <= j:
                    
                    #compute the id of the independent cell
                    ids[k] = idx*(len(x)-1) + jdx
                   
                    break
            break

#create a pandas dataframe with the ids of the cell and an increasing number to each row, as column
gridPandas = pd.DataFrame(ids, columns=['gridID'])
gridPandas.insert(0, 'IncreasingId', range(0, 0 + len(gridPandas)))

#create a spark dataframe from the pandas dataframe
gridPartition = spark.createDataFrame(gridPandas)

#create a column to the dataframe "dataset" with an increasing number in each row
data = dataset.withColumn("IncreasingId", monotonically_increasing_id())

#name the two dataframes
sqlContext.registerDataFrameAsTable(gridPartition, 'gridPartition')
sqlContext.registerDataFrameAsTable(data, 'data')

#join two dataframes based on the increasing number that has created
df2 = sqlContext.sql("SELECT data.Id, data.Text, data.Latitude, data.Longitude, gridPartition.gridID FROM data INNER JOIN gridPartition ON data.IncreasingId=gridPartition.IncreasingId")

#retain the distinct values of the cells which correspond the points
#from the 10x10 grid, only 17 cells have points
mapping = {k: i for i, k in enumerate(
    df2.select("gridID").distinct().rdd.flatMap(lambda x: x).collect()
)}

#partition by the distinct cell id of the grid
result = (df2
    .select("gridID", struct([c for c in df2.columns]))
    .rdd.partitionBy(len(mapping), lambda k: mapping[k])
    .values()
    .toDF(df2.schema))

print("Number of partitions: {}".format(result.rdd.getNumPartitions()))

partitions = result.rdd.glom().collect()
for i, l in enumerate(partitions): 
    print ("partition #{} length: {}".format(i, len(l)))

#result.write.partitionBy('gridID').format("csv").save('C:/Users/nikos/Desktop/gridPartitions')

#function which computes the jaccard similarity
def jaccard_similarity(str1, str2): 
        a = set(str1.split()) 
        b = set(str2.split())
        c = a.intersection(b)
        return float(len(c)) / (len(a) + len(b) - len(c))

#function which computes the euclidean distance
def euclidean_distance(object1, object2):
    x1 = float(object1[2].replace('"(', ''))
    x2 = float(object2[2].replace('"(', ''))
    y1 = float(object1[3].replace(')"', ''))
    y2 = float(object2[3].replace(')"', ''))
    dist = math.sqrt((x2 - x1)**2 + (y2 - y1)**2)
    return dist
    
#first function which computes for each point against all, the distance and the textual similarity
#retain these that have distance less or equal to theta, and textual similarity greater than e
def func(iterator):
    
    y = []
    
    for idx, obj in enumerate(iterator):
        
        x = []
        
        for objec in iterator:
            theta = 0.01
            e = 0.7
            
            if (objec!=obj) and (euclidean_distance(obj, objec) < theta) and (jaccard_similarity(obj[1], objec[1]) > e):
                
                x.append(objec[0])
                                 
        y.append(x)
    
    return y

#apply the first function to each partition
nearest_points = result.rdd \
        .mapPartitions(func) \
        .collect()
    
print("Nearest points for each partition: {}".format(nearest_points))

Number of partitions: 17
partition #0 length: 1659
partition #1 length: 1
partition #2 length: 72
partition #3 length: 52
partition #4 length: 69
partition #5 length: 2176
partition #6 length: 130
partition #7 length: 656
partition #8 length: 1622
partition #9 length: 1245
partition #10 length: 411
partition #11 length: 19
partition #12 length: 1157
partition #13 length: 195
partition #14 length: 462
partition #15 length: 73
partition #16 length: 1
Nearest points for each partition: [[' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005'], [], [' 1005', ' 1005', ' 1005', ' 1005', 

In [3]:
#second function that sorts the points based on the x axis, and compute the distance for each point from its next points
#until the distance is greater than theta, which stops the computation for this point against the others
def func(iterator):
    
    #retain the values of each partition to a pandas dataframe and sort them based on the longitude
    d = pd.DataFrame([p[0], p[1], float(p[2].replace('"(', '')), float(p[3].replace(')"', ''))] for p in iterator)
    d.columns = ['id', 'text', 'latitude', 'longitude']
    d.sort_values(by=['longitude'])
    
    points = [[] for x in range(len(d.index))]
    #check each point from its next points, until the distance is greater than theta
    for i, row1 in enumerate(d.iterrows()):
        for j, row2 in enumerate(d[i+1:].iterrows()):
            theta = 0.001
            e = 0.7
            if math.sqrt((row1[1][2] - row2[1][2])**2 + (row1[1][3] - row2[1][3])**2) < theta:
                    if jaccard_similarity(row1[1][1], row2[1][1]) > e:
                        points[i].append(row2[1][0])
                        points[j].append(row1[1][0])
            else:
                break
                                    
        
    return points

#apply the second function to each partition
nearest_points = result.rdd \
        .mapPartitions(func) \
        .collect()
    
print("Nearest points for each partition: {}".format(nearest_points))

Nearest points for each partition: [[' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', ' 1005', 

In [5]:
def func(iterator):

    def jaccard_similarity(str1, str2): 
        a = set(str1.split()) 
        b = set(str2.split())
        c = a.intersection(b)
        return float(len(c)) / (len(a) + len(b) - len(c))
    
    y = []
    
    for idx, obj in enumerate(iterator):
        
        x = []
        similarity = []
        for objec in iterator:
            theta = 0.01
          
            if (objec!=obj) and (math.sqrt((float(obj[2].replace('"(', '')) - float(objec[2].replace('"(', '')))**2 + (float(obj[3].replace(')"', '')) - float(objec[3].replace(')"', '')))**2) < theta):
                text_sim = jaccard_similarity(obj[1], objec[1])
                
                x.append(objec[0])
                if text_sim >0.1:
                    similarity.append(text_sim)
                    
        y.append(len(x))
    
    return [y], similarity

nearest_points = result.rdd \
        .mapPartitions(func) \
        .collect()
    
print("Nearest points for each partition: {}".format(nearest_points))

Nearest points for each partition: [[[52]], [0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7142857142857

In [9]:
d = [[] for x in range(10)]

d[0].append(2)
d[0].append(22)
print(d)

[[2, 22], [], [], [], [], [], [], [], [], []]


In [41]:
from operator import itemgetter

def func(iterator):
    
    df1 = []
    w=[]
    q=[]
    for o in iterator:
        df1.append(o)
    
    df = sorted(df1, key=itemgetter(0))
    for idx, obj in enumerate(df):
        theta = 0.01
        
        for k in df[idx+1:]:
            
            if (k!=obj) and (math.sqrt((float(obj[2].replace('"(', '')) - float(k[2].replace('"(', '')))**2 + (float(obj[3].replace(')"', '')) - float(k[3].replace(')"', '')))**2) < theta):

                w.append(float(k[0].replace('"(', '')))
        if idx==0:
            break
    
    #q.append(len(w))
    return w

nearest_points = result.rdd \
        .mapPartitions(func) \
        .collect()
    
print("Nearest points for each partition: {}".format(nearest_points))

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 155.0 failed 1 times, most recent failure: Lost task 1.0 in stage 155.0 (TID 2804, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "C:\Users\nikos\Documents\spark-2.3.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 230, in main
  File "C:\Users\nikos\Documents\spark-2.3.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 225, in process
  File "C:\Users\nikos\Documents\spark-2.3.1-bin-hadoop2.7\python\pyspark\rdd.py", line 370, in func
    return f(iterator)
  File "<ipython-input-41-b65e66eea77e>", line 11, in func
TypeError: 'key' is an invalid keyword argument for this function

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:438)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:421)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:939)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:939)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1602)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1590)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1589)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1589)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1823)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1772)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1761)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:939)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:938)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:162)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.GeneratedMethodAccessor110.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "C:\Users\nikos\Documents\spark-2.3.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 230, in main
  File "C:\Users\nikos\Documents\spark-2.3.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 225, in process
  File "C:\Users\nikos\Documents\spark-2.3.1-bin-hadoop2.7\python\pyspark\rdd.py", line 370, in func
    return f(iterator)
  File "<ipython-input-41-b65e66eea77e>", line 11, in func
TypeError: 'key' is an invalid keyword argument for this function

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:438)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:421)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:939)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:939)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [45]:
x = np.array([[1,2],[3,4],[2,0]])
print(x[0][0])
#df = sorted(x, key=itemgetter(1))

#print(df)

1


In [23]:
#rddPair = df2.rdd.map(lambda x: (x[4], list(x[0:4]))).toDF()

#from itertools import islice

#result.rdd.mapPartitions(lambda it: islice(it, 0, 5)).count()


In [15]:
def sum_sales(iterator):
    yield sum(transaction[1]['amount'] for transaction in iterator)
    
sum_amounts = rdd1 \
        .mapPartitions(sum_sales) \
        .collect()
    
print("Total sales for each partition: {}".format(sum_amounts))

Total sales for each partition: [115, 326, 0]
