# Spark MLlib

In [36]:
from __future__ import print_function

import pandas
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import ChiSquareTest
from pyspark.ml.stat import Correlation
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.mllib.stat import Statistics
from pyspark.sql import SQLContext

In [3]:
# Reuse the active SparkContext if one exists so this cell can be re-run
# safely; constructing SparkContext(...) a second time on a live kernel raises
# "Cannot run multiple SparkContexts at once". The app name only takes effect
# when a new context is actually created.
sc = SparkContext.getOrCreate(SparkConf().setAppName("Pspark mllib Example"))
# SQLContext is the legacy DataFrame entry point; it is kept here because the
# cells below call sqc.createDataFrame(...).
sqc = SQLContext(sc)

#### Load data

In [None]:
# Create an RDD of row vectors and wrap it in a distributed RowMatrix.

In [23]:
# Distribute four rows of three values each; every inner list is one row.
vecs_rdd = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])
# Build the matrix from the RDD defined above (no name `rows` exists in this
# notebook, so passing it would raise NameError).
mat = RowMatrix(vecs_rdd)
# Last expression of the cell: displays (number of rows, number of columns).
mat.numRows(), mat.numCols()

(4, 3)

#### Summary Statistics

In [24]:
# Column-wise summary statistics over the rows of vecs_rdd.
summary = Statistics.colStats(vecs_rdd)
# Report the per-column mean, variance and non-zero count, one per line.
print(summary.mean(), summary.variance(), summary.numNonzeros(), sep="\n")

[5.5 6.5 7.5]
[15. 15. 15.]
[4. 4. 4.]


#### Correlations

In [25]:
# Pairwise Pearson correlation between the columns of vecs_rdd. Each column is
# another column shifted by a constant, so every coefficient comes out as 1.
pearson_matrix = Statistics.corr(vecs_rdd, method="pearson")
print(pearson_matrix)

[[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]]


In [33]:
# Four observations of a 4-dimensional feature vector, mixing sparse and dense
# encodings. Index 2 is zero in every observation.
feature_vectors = [
    Vectors.sparse(4, [0, 3], [1.0, -2.0]),
    Vectors.dense([4.0, 5.0, 0.0, 3.0]),
    Vectors.dense([6.0, 7.0, 0.0, 8.0]),
    Vectors.sparse(4, [0, 3], [9.0, 1.0]),
]
# Each DataFrame row is a 1-tuple holding the single "features" column.
data = [(vector,) for vector in feature_vectors]
df = sqc.createDataFrame(data, ["features"])
df.toPandas()

Unnamed: 0,features
0,"(1.0, 0.0, 0.0, -2.0)"
1,"[4.0, 5.0, 0.0, 3.0]"
2,"[6.0, 7.0, 0.0, 8.0]"
3,"(9.0, 0.0, 0.0, 1.0)"


In [35]:
# Correlation.corr returns a one-row DataFrame whose only cell is the
# correlation matrix, so take the first row and index its first field.
r1 = Correlation.corr(df, 'features', method='pearson').first()
r2 = Correlation.corr(df, 'features', method='spearman').first()

# Feature index 2 is zero in every row of df, which is why its off-diagonal
# correlations print as nan.
print("Pearson:\n" + str(r1[0]))
print("Spearman:\n" + str(r2[0]))

Pearson:
DenseMatrix([[1.        , 0.05564149,        nan, 0.40047142],
             [0.05564149, 1.        ,        nan, 0.91359586],
             [       nan,        nan, 1.        ,        nan],
             [0.40047142, 0.91359586,        nan, 1.        ]])
Spearman:
DenseMatrix([[1.        , 0.10540926,        nan, 0.4       ],
             [0.10540926, 1.        ,        nan, 0.9486833 ],
             [       nan,        nan, 1.        ,        nan],
             [0.4       , 0.9486833 ,        nan, 1.        ]])


#### Hypothesis testing

In [None]:
# Labelled observations for the chi-square test: (label, 2-feature vector).
data = [
    (0.0, Vectors.dense(0.5, 10.0)),
    (0.0, Vectors.dense(1.5, 20.0)),
    (1.0, Vectors.dense(1.5, 30.0)),
    (0.0, Vectors.dense(3.5, 30.0)),
    (0.0, Vectors.dense(3.5, 40.0)),
    (1.0, Vectors.dense(3.5, 40.0)),
]
column_names = ["label", "features"]
# Note: this rebinds `data` and `df`, replacing the correlation DataFrame
# built earlier in the notebook.
df = sqc.createDataFrame(data, column_names)
df.toPandas()

In [None]:
# Chi-square test of each feature against the label. The result is a one-row
# DataFrame, so take its first row and read the fields by name.
r = ChiSquareTest.test(df, featuresCol="features", labelCol="label").head()
print(
    "pValues: " + str(r.pValues),
    "degreesOfFreedom: " + str(r.degreesOfFreedom),
    "statistics: " + str(r.statistics),
    sep="\n"
)

In [None]:
# Shut down the SparkContext created at the top of the notebook. Cells that
# use `sc` or `sqc` need the setup cell re-run after this.
sc.stop()

## Credits & Links

https://medium.com/@jaafarbenabderrazak.info/spark-for-machine-learning-using-python-and-mllib-435efdc3f708