In [1]:
import findspark
findspark.init()
import pyspark
# Start from a default SparkConf and build (or reuse) the notebook's session.
myConf = pyspark.SparkConf()
spark = (
    pyspark.sql.SparkSession.builder
    .master("local")        # run Spark in local mode
    .appName("myApp")
    .config(conf=myConf)
    .getOrCreate()
)

In [2]:
# Customer names used to synthesise the order log.
names = [
    "kim",
    "lee",
    "park",
    "lim",
]
# Coffee menu items used to synthesise the order log.
items = [
    "espresso",
    "latte",
    "americano",
    "affocato",
    "long black",
    "macciato",
]

In [3]:
# Build a 500-row synthetic order log with columns (name, coffee) by cycling
# through `names` and `items`.
# The moduli are taken from the list lengths instead of the literals 4 and 6,
# so editing either list cannot raise IndexError or silently skip new entries.
coffeeDf = spark.createDataFrame(
    [(names[i % len(names)], items[i % len(items)]) for i in range(500)],
    ["name", "coffee"],
)

In [4]:
# Spark-side summary statistics for every column. Both columns are strings,
# so mean/stddev come back null and min/max are lexicographic.
coffeeDf.describe().show()

+-------+----+--------+
|summary|name|  coffee|
+-------+----+--------+
|  count| 500|     500|
|   mean|null|    null|
| stddev|null|    null|
|    min| kim|affocato|
|    max|park|macciato|
+-------+----+--------+



In [5]:
# Same summary via pandas: collects the frame to the driver, then reports
# count / unique / top / freq for the two string columns.
coffeeDf.toPandas().describe()

Unnamed: 0,name,coffee
count,500,500
unique,4,6
top,park,espresso
freq,125,84


In [6]:
# Contingency table of name x coffee built with groupBy/pivot.
# Combinations that never occur are shown as null.
(
    coffeeDf
    .groupBy("name")
    .pivot("coffee")
    .count()
    .show()
)

+----+--------+---------+--------+-----+----------+--------+
|name|affocato|americano|espresso|latte|long black|macciato|
+----+--------+---------+--------+-----+----------+--------+
|park|    null|       42|      42| null|        41|    null|
| lim|      42|     null|    null|   42|      null|      41|
| kim|    null|       41|      42| null|        42|    null|
| lee|      41|     null|    null|   42|      null|      42|
+----+--------+---------+--------+-----+----------+--------+



In [7]:
# Same contingency table via the stat helper; here absent combinations are
# reported as 0 rather than null.
coffeeDf.stat.crosstab(col1="name", col2="coffee").show()

+-----------+--------+---------+--------+-----+----------+--------+
|name_coffee|affocato|americano|espresso|latte|long black|macciato|
+-----------+--------+---------+--------+-----+----------+--------+
|        lim|      42|        0|       0|   42|         0|      41|
|        lee|      41|        0|       0|   42|         0|      42|
|       park|       0|       42|      42|    0|        41|       0|
|        kim|       0|       41|      42|    0|        42|       0|
+-----------+--------+---------+--------+-----+----------+--------+



In [8]:
# Frequent-item search on both columns with a minimum support of 0.5.
# NOTE(review): each name covers only 125 of 500 rows, yet two names are
# returned -- freqItems is approximate and can report false positives, so
# treat the result as exploratory only.
freq = coffeeDf.stat.freqItems(cols=["name", "coffee"], support=0.5)
freq.show()

+--------------+-----------------+
|name_freqItems| coffee_freqItems|
+--------------+-----------------+
|   [lim, park]|[latte, espresso]|
+--------------+-----------------+



In [9]:
from pyspark.ml.feature import StringIndexer

# Encode the customer name as a numeric index stored in a new double column
# `label`. The source frame `coffeeDf` is left untouched.
labelIndexer = StringIndexer(
    inputCol="name",
    outputCol="label",
)
model = labelIndexer.fit(coffeeDf)
_coffeeDf = model.transform(coffeeDf)

In [10]:
# Confirm the indexer appended a non-nullable double column `label`.
_coffeeDf.printSchema()

root
 |-- name: string (nullable = true)
 |-- coffee: string (nullable = true)
 |-- label: double (nullable = false)



In [11]:
# Peek at the first rows to see which index each name received.
_coffeeDf.show(n=4)

+----+---------+-----+
|name|   coffee|label|
+----+---------+-----+
| kim| espresso|  0.0|
| lee|    latte|  1.0|
|park|americano|  3.0|
| lim| affocato|  2.0|
+----+---------+-----+
only showing top 4 rows



In [12]:
# Encode the coffee name as a numeric index in a new double column `_features`.
featureIndexer = StringIndexer(inputCol="coffee", outputCol="_features")
# This cell reads and overwrites `_coffeeDf`, so on a second run the frame
# already carries `_features` and the indexer raises "Output column _features
# already exists". Dropping it first (a no-op when the column is absent)
# makes the cell safe to re-run.
_coffeeDf = _coffeeDf.drop("_features")
model = featureIndexer.fit(_coffeeDf)
_coffeeDf = model.transform(_coffeeDf)

In [13]:

#from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Wrap the numeric `_features` column into the single-element vector column
# `features` that the ML statistics API expects.
va = VectorAssembler(inputCols=["_features"], outputCol="features")
# Drop any `features` column left by a previous run of this cell (a no-op
# when absent); otherwise re-running fails with "Output column features
# already exists".
_coffeeDf = va.transform(_coffeeDf.drop("features"))

In [14]:
# Check the assembled frame: `_features` is the raw double index and
# `features` is the same value wrapped in a vector.
_coffeeDf.printSchema()
_coffeeDf.show(n=5)

root
 |-- name: string (nullable = true)
 |-- coffee: string (nullable = true)
 |-- label: double (nullable = false)
 |-- _features: double (nullable = false)
 |-- features: vector (nullable = true)

+----+----------+-----+---------+--------+
|name|    coffee|label|_features|features|
+----+----------+-----+---------+--------+
| kim|  espresso|  0.0|      0.0|   [0.0]|
| lee|     latte|  1.0|      1.0|   [1.0]|
|park| americano|  3.0|      3.0|   [3.0]|
| lim|  affocato|  2.0|      2.0|   [2.0]|
| kim|long black|  0.0|      4.0|   [4.0]|
+----+----------+-----+---------+--------+
only showing top 5 rows



In [15]:
from pyspark.ml.stat import ChiSquareTest
# Chi-square independence test of the coffee feature against the name label.
# At this point `r` is a DataFrame holding the test result.
r = ChiSquareTest.test(
    dataset=_coffeeDf,
    featuresCol="features",
    labelCol="label",
)

In [16]:
from pyspark.ml.stat import ChiSquareTest
# Fetch the result row of the chi-square test.
# The original `r = r.head()` rebound `r` from a DataFrame to a Row, so running
# the cell a second time raised (a Row has no `head`). Computing the row from
# `_coffeeDf` here makes the cell idempotent and independent of what `r`
# currently holds; the test on this small frame is cheap to repeat.
r = ChiSquareTest.test(_coffeeDf, "features", "label").head()

In [17]:
# Report the three fields of the chi-square result row.
# `!s` forces str() conversion, matching the original string concatenation.
print("pValues: {!s}".format(r.pValues))
print("degreesOfFreedom: {!s}".format(r.degreesOfFreedom))
print("statistics: {!s}".format(r.statistics))

pValues: [0.0]
degreesOfFreedom: [15]
statistics: [500.0963855421687]
