In [1]:
import pandas as pd
import numpy as np

In [51]:
data = {'year': [2010, 2011, 2012, 2011, None, 2010, 2011, 2012, 2013],
        'team': ['Bears', 'Bears', 'Bears', 'Packers', None, 'Lions', 'Lions', 'Lions', 'Lions'],
        'wins': [11, 8, None, 15, 11, 6, 10, 4, 3],
        'losses': [5, 8, 6, 1, 5, None, 6, 12, 13]}
football = pd.DataFrame(data, columns=['year', 'team', 'wins', 'losses'])
football

Unnamed: 0,year,team,wins,losses
0,2010.0,Bears,11.0,5.0
1,2011.0,Bears,8.0,8.0
2,2012.0,Bears,,6.0
3,2011.0,Packers,15.0,1.0
4,,,11.0,5.0
5,2010.0,Lions,6.0,
6,2011.0,Lions,10.0,6.0
7,2012.0,Lions,4.0,12.0
8,2013.0,Lions,3.0,13.0


## Pyspark

In [52]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("mode").getOrCreate()

In [53]:
ds = spark.createDataFrame(football)
ds.show()

+------+-------+----+------+
|  year|   team|wins|losses|
+------+-------+----+------+
|2010.0|  Bears|11.0|   5.0|
|2011.0|  Bears| 8.0|   8.0|
|2012.0|  Bears| NaN|   6.0|
|2011.0|Packers|15.0|   1.0|
|   NaN|   null|11.0|   5.0|
|2010.0|  Lions| 6.0|   NaN|
|2011.0|  Lions|10.0|   6.0|
|2012.0|  Lions| 4.0|  12.0|
|2013.0|  Lions| 3.0|  13.0|
+------+-------+----+------+



In [54]:
ds.printSchema()

root
 |-- year: double (nullable = true)
 |-- team: string (nullable = true)
 |-- wins: double (nullable = true)
 |-- losses: double (nullable = true)



In [55]:
for name, dtype in ds.dtypes:
    print(name, dtype)

year double
team string
wins double
losses double


In [66]:
for col, dtype in ds.dtypes:
    if dtype == "string":
        max_count = ds.groupby(col).count().select(max("count")).collect()[0][0]
        sum_count = ds.groupby(col).count().select(sum("count")).collect()[0][0]
        print(col, max_count, sum_count)

team 4 9


In [50]:
for col, dtype in ds.dtypes:
    if dtype != "string":
        dn = ds.select(col).na.drop()
        print("size:", dn.count())
        mean_col = dn.select(mean(col)).collect()[0][0]
        median_col = dn.stat.approxQuantile(col, [0.5],0.0)[0]
        std_col = dn.select(stddev(col)).collect()[0][0]
        print(col, mean_col, median_col, std_col)
        

size: 8
year 2011.25 2011.0 1.0350983390135584
size: 8
wins 8.5 8.0 4.035556254807296
size: 8
losses 7.0 6.0 3.9279220242478625


In [35]:
for col, dtype in ds.dtypes:
    if dtype != "string":
        mean_col = ds.select(mean(col)).collect()[0][0]
        median_col = ds.stat.approxQuantile("wins", [0.5],0.0)[0]
        std_col = ds.select(stddev(col)).collect()[0][0]
        print(col, mean_col, median_col, std_col)
        

year 2011.3333333333333 10.0 1.0000000000000056
wins 8.666666666666666 10.0 3.8078865529319543
losses 7.333333333333333 10.0 3.8078865529319543
