In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [None]:
# Create (or reuse) the Spark session that every query in this notebook runs on.
spark_session = (
    SparkSession.builder
    .appName('simple_statistics')
    .getOrCreate()
)

## Loading the data

In [None]:
# Read the Play Store CSV, taking the column names from the first line.
# No schema is supplied or inferred here, so Spark loads every column as a string.
df = (
    spark_session.read
    .option('header', True)
    .csv('data/googleplaystore.csv')
)

In [None]:
# List the column names taken from the CSV header.
df.columns

## Visualising the data

First, I take a look at the data to familiarise myself with it. It seems that all the variables are categorical, even though I was expecting some of them to be numerical (like size, installs or rating).

In [None]:
# Show every distinct value of the Category column.
(
    df
    .select('Category')
    .distinct()
    .collect()
)

In [None]:
# The rating is stored as a string, and some of its values are non-numerical.
(
    df
    .select('Rating')
    .distinct()
    .collect()
)

In [None]:
# Peek at the first 10 values of Reviews.
# Original note: this is a foreign key that points at another table.
# NOTE(review): the statistics section of this notebook lists reviews among the
# numerical variables - confirm which reading is right.
# take(10) fetches only 10 rows instead of collecting the whole column to the
# driver and slicing it afterwards.
df.select('Reviews').take(10)

In [None]:
# Size is stored as strings, with a suffix which indicates the unit.
# take(10) fetches only 10 rows instead of collecting the whole column to the
# driver and slicing it afterwards.
df.select('Size').take(10)

In [None]:
# Installs also turns out to be categorical rather than numerical, contrary to
# what I first expected.
(
    df
    .select('Installs')
    .distinct()
    .collect()
)

In [None]:
# Show every distinct value of the Type column.
(
    df
    .select('Type')
    .distinct()
    .collect()
)

In [None]:
# Price is one more variable that I expected to be numerical.
(
    df
    .select('Price')
    .distinct()
    .collect()
)

In [None]:
# Show every distinct value of the content rating column.
(
    df
    .select('Content rating')
    .distinct()
    .collect()
)

In [None]:
# Show every distinct value of the Genres column.
(
    df
    .select('Genres')
    .distinct()
    .collect()
)

In [None]:
# Peek at the first 10 values of the last-updated column.
# take(10) fetches only 10 rows instead of collecting the whole column to the
# driver and slicing it afterwards.
df.select('Last updated').take(10)

In [None]:
# Peek at the first 10 values of the current-version column.
# take(10) fetches only 10 rows instead of collecting the whole column to the
# driver and slicing it afterwards.
df.select('Current ver').take(10)

In [None]:
# Show every distinct value of the Android version column.
(
    df
    .select('Android ver')
    .distinct()
    .collect()
)

## Visualising proportion of missing values per column

The only columns with missing values are `Content Rating`, `Current Ver` and `Android Ver`. However, the proportion of missing values is very low.

In [None]:
# Print, for every column, the percentage of rows whose value is null.
# The null counts of all columns are computed in one aggregation, so Spark scans
# the data once instead of running a separate filter/count job per column.
# Only true nulls are counted; textual placeholders (e.g. a literal 'NaN'
# string) would not be - TODO confirm whether the data contains any.
total = df.count()
null_counts = df.select([
    F.count(F.when(F.col(c).isNull(), 1)).alias(c) for c in df.columns
]).first()

# The aggregated row holds one count per column, in the order of df.columns.
for c, missing in zip(df.columns, null_counts):
    print(c)
    # Guard against an empty DataFrame, where the ratio would divide by zero.
    percentage = missing / total * 100 if total else 0.0
    print(str(percentage) + '%')

    print(' ')

## Frequency tables for the categorical variables

Category, rating, installs, type, price, content rating, genres and android ver

## Statistics for the numerical variables

This step requires that the values are parsed and transformed into numerical values. I will do this step for rating, reviews and size. I am also doing this per category.

In [None]:
# Mean and standard deviation of the app size.
# `Size` is loaded as a string with a unit suffix, so it has to be converted to
# a number before aggregating; averaging the raw strings does not yield a
# meaningful size.
# NOTE(review): assumes values look like '19M' or '201k' (decimal kilo/mega) -
# confirm against the distinct values of Size. Any other value (and nulls)
# becomes null and is ignored by the aggregates. Results are in megabytes.
size_pattern = r'^\s*([0-9]+(?:\.[0-9]+)?)\s*([kM])\s*$'
size_number = F.regexp_extract(F.col('Size'), size_pattern, 1).cast('double')
size_suffix = F.regexp_extract(F.col('Size'), size_pattern, 2)
size_mb = (
    F.when(size_suffix == 'k', size_number / 1000)
    .when(size_suffix == 'M', size_number)
)

# Compute both statistics in one aggregation, i.e. a single Spark job.
size_stats = df.select(
    F.mean(size_mb).alias('mean'),
    F.stddev(size_mb).alias('std'),
).first()
mean = size_stats['mean']
std = size_stats['std']
print('Size: mean = ' + str(mean) + ', std = ' + str(std))