## Advanced Applications of Select

In [1]:
import pandas as pd
from dfply import *

In [6]:
!pip install pyspark



In [7]:
!pip install toolz



## Outline

* `select` helper functions
* Basic `filter` format

# Reading a `pandas.DataFrame` into `pyspark`

Sometimes it is useful to preprocess data in `pandas` before reading the data frame into `pyspark`.  To make this work, use `more_pyspark.get_spark_types` to convert the `dtypes` defined in `pandas` into a `pyspark` schema, and pass that schema along when creating the `pyspark` data frame.

#### Step 1.  Read/preprocess the data in `pandas`

In [2]:
# Load the raw survey responses from the CSV file.
survey_raw = pd.read_csv("./data/health_survey.csv")
# Show the raw column labels; note the leading 'Unnamed: 0' column
# (presumably a saved row index -- confirm) and the dotted duplicates such as 'F1.1'.
survey_raw.columns

Index(['Unnamed: 0', 'F1', 'F5', 'F2', 'F1.1', 'F2.1', 'F6', 'F4', 'F3',
       'F5.1', 'F1.2', 'F2.2', 'F6.1', 'F2.3', 'F4.1', 'F2.4', 'F5.2', 'F2.5',
       'F6.2', 'F1.3', 'F2.6', 'F5.3', 'F4.2', 'F2.7', 'F3.1', 'F2.8', 'F5.4',
       'F3.2', 'F1.4', 'F3.3', 'F1.5', 'F5.5', 'F6.3', 'F1.6', 'F5.6', 'F2.9',
       'F3.4', 'F4.3', 'F2.10', 'F1.7', 'F6.4', 'F4.4', 'F5.7', 'F3.5',
       'F2.11'],
      dtype='object')

In [3]:
def fix_name(n):
    """Return column label `n` lower-cased, with every '.' replaced by '_'."""
    return n.lower().replace('.', '_')


# Drop the 'Unnamed: 0' column, give the remaining columns cleaned names, and add
# an `id` column taken from the frame's index.
# `rename` takes new_name=old_name pairs; the mapping skips 'Unnamed: 0' because
# that column has already been removed by the `select` step.
survey = (survey_raw >>
           select(~X['Unnamed: 0']) >>
           rename(**{fix_name(n): n
                     for n in survey_raw.columns
                     if n != 'Unnamed: 0'}) >>
           mutate(id = X.index))
survey.head(2)

Unnamed: 0,f1,f5,f2,f1_1,f2_1,f6,f4,f3,f5_1,f1_2,...,f3_4,f4_3,f2_10,f1_7,f6_4,f4_4,f5_7,f3_5,f2_11,id
0,Somewhat Agree,Somewhat Disagree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Disagree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,...,Somewhat Disagree,Neither Agree nor Disagree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,0
1,Somewhat Agree,Somewhat Disagree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Disagree,Somewhat Agree,Neither Agree nor Disagree,Neither Agree nor Disagree,Somewhat Agree,...,Somewhat Agree,Neither Agree nor Disagree,Somewhat Agree,Somewhat Agree,Somewhat Disagree,Neither Agree nor Disagree,Somewhat Agree,Neither Agree nor Disagree,Somewhat Agree,1


#### Step 2. Convert the `dtypes` into `spark` types

In [9]:
from toolz import merge
from pyspark.sql.types import IntegerType, StringType, StructType, StructField
from more_pyspark import get_spark_types

# Build the pyspark schema (a StructType, see the output) for the cleaned pandas frame.
spark_type = get_spark_types(survey)
# Echo the schema; the default repr is a single long line.
spark_type

StructType([StructField('f1', StringType(), True), StructField('f5', StringType(), True), StructField('f2', StringType(), True), StructField('f1_1', StringType(), True), StructField('f2_1', StringType(), True), StructField('f6', StringType(), True), StructField('f4', StringType(), True), StructField('f3', StringType(), True), StructField('f5_1', StringType(), True), StructField('f1_2', StringType(), True), StructField('f2_2', StringType(), True), StructField('f6_1', StringType(), True), StructField('f2_3', StringType(), True), StructField('f4_1', StringType(), True), StructField('f2_4', StringType(), True), StructField('f5_2', StringType(), True), StructField('f2_5', StringType(), True), StructField('f6_2', StringType(), True), StructField('f1_3', StringType(), True), StructField('f2_6', StringType(), True), StructField('f5_3', StringType(), True), StructField('f4_2', StringType(), True), StructField('f2_7', StringType(), True), StructField('f3_1', StringType(), True), StructField('f2_

In [10]:
## Let's make that a bit more readable

from composable.string import replace
from composable import pipeable

# Wrap the builtins with `pipeable` so they can be used on the right-hand side of `>>` below.
str_ = pipeable(str)
print_ = pipeable(print)

# Same schema as computed in the previous cell, rebuilt here.
spark_type = get_spark_types(survey)
# Stringify the schema, start every StructField on its own indented line, then print it.
# NOTE(review): the result is bound to `_`, presumably only to keep the cell from echoing
# a value -- confirm; assigning `_` may interfere with IPython's last-output variable.
_ = (spark_type 
     >> str_ 
     >> replace('StructField', '\n    StructField')
     >> print_)

StructType([
    StructField('f1', StringType(), True), 
    StructField('f5', StringType(), True), 
    StructField('f2', StringType(), True), 
    StructField('f1_1', StringType(), True), 
    StructField('f2_1', StringType(), True), 
    StructField('f6', StringType(), True), 
    StructField('f4', StringType(), True), 
    StructField('f3', StringType(), True), 
    StructField('f5_1', StringType(), True), 
    StructField('f1_2', StringType(), True), 
    StructField('f2_2', StringType(), True), 
    StructField('f6_1', StringType(), True), 
    StructField('f2_3', StringType(), True), 
    StructField('f4_1', StringType(), True), 
    StructField('f2_4', StringType(), True), 
    StructField('f5_2', StringType(), True), 
    StructField('f2_5', StringType(), True), 
    StructField('f6_2', StringType(), True), 
    StructField('f1_3', StringType(), True), 
    StructField('f2_6', StringType(), True), 
    StructField('f5_3', StringType(), True), 
    StructField('f4_2', StringTyp

#### Step 3. Create the `pyspark` data frame

In [11]:
from pyspark.sql import SparkSession

# Get (or create) the Spark session used by the rest of the notebook; the app is named 'Ops'.
spark = (
    SparkSession
    .builder
    .appName('Ops')
    .getOrCreate()
)

22/11/02 19:59:52 WARN Utils: Your hostname, jt7372wd222 resolves to a loopback address: 127.0.1.1; using 172.30.78.219 instead (on interface eth0)
22/11/02 19:59:52 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/02 19:59:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [12]:
# Create the Spark data frame from the pandas frame, using the schema built in Step 2.
survey_spark = spark.createDataFrame(
    survey,
    schema=spark_type,
)
survey_spark

DataFrame[f1: string, f5: string, f2: string, f1_1: string, f2_1: string, f6: string, f4: string, f3: string, f5_1: string, f1_2: string, f2_2: string, f6_1: string, f2_3: string, f4_1: string, f2_4: string, f5_2: string, f2_5: string, f6_2: string, f1_3: string, f2_6: string, f5_3: string, f4_2: string, f2_7: string, f3_1: string, f2_8: string, f5_4: string, f3_2: string, f1_4: string, f3_3: string, f1_5: string, f5_5: string, f6_3: string, f1_6: string, f5_6: string, f2_9: string, f3_4: string, f4_3: string, f2_10: string, f1_7: string, f6_4: string, f4_4: string, f5_7: string, f3_5: string, f2_11: string, id: int]

# `pyspark` helpers from `more_pyspark`

I have created `select` helpers for `pyspark` that are similar to the `dfply` helpers such as `columns_from`.

In [13]:
from more_pyspark import all_but, col_startswith, col_endswith, col_contains, cols_to, cols_from, cols_between, to_pandas

## `pyspark` helper details

* The helpers require the list of column names
* Two inline forms
    * Function call: `col_startswith('f1',survey_spark.columns)`
    * Embedded pipe: `survey_spark.columns >>  col_startswith('f1')`
* Or save the selected columns to a variable first

#### Function call

In [15]:
# Function-call form: the column names are passed to the helper as its second argument.
(
    survey_spark
    .select(col_startswith('f1', survey_spark.columns))
    .take(2)
    >> to_pandas
)

Unnamed: 0,f1,f1_1,f1_2,f1_3,f1_4,f1_5,f1_6,f1_7
0,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Neither Agree nor Disagree,Somewhat Agree,Somewhat Agree,Somewhat Agree
1,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Neither Agree nor Disagree,Somewhat Agree,Somewhat Agree


#### Embedded pipe

In [16]:
# Embedded-pipe form: the column names are piped into the helper inside the select call.
(
    survey_spark
    .select(survey_spark.columns >> col_startswith('f1'))
    .take(2)
    >> to_pandas
)


Unnamed: 0,f1,f1_1,f1_2,f1_3,f1_4,f1_5,f1_6,f1_7
0,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Neither Agree nor Disagree,Somewhat Agree,Somewhat Agree,Somewhat Agree
1,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Neither Agree nor Disagree,Somewhat Agree,Somewhat Agree


#### Save first

In [17]:

# Save-first form: compute the selection once, then hand the saved result to select.
f1_cols = survey_spark.columns >> col_startswith('f1')

(
    survey_spark
    .select(f1_cols)
    .take(2)
    >> to_pandas
)

Unnamed: 0,f1,f1_1,f1_2,f1_3,f1_4,f1_5,f1_6,f1_7
0,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Neither Agree nor Disagree,Somewhat Agree,Somewhat Agree,Somewhat Agree
1,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Neither Agree nor Disagree,Somewhat Agree,Somewhat Agree


#### `all_but`

In [18]:
# Keep every column except 'f1'; as the output shows, names that merely start with
# 'f1' (such as f1_1) are still kept.
(
    survey_spark
    .select(survey_spark.columns >> all_but('f1'))
    .take(2)
    >> to_pandas
)

Unnamed: 0,f5,f2,f1_1,f2_1,f6,f4,f3,f5_1,f1_2,f2_2,...,f3_4,f4_3,f2_10,f1_7,f6_4,f4_4,f5_7,f3_5,f2_11,id
0,Somewhat Disagree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Disagree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,...,Somewhat Disagree,Neither Agree nor Disagree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,0
1,Somewhat Disagree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Disagree,Somewhat Agree,Neither Agree nor Disagree,Neither Agree nor Disagree,Somewhat Agree,Neither Agree nor Disagree,...,Somewhat Agree,Neither Agree nor Disagree,Somewhat Agree,Somewhat Agree,Somewhat Disagree,Neither Agree nor Disagree,Somewhat Agree,Neither Agree nor Disagree,Somewhat Agree,1


#### Composing `all_but`

In [19]:
# First collect the columns whose name contains '_', then keep everything except those.
has_underscore = survey_spark.columns >> col_contains('_')
no_underscore = survey_spark.columns >> all_but(has_underscore)

(
    survey_spark
    .select(no_underscore)
    .take(2)
    >> to_pandas
)

Unnamed: 0,f1,f5,f2,f6,f4,f3,id
0,Somewhat Agree,Somewhat Disagree,Somewhat Agree,Somewhat Disagree,Somewhat Agree,Somewhat Agree,0
1,Somewhat Agree,Somewhat Disagree,Somewhat Agree,Somewhat Disagree,Somewhat Agree,Neither Agree nor Disagree,1


#### Ends with

In [20]:
# Select the columns whose name ends with '_2'.
(
    survey_spark
    .select(survey_spark.columns >> col_endswith('_2'))
    .take(2)
    >> to_pandas
)

Unnamed: 0,f1_2,f2_2,f5_2,f6_2,f4_2,f3_2
0,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree
1,Somewhat Agree,Neither Agree nor Disagree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Neither Agree nor Disagree


#### Contains

In [21]:
# Select the columns whose name contains the substring '6_'.
(
    survey_spark
    .select(survey_spark.columns >> col_contains('6_'))
    .take(2)
    >> to_pandas
)

Unnamed: 0,f6_1,f6_2,f6_3,f6_4
0,Somewhat Agree,Somewhat Agree,Somewhat Disagree,Somewhat Agree
1,Somewhat Agree,Somewhat Agree,Somewhat Disagree,Somewhat Disagree


#### Columns up to

In [22]:
# Select from the first column up to 'f1_1'; the output shows 'f1_1' itself is included.
(
    survey_spark
    .select(survey_spark.columns >> cols_to('f1_1'))
    .take(2)
    >> to_pandas
)

Unnamed: 0,f1,f5,f2,f1_1
0,Somewhat Agree,Somewhat Disagree,Somewhat Agree,Somewhat Agree
1,Somewhat Agree,Somewhat Disagree,Somewhat Agree,Somewhat Agree


#### Columns from

In [23]:
# Select from 'f6_4' through the last column; the output shows 'f6_4' itself is included.
(
    survey_spark
    .select(survey_spark.columns >> cols_from('f6_4'))
    .take(2)
    >> to_pandas
)

Unnamed: 0,f6_4,f4_4,f5_7,f3_5,f2_11,id
0,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,0
1,Somewhat Disagree,Neither Agree nor Disagree,Somewhat Agree,Neither Agree nor Disagree,Somewhat Agree,1


#### Columns between

In [24]:
# Select the run of columns from 'f1_1' to 'f2_2'; the output shows both ends are included.
(
    survey_spark
    .select(survey_spark.columns >> cols_between('f1_1', 'f2_2'))
    .take(2)
    >> to_pandas
)

Unnamed: 0,f1_1,f2_1,f6,f4,f3,f5_1,f1_2,f2_2
0,Somewhat Agree,Somewhat Agree,Somewhat Disagree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree
1,Somewhat Agree,Somewhat Agree,Somewhat Disagree,Somewhat Agree,Neither Agree nor Disagree,Neither Agree nor Disagree,Somewhat Agree,Neither Agree nor Disagree


## <font color="red"> Exercise 6.1.1 </font> 

Use the `pyspark` `select` helper functions to create a table that contains all `F2` and `F3` questions.  


**Hint:** `select` helpers return intentions that turn into lists.  Think about how you can combine two lists.

In [51]:
# Your code here
# Collect the F2 and the F3 columns separately, then concatenate the two selections.
# NOTE(review): col_contains matches the text anywhere in the name; col_startswith
# would express "all F2 / F3 questions" more precisely -- same result on this data.
f2 = survey_spark.columns >> col_contains('f2')
f3 = survey_spark.columns >> col_contains('f3')
contains_both = f2 + f3

(
    survey_spark
    .select(contains_both)
    .take(2)
    >> to_pandas
)

Unnamed: 0,f2,f2_1,f2_2,f2_3,f2_4,f2_5,f2_6,f2_7,f2_8,f2_9,f2_10,f2_11,f3,f3_1,f3_2,f3_3,f3_4,f3_5
0,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Disagree,Somewhat Agree,Somewhat Agree,Somewhat Disagree,Somewhat Agree
1,Somewhat Agree,Somewhat Agree,Neither Agree nor Disagree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Somewhat Agree,Neither Agree nor Disagree,Neither Agree nor Disagree,Neither Agree nor Disagree,Neither Agree nor Disagree,Somewhat Agree,Neither Agree nor Disagree
