In [1]:
import os 
user_name = os.environ.get('USER')

import random
port_name = random.randint(4000, 4999)
port_name

from pyspark.sql import SparkSession

spark = SparkSession \
.builder \
.master('yarn-client') \
.config('spark.driver.memory', '1g') \
.config('spark.executor.memory', '1g') \
.config('spark.ui.port', port_name) \
.appName(f'ml_survey_{user_name}') \
.getOrCreate()

In [4]:
table_name = "survey"
spark.sql(f"use {user_name}")
spark.sql("show tables").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|lksiezak|     brac|      false|
|lksiezak|   survey|      false|
+--------+---------+-----------+



In [10]:
original_df = spark.sql(f"SELECT *, CAST(ConvertedComp>60000 as string) AS CompAboveAvg\
                          FROM {table_name}\
                          WHERE ConvertedComp IS NOT NULL")

In [11]:
original_df.limit(5).toPandas()

Unnamed: 0,Respondent,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,...,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase,CompAboveAvg
0,3,"I am not primarily a developer, but I write co...",Yes,Never,The quality of OSS and closed source software ...,Employed full-time,Thailand,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Web development or web design,...,Tech meetups or events in your area;Courses on...,28.0,Man,No,Straight / Heterosexual,,Yes,Appropriate in length,Neither easy nor difficult,False
1,4,I am a developer by profession,No,Never,The quality of OSS and closed source software ...,Employed full-time,United States,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",...,Tech articles written by other developers;Indu...,22.0,Man,No,Straight / Heterosexual,White or of European descent,No,Appropriate in length,Easy,True
2,6,"I am not primarily a developer, but I write co...",Yes,Never,The quality of OSS and closed source software ...,Employed full-time,Canada,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Mathematics or statistics,...,Tech articles written by other developers;Indu...,28.0,Man,No,Straight / Heterosexual,East Asian,No,Too long,Neither easy nor difficult,True
3,9,I am a developer by profession,Yes,Once a month or more often,The quality of OSS and closed source software ...,Employed full-time,New Zealand,No,Some college/university study without earning ...,"Computer science, computer engineering, or sof...",...,,23.0,Man,No,Bisexual,White or of European descent,No,Appropriate in length,Neither easy nor difficult,True
4,10,I am a developer by profession,Yes,Once a month or more often,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,India,No,"Master’s degree (MA, MS, M.Eng., MBA, etc.)",,...,Tech articles written by other developers;Tech...,,,,,,Yes,Too long,Difficult,False


In [18]:
feature_columns = ["OpSys", "EdLevel", "MainBranch", "Country", "Student", "YearsCode"]
y = "CompAboveAvg"

In [14]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder

In [21]:
stringindexer_stages = []

for c in feature_columns:
    stringindexer_stages.append(StringIndexer(inputCol=c, outputCol="strindex_"+c).setHandleInvalid("keep"))
    
stringindexer_stages.append(StringIndexer(inputCol=y, outputCol="label").setHandleInvalid("keep"))

In [23]:
stringindexer_stages_same = [StringIndexer(inputCol=c, outputCol="strindex_"+c).setHandleInvalid("keep") for c in feature_columns] 
stringindexer_stages_same.append(StringIndexer(inputCol=y, outputCol="label").setHandleInvalid("keep"))

[StringIndexer_cb97861cc62d,
 StringIndexer_412ecc3bdd26,
 StringIndexer_43a981f82e14,
 StringIndexer_e98c76f5f388,
 StringIndexer_d6b3f1b4358a,
 StringIndexer_ad0601683ebc]

In [24]:
Pipeline(stages=stringindexer_stages).fit(original_df).transform(original_df).toPandas()

Unnamed: 0,Respondent,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,...,SurveyLength,SurveyEase,CompAboveAvg,strindex_OpSys,strindex_EdLevel,strindex_MainBranch,strindex_Country,strindex_Student,strindex_YearsCode,label
0,3,"I am not primarily a developer, but I write co...",Yes,Never,The quality of OSS and closed source software ...,Employed full-time,Thailand,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Web development or web design,...,Appropriate in length,Neither easy nor difficult,false,2.0,0.0,1.0,57.0,0.0,10.0,0.0
1,4,I am a developer by profession,No,Never,The quality of OSS and closed source software ...,Employed full-time,United States,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",...,Appropriate in length,Easy,true,0.0,0.0,0.0,0.0,0.0,10.0,1.0
2,6,"I am not primarily a developer, but I write co...",Yes,Never,The quality of OSS and closed source software ...,Employed full-time,Canada,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Mathematics or statistics,...,Too long,Neither easy nor difficult,true,0.0,0.0,1.0,4.0,0.0,13.0,1.0
3,9,I am a developer by profession,Yes,Once a month or more often,The quality of OSS and closed source software ...,Employed full-time,New Zealand,No,Some college/university study without earning ...,"Computer science, computer engineering, or sof...",...,Appropriate in length,Neither easy nor difficult,true,1.0,2.0,0.0,31.0,0.0,8.0,1.0
4,10,I am a developer by profession,Yes,Once a month or more often,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,India,No,"Master’s degree (MA, MS, M.Eng., MBA, etc.)",,...,Too long,Difficult,false,0.0,1.0,0.0,2.0,0.0,8.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55818,88878,I am a developer by profession,Yes,Less than once per year,The quality of OSS and closed source software ...,Employed full-time,United States,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",...,Appropriate in length,Easy,true,1.0,0.0,0.0,0.0,0.0,8.0,1.0
55819,88879,I am a developer by profession,Yes,Never,The quality of OSS and closed source software ...,Employed full-time,Finland,No,"Master’s degree (MA, MS, M.Eng., MBA, etc.)","Computer science, computer engineering, or sof...",...,Appropriate in length,Easy,true,0.0,1.0,0.0,30.0,0.0,18.0,1.0
55820,88881,I am a developer by profession,Yes,Once a month or more often,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,Austria,No,"Master’s degree (MA, MS, M.Eng., MBA, etc.)","Computer science, computer engineering, or sof...",...,Appropriate in length,Easy,true,1.0,1.0,0.0,17.0,0.0,14.0,1.0
55821,88882,I am a developer by profession,Yes,Never,"OSS is, on average, of LOWER quality than prop...",Employed full-time,Netherlands,"Yes, full-time","Master’s degree (MA, MS, M.Eng., MBA, etc.)","Computer science, computer engineering, or sof...",...,Too long,Easy,true,2.0,1.0,0.0,10.0,1.0,0.0,1.0


In [26]:
oneHotEncoder_stages = [OneHotEncoder(inputCol="strindex_"+c, outputCol="onehot_"+c) for c in feature_columns]


In [None]:
Pipeline(stages)