In [1]:
# library imports
import pandas as pd
import pyspark
import findspark
findspark.init()
from pyspark import SparkContext as sc
from pyspark.sql import SparkSession
from  pyspark.sql import functions as F

In [2]:
# Build (or reuse) the local Spark session for this notebook.
# Going through the builder avoids calling the imported `sc` class alias after it
# has been rebound to an instance, so this cell can be re-run safely instead of
# failing with "'SparkContext' object is not callable".
spark = (
    SparkSession.builder
    .master('local')
    .appName('employee-survey')
    .getOrCreate()
)
# Keep `sc` bound to the active SparkContext, as the original cell did.
sc = spark.sparkContext

In [3]:
# Load the survey responses; the first CSV row supplies the column names.
df = spark.read.option('header', True).csv('survey.csv')

In [4]:
# Preview the loaded frame (first 20 rows, long cell values truncated).
df.show(n=20, truncate=True)

+-------------------+---+------+--------------+-----+-------------+--------------+---------+--------------+--------------+-----------+------------+----------+------------+----------------+----------+----------+------------------+-------------------------+-----------------------+------------+------------+-----------------------+---------------------+------------------+---------------+--------------------+
|          Timestamp|Age|Gender|       Country|state|self_employed|family_history|treatment|work_interfere|  no_employees|remote_work|tech_company|  benefits|care_options|wellness_program| seek_help| anonymity|             leave|mental_health_consequence|phys_health_consequence|   coworkers|  supervisor|mental_health_interview|phys_health_interview|mental_vs_physical|obs_consequence|            comments|
+-------------------+---+------+--------------+-----+-------------+--------------+---------+--------------+--------------+-----------+------------+----------+------------+-------------

In [5]:
# Keep only respondents who left a comment: rows whose `comments` value is the
# literal string 'NA' are removed (null comments do not satisfy the comparison either).
df = df.where(df['comments'] != 'NA')

In [6]:
# Normalise types and labels: Age is cast to integer, Gender is lower-cased.
df = (
    df.withColumn('Age', F.col('Age').cast('integer'))
      .withColumn('Gender', F.lower(F.col('Gender')))
)

# Collapse the short/alternate spellings onto 'male' / 'female'.
# The trailing .otherwise(...) keeps every value that is not remapped; without it
# F.when(...) yields null for unmatched rows, which would blank out answers that
# already read 'male' or 'female'.
df = df.withColumn(
    'Gender',
    F.when(F.col('Gender') == 'm', 'male')
     .when(F.col('Gender').isin('f', 'woman'), 'female')
     .otherwise(F.col('Gender')),
)

df.show()

+-------------------+---+------+--------------+-----+-------------+--------------+---------+--------------+--------------+-----------+------------+----------+------------+----------------+----------+----------+------------------+-------------------------+-----------------------+------------+------------+-----------------------+---------------------+------------------+---------------+--------------------+
|          Timestamp|Age|Gender|       Country|state|self_employed|family_history|treatment|work_interfere|  no_employees|remote_work|tech_company|  benefits|care_options|wellness_program| seek_help| anonymity|             leave|mental_health_consequence|phys_health_consequence|   coworkers|  supervisor|mental_health_interview|phys_health_interview|mental_vs_physical|obs_consequence|            comments|
+-------------------+---+------+--------------+-----+-------------+--------------+---------+--------------+--------------+-----------+------------+----------+------------+-------------

In [12]:
# List the unique free-text comments (default preview of 20 truncated rows).
df.select(F.col('comments')).distinct().show(n=20, truncate=True)

+--------------------+
|            comments|
+--------------------+
|Some of these sho...|
|I'm diagnosed wit...|
|We don't fucking ...|
|Though it doesn't...|
|My company does p...|
|Many of these que...|
|Too many people e...|
|I burnt out this ...|
|I mostly suffer f...|
|When you are an i...|
|suffer from CR-PT...|
|Since being advis...|
|Thank you for you...|
|I have an excepti...|
|as a UK-based com...|
|My employer does ...|
|While I have not ...|
|Autism is a bitch...|
|I went through a ...|
|Hi Ed it's Paul D...|
+--------------------+
only showing top 20 rows



In [14]:
# Add a 1-based surrogate key and place it in front of the survey columns.
#
# The column order is derived from the frame itself instead of a hand-maintained
# list of names, so the cell stays correct if the CSV schema changes. Any existing
# `user_id` column is skipped, so re-running the cell does not produce a duplicate.
#
# NOTE: monotonically_increasing_id() guarantees unique, increasing ids, but they
# are only consecutive within a single partition; with more than one partition the
# ids will contain gaps.
df = df.select(
    (F.monotonically_increasing_id() + 1).alias('user_id'),
    *[name for name in df.columns if name != 'user_id'],
)

In [22]:
# Show only the rows whose `comments` value is not null.
df.where(F.col('comments').isNotNull()).show()

+-------+-------------------+---+------+--------------+-----+-------------+--------------+---------+--------------+--------------+-----------+------------+----------+------------+----------------+----------+----------+------------------+-------------------------+-----------------------+------------+------------+-----------------------+---------------------+------------------+---------------+--------------------+
|user_id|          Timestamp|Age|Gender|       Country|state|self_employed|family_history|treatment|work_interfere|  no_employees|remote_work|tech_company|  benefits|care_options|wellness_program| seek_help| anonymity|             leave|mental_health_consequence|phys_health_consequence|   coworkers|  supervisor|mental_health_interview|phys_health_interview|mental_vs_physical|obs_consequence|            comments|
+-------+-------------------+---+------+--------------+-----+-------------+--------------+---------+--------------+--------------+-----------+------------+----------+--

### Conduct String Cleaning

In [24]:
# Narrow the frame to the respondent id and the free-text comment.
df2 = df.select('user_id', 'comments')
df2.show(n=20, truncate=True)

+-------+--------------------+
|user_id|            comments|
+-------+--------------------+
|      1|I'm not on my com...|
|      2|I have chronic lo...|
|      3|My company does p...|
|      4|Relatively new jo...|
|      5|Sometimes I think...|
|      6|I selected my cur...|
|      7|Our health plan h...|
|      8|I just started a ...|
|      9|In addition to my...|
|     10|Thanks for doing ...|
|     11|In Russia we have...|
|     12|In my previous wo...|
|     13|I've seen negativ...|
|     14|I'm not a permane...|
|     15|I'd be more worri...|
|     16|Had a co-worker d...|
|     17|Family history of...|
|     18|I feel that my em...|
|     19|Many of these que...|
|     20|as a UK-based com...|
+-------+--------------------+
only showing top 20 rows



In [34]:
from sparknlp.base import DocumentAssembler, Pipeline
from sparknlp.annotator import SentenceDetector, Tokenizer, StopWordsCleaner
from johnsnowlabs import nlp

In [30]:
from py4j.java_gateway import java_import

# Import the classes under org.apache.spark.sql.api.python into the JVM view of
# the running Spark context.
# NOTE(review): presumably a workaround needed by the Spark NLP cells below — confirm.
java_import(spark.sparkContext._jvm, "org.apache.spark.sql.api.python.*")

In [32]:
# # raw text to document annotation
# doc_assembler = (DocumentAssembler()
#                  .setInputCol("comments")
#                  .setOutputCol("doc"))

In [36]:
# Load the "en.stopwords" model reference and run it on a sample sentence,
# asking for sentence-level output.
# NOTE(review): the recorded run of this cell raised an exception while the model
# object was being created; presumably the Spark session was not started with the
# Spark NLP jars — confirm before relying on this cell.
nlp.load("en.stopwords").predict(
    "I'm so fed up with this shit. Give me my VA money already!",
    output_level='sentence',
)

stopwords_iso download started this may take some time.


Exception: Something went wrong during creating the Spark NLP model_anno_obj for your request = en.stopwordsDid you use a NLU Spell?

In [37]:
# Load the 'emotion' model reference and run it on a short sample text.
# NOTE(review): the recorded run of this cell raised an exception while the model
# object was being created; presumably the same session/setup problem as the
# stop-words cell — confirm.
nlp.load('emotion').predict(
    'Wow that easy!',
)

classifierdl_use_emotion download started this may take some time.


Exception: Something went wrong during creating the Spark NLP model_anno_obj for your request = emotionDid you use a NLU Spell?