In [0]:
# Location of the raw workshop survey responses (tab-separated, with a header row).
file_path = "/Volumes/sql_workshop/sandbox/responses"

# Read the file as tab-delimited CSV, taking column names from the first line.
df = (
    spark.read
    .options(delimiter="\t", header="true")
    .csv(file_path)
)

In [0]:
# Remove unwanted columns, identified by their position in the current column list.
# The indices are positional and df is reassigned below, so re-running this cell
# without reloading df would target different columns.
drop_positions = (1, 14, 15)
columns_to_drop = [df.columns[position] for position in drop_positions]
df = df.drop(*columns_to_drop)

In [0]:
# Map each verbose survey-question header to a short snake_case column name.
# NOTE(review): the Scala header ends in "12" -- presumably a suffix added when the
# file was read (e.g. for a duplicated header); confirm against df.columns.
column_renames = {
    "Timestamp": "timestamp",
    "What most closely describes your current role?": "role",
    "Please rate your experience level with the Databricks platform": "experience",
    "What data warehousing activities apply most to your day-to-day job?": "activities",
    "What do you hope to get out of this workshop?": "hope_to_get",
    "Are there specific topics that you are interested in? If yes, please indicate below.": "topics",
    "Do you have any other comments, feedback, or anything else you think we should know?": "comments",
    "Please rate your technical knowledge of the following languages/technologies: [SQL]": "sql",
    "Please rate your technical knowledge of the following languages/technologies: [Python]": "python",
    "Please rate your technical knowledge of the following languages/technologies: [R]": "r",
    "Please rate your technical knowledge of the following languages/technologies: [Spark]": "spark",
    "Please rate your technical knowledge of the following languages/technologies: [Scala]12": "scala",
    "Would you be remote or in-person?": "remote",
}

# Apply the renames one at a time, in the order listed above.
for old_name, new_name in column_renames.items():
    df = df.withColumnRenamed(old_name, new_name)

display(df)

In [0]:
# Persist the renamed responses as a Delta table, overwriting any existing contents.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("sql_workshop.sandbox.responses_delta")
)

In [0]:
from pyspark.sql.functions import split, explode, col, trim, when

# Role labels that are reported individually; any other value is bucketed as "other".
categories = ["Data Analyst", "Data Engineer", "Hybrid Data Analyst and Data Engineer", "Data scientist"]

# Build the role breakdown in a single pipeline:
#   1. split each comma-separated "role" answer and explode it to one row per entry,
#   2. strip surrounding whitespace from every entry,
#   3. keep recognised categories and relabel the rest as "other",
#   4. count rows per role, most frequent first.
df_roles = (
    df.select(explode(split(trim(col("role")), ",")).alias("role"))
    .withColumn("role", trim(col("role")))
    .withColumn("role", when(col("role").isin(categories), col("role")).otherwise("other"))
    .groupBy("role")
    .count()
    .orderBy(col("count").desc())
)

display(df_roles)

In [0]:
from pyspark.sql.functions import split, explode, col, trim, when

# Build the activity breakdown:
#   1. split each comma-separated "activities" answer and explode it to one row per entry,
#   2. strip surrounding whitespace from every entry,
#   3. count rows per activity.
# Unlike roles, activities are not bucketed into fixed categories; every distinct
# value is counted as-is.
df_activities = (
    df.select(explode(split(trim(col("activities")), ",")).alias("activities"))
    .withColumn("activities", trim(col("activities")))
    .groupBy("activities")
    .count()
    # Show the most frequent activities first, consistent with the role breakdown
    # (previously this sorted by the activity label in reverse alphabetical order).
    .orderBy(col("count").desc())
)

display(df_activities)