In [None]:
import numpy as np
import pandas as pd
import re
# Read the CSV as an RDD of raw text lines.
bank_txt = spark.sparkContext.textFile("/data/examples/bank.csv")

# Manual parse of each line:
#   1. remove a double quote at the very start / very end of the line,
#   2. split on ';' together with an optional double quote on either side,
#   3. discard the header row (its first field is the literal "age"),
#   4. keep fields 0-3 plus field 5, casting fields 0 and 5 to int (field 4 is skipped).
bank_columns = ['age', 'job', 'marital', 'education', 'balance']
bank_rows = (
    bank_txt
    .map(lambda line: re.sub(r'(?:^\"|\"$)', '', line))
    .map(lambda line: re.split('\"?;\"?', line))
    .filter(lambda fields: fields[0] != "age")
    .map(lambda fields: [int(fields[0])] + fields[1:4] + [int(fields[5])])
)
bank_df = bank_rows.toDF(bank_columns)
bank_df.printSchema()

In [None]:
# Let Spark's CSV reader handle the header row, the ';' separator, the '"' quote
# character and column type inference.
bank_df2 = spark.read.csv("/data/examples/bank.csv",
                          inferSchema=True, header=True, sep=";", quote='"')
# Preview five rows. The limit is applied on the Spark side so only those rows are
# collected to the driver, rather than converting the whole dataset to pandas first.
bank_df2.limit(5).toPandas()

In [None]:
# Project the five columns of interest out of the full DataFrame.
bank_df = bank_df2.select('age', 'job', 'marital', 'education', 'balance')
# Collect a five-row preview to pandas for display.
bank_df.limit(5).toPandas()

In [None]:
# Register bank_df as the temporary view 'bank_view' so it can be queried with spark.sql.
bank_df.createOrReplaceTempView(name='bank_view')

In [None]:
# Count rows per (education, marital) pair, leaving out rows whose education is 'unknown'.
education_marital_sql = '''SELECT education, marital, count(*) AS count FROM bank_view
WHERE education != 'unknown' GROUP BY education,marital'''
# Run the query and collect the aggregated result into a pandas DataFrame.
em_pdf = spark.sql(education_marital_sql).toPandas()
em_pdf.head(3)

In [None]:
# Reshape long -> wide: one row per education value, one column per marital value,
# with the 'count' column as the cell values.
pdf = pd.pivot(em_pdf, index='education', columns='marital', values='count')
pdf.head(3)

In [None]:
import matplotlib.pyplot as plt
# Stacked bar chart of the pivoted table: one bar per index entry of `pdf`,
# with its columns stacked on top of each other.
pdf.plot.bar(stacked=True)
plt.show()

In [None]:
# Read the CSV with a header row, ';' separator and inferred column types.
# mode="DROPMALFORMED" makes the reader drop rows it cannot parse.
df = (
    spark.read
    .options(header=True, sep=';', mode="DROPMALFORMED", inferSchema=True)
    .csv("/data/examples/bank.csv")
)
# Save as the table default.bank, replacing any existing table of that name.
df.write.saveAsTable("default.bank", mode="overwrite")
df.printSchema()

In [None]:
# List the tables registered in the "default" database and show them as a pandas DataFrame.
default_tables = spark.catalog.listTables("default")
pd.DataFrame(default_tables)

In [None]:
# Read back the table under the same qualified name it was saved with ("default.bank").
# An unqualified "bank" is resolved against the session's current database, so it would
# only find the table while that database happens to be `default`.
bank_df = spark.read.table("default.bank")
# Number of rows per job value.
bank_jobs = bank_df.groupBy("job").count()
# Expose the aggregate to SQL as the temporary view 'bank_jobs'.
bank_jobs.createOrReplaceTempView("bank_jobs")
# Ten jobs with the highest row counts, collected to pandas.
jobs_pd = spark.sql("select * from bank_jobs order by count desc limit 10").toPandas()
jobs_pd.head()

In [None]:
# Bar chart of the row count for each job in jobs_pd.
jobs_pd.plot.bar(x='job', y='count', stacked=True)
plt.show()