In [11]:
# To get rid of those blocks of red warnings
import warnings
warnings.filterwarnings("ignore")

# Standard Imports
import numpy as np
from scipy import stats
import pandas as pd
import os
from scipy.stats import spearmanr
from sklearn import metrics
from random import randint
from typing import Dict, List, Optional, Union, cast
from time import sleep
import pyspark
from pydataset import data
from pyspark.sql.functions import col, expr, lit, regexp_extract, regexp_replace
from pyspark.sql.functions import concat, sum, avg, min, max, count, mean, when
from pyspark.sql.functions import asc, desc, month, year, quarter

# Vis Imports
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Modeling Imports
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.feature_selection import f_regression 
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression, RFE
import sklearn.preprocessing
import statsmodels.api as sm
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# NLP Imports
import unicodedata
import re
import json
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

## Create a spark data frame that contains your favorite programming languages.

- The name of the column should be language
- View the schema of the dataframe
- Output the shape of the dataframe
- Show the first 5 records in the dataframe

In [2]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/20 11:36:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
np.random.seed(123)

code_lan = ['python', 'java', 'c', 'sas']

pandas_dataframe = pd.DataFrame(
    dict(language=np.random.choice(code_lan, 20))
)
pandas_dataframe

Unnamed: 0,language
0,c
1,java
2,c
3,c
4,python
5,c
6,c
7,java
8,sas
9,c


In [4]:
df = spark.createDataFrame(pandas_dataframe)
df.show(5)

                                                                                

+--------+
|language|
+--------+
|       c|
|    java|
|       c|
|       c|
|  python|
+--------+
only showing top 5 rows



In [7]:
df.printSchema()

root
 |-- language: string (nullable = true)



In [9]:
print((df.count(), len(df.columns)))


[Stage 2:>                                                        (0 + 10) / 10]

(20, 1)


                                                                                

## Load the mpg dataset as a spark dataframe.

- Create 1 column of output that contains a message like the one below:
    - The 1999 audi a4 has a 4 cylinder engine.
    
For each vehicle.
- Transform the trans column so that it only contains either manual or auto.

In [12]:
mpg = spark.createDataFrame(data("mpg"))
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [18]:
mpg.select(concat(lit("The "), mpg.year, lit(" "), mpg.manufacturer, lit(" "), mpg.model, 
                  lit(" has a "), mpg.cyl, lit(" cylinder ")).alias("col")).show(5)


+--------------------+
|                 col|
+--------------------+
|The 1999 audi a4 ...|
|The 1999 audi a4 ...|
|The 2008 audi a4 ...|
|The 2008 audi a4 ...|
|The 1999 audi a4 ...|
+--------------------+
only showing top 5 rows



In [21]:
mpg.select(regexp_extract("trans", r"(\w+)", 1).alias("trans")).show(truncate=False)

+------+
|trans |
+------+
|auto  |
|manual|
|manual|
|auto  |
|auto  |
|manual|
|auto  |
|manual|
|auto  |
|manual|
|auto  |
|auto  |
|manual|
|auto  |
|manual|
|auto  |
|auto  |
|auto  |
|auto  |
|auto  |
+------+
only showing top 20 rows



In [22]:
mpg.select(mpg.trans, when(mpg.trans.like('%auto%'), "auto")
          .otherwise('manual').alias('trans')).show(10)

+----------+------+
|     trans| trans|
+----------+------+
|  auto(l5)|  auto|
|manual(m5)|manual|
|manual(m6)|manual|
|  auto(av)|  auto|
|  auto(l5)|  auto|
|manual(m5)|manual|
|  auto(av)|  auto|
|manual(m5)|manual|
|  auto(l5)|  auto|
|manual(m6)|manual|
+----------+------+
only showing top 10 rows



## Load the tips dataset as a spark dataframe.

- What percentage of observations are smokers?
- Create a column that contains the tip percentage
- Calculate the average tip percentage for each combination of sex and smoker.

In [23]:
tips = spark.createDataFrame(data('tips'))


In [26]:
tips.select((count(tips.smoker == 'Yes') / (len(tips.columns))).alias('percentage_smokers')).show()


+------------------+
|percentage_smokers|
+------------------+
|34.857142857142854|
+------------------+



In [25]:
tips.select(tips.total_bill, tips.tip, (tips.tip / tips.total_bill).alias('tip_percentage')).show()


+----------+----+-------------------+
|total_bill| tip|     tip_percentage|
+----------+----+-------------------+
|     16.99|1.01|0.05944673337257211|
|     10.34|1.66|0.16054158607350097|
|     21.01| 3.5|0.16658733936220846|
|     23.68|3.31| 0.1397804054054054|
|     24.59|3.61|0.14680764538430255|
|     25.29|4.71|0.18623962040332148|
|      8.77| 2.0|0.22805017103762829|
|     26.88|3.12|0.11607142857142858|
|     15.04|1.96|0.13031914893617022|
|     14.78|3.23| 0.2185385656292287|
|     10.27|1.71| 0.1665043816942551|
|     35.26| 5.0|0.14180374361883155|
|     15.42|1.57|0.10181582360570687|
|     18.43| 3.0|0.16277807921866522|
|     14.83|3.02|0.20364126770060686|
|     21.58|3.92|0.18164967562557924|
|     10.33|1.67| 0.1616650532429816|
|     16.29|3.71|0.22774708410067526|
|     16.97| 3.5|0.20624631703005306|
|     20.65|3.35|0.16222760290556903|
+----------+----+-------------------+
only showing top 20 rows



In [27]:
tips.rollup('sex', 'smoker').agg(mean((tips.tip / tips.total_bill)).alias('average_tip_percentage')).show()


[Stage 20:>                                                       (0 + 10) / 10]

+------+------+----------------------+
|   sex|smoker|average_tip_percentage|
+------+------+----------------------+
|Female|  null|   0.16649073632892478|
|  Male|    No|    0.1606687151291298|
|  null|  null|   0.16080258172250475|
|Female|    No|   0.15692097076918363|
|  Male|  null|   0.15765054700429745|
|  Male|   Yes|    0.1527711752024851|
|Female|   Yes|   0.18215035269941032|
+------+------+----------------------+



                                                                                

## Use the seattle weather dataset referenced in the lesson to answer the questions below.

- Convert the temperatures to fahrenheit.
- Which month has the most rain, on average?
- Which year was the windiest?
- What is the most frequent type of weather in January?
- What is the average high and low temperature on sunny days in July in 2013 and 2014?
- What percentage of days were rainy in q3 of 2015?
- For each year, find what percentage of days it rained (had non-zero precipitation).

In [28]:
from vega_datasets import data
weather = data.seattle_weather().assign(date=lambda df: df.date.astype(str))
weather = spark.createDataFrame(weather)

In [29]:
weather.select((weather.temp_max * 9/5 + 32).alias('temp_max_f'), 
               (weather.temp_min * 9/5 + 32).alias('temp_min_f')).show(10)

+------------------+----------+
|        temp_max_f|temp_min_f|
+------------------+----------+
|             55.04|      41.0|
|             51.08|     37.04|
|             53.06|     44.96|
|             53.96|     42.08|
|             48.02|     37.04|
|             39.92|     35.96|
|             44.96|     37.04|
|              50.0|     37.04|
|             48.92|      41.0|
|42.980000000000004|     33.08|
+------------------+----------+
only showing top 10 rows



In [31]:
(weather.withColumn("month", month("date"))
    .groupBy("month")
    .agg(mean("precipitation").alias("mean_rainfall"))
    .sort(desc("mean_rainfall"))
    .show(1))

[Stage 27:>                                                       (0 + 10) / 10]

+-----+-----------------+
|month|    mean_rainfall|
+-----+-----------------+
|   11|5.354166666666667|
+-----+-----------------+
only showing top 1 row



                                                                                

In [32]:
(weather.withColumn("year", year("date"))
    .groupBy("year")
    .agg(mean("wind").alias("annual_wind"))
    .sort(desc("annual_wind"))
    .show(1)
)

[Stage 30:>                                                       (0 + 10) / 10]

+----+-----------------+
|year|      annual_wind|
+----+-----------------+
|2012|3.400819672131148|
+----+-----------------+
only showing top 1 row



                                                                                

In [33]:
(weather.filter(month("date") == 1)
    .groupBy("weather")
    .agg(count("weather").alias("count_weather"))
    .sort(desc("count_weather"))
    .show(1)
)

[Stage 33:>                                                       (0 + 10) / 10]

+-------+-------------+
|weather|count_weather|
+-------+-------------+
|    fog|           38|
+-------+-------------+
only showing top 1 row



                                                                                

In [34]:
(weather.filter(month("date") == 7)
    .filter((year('date') == 2013) | (year('date') == 2014))
    .filter(weather.weather == 'sun')
    .groupBy(year('date'))
    .agg(mean("temp_max"), mean('temp_min'))
    .show()
)

+----------+------------------+------------------+
|year(date)|     avg(temp_max)|     avg(temp_min)|
+----------+------------------+------------------+
|      2013|26.585185185185193|13.981481481481483|
|      2014|            27.092|14.400000000000002|
+----------+------------------+------------------+



In [35]:
(weather.filter(year("date") == 2015)
    .filter(quarter('date') == 3)
    .withColumn("rainy", (weather.weather == 'rain').cast("int"))
    .groupBy(quarter('date'))
    .agg(mean('rainy') * 100)
    .show()
)

+-------------+------------------+
|quarter(date)|(avg(rainy) * 100)|
+-------------+------------------+
|            3|2.1739130434782608|
+-------------+------------------+



In [67]:
(weather.withColumn("precipitation", (weather.precipitation != 0).cast("int"))
 .withColumn("year", year("date"))
 .groupBy("year")
 .agg((sum("precipitation")* 100) / 365)
 .show())




+----+----------------------------------+
|year|((sum(precipitation) * 100) / 365)|
+----+----------------------------------+
|2012|                 48.49315068493151|
|2013|                 41.64383561643836|
|2014|                  41.0958904109589|
|2015|                 39.45205479452055|
+----+----------------------------------+



                                                                                