In [0]:
# Introduction
# Author: Martand Singh
# Gmail: martandsays@gmail.com
# Facebook: https://www.facebook.com/codemakerz
# Dataset: https://data.world/carlvlewis/u-s-weather-outliers-1964
# In this notebook - we will setup of our enviornment. we will learn:
# 1. How to explore you data & grouping data. 
# 2. How to check data summary
# 3. how to check total count
# 4. How to check column info
# 5. How to visualize basic chart using databricks in-built notebook 

In [0]:
%sql
SELECT
  *
FROM
  weather_part
limit
  10;

degrees_from_mean,id,longitude,latitude,max_temp,min_temp,station_name,type,serialid,date_new
-10.67,USC00293530,-108.2075,33.1975,12.8,-18.3,GILA HOT SPRINGS,Weak Cold,73837,1964-01-10
-9.96,USC00295150,-106.7611,34.7675,9.4,-17.8,LOS LUNAS 3 SSW,Weak Cold,73838,1964-01-10
-9.87,USC00429136,-113.6667,37.3522,5.0,-13.3,VEYO PWR HOUSE,Weak Cold,73839,1964-01-10
-10.9,USC00340256,-95.615,34.2208,11.7,-12.2,ANTLERS,Weak Cold,73840,1964-01-10
-8.5,USC00046635,-116.5097,33.8275,17.2,-2.2,PALM SPRINGS,Weak Cold,73841,1964-01-10
-6.86,USW00023136,-119.0833,34.2167,16.1,-0.6,CAMARILLO AP,Weak Cold,73842,1964-01-10
-9.55,USC00026865,-114.2272,33.665,13.9,-5.6,QUARTZSITE,Weak Cold,73843,1964-01-10
-8.41,USC00029287,-112.7403,33.9792,17.8,-8.3,WICKENBURG,Weak Cold,73844,1964-01-10
-11.71,USC00029334,-109.8369,32.2553,16.7,-14.4,WILLCOX,Weak Cold,73845,1964-01-10
-8.85,USW00023158,-114.7142,33.6186,15.0,-3.3,BLYTHE AP,Weak Cold,73846,1964-01-10


In [0]:
# Check the shape of data - total rows * columns

In [0]:
%sql
-- Check total row counts
SELECT
  COUNT(1) AS total
FROM
  weather_part

total
3196832


In [0]:
%sql
-- check columns
SHOW COLUMNS FROM weather_part;

col_name
degrees_from_mean
id
longitude
latitude
max_temp
min_temp
station_name
type
serialid
date_new


In [0]:
# Using python
# Get shape - this is fine if dataset is smaller
spark.sql("select * from weather_part").toPandas().shape

In [0]:
# for bigger dataframe. Compare the execution time of above statement & this one.
df = spark.sql("select * from weather_part")
(df.count(), len(df.columns))

In [0]:
# Describe dataset to check column properties

In [0]:
%sql 
DESCRIBE weather_part

col_name,data_type,comment
degrees_from_mean,string,
id,string,
longitude,string,
latitude,string,
max_temp,string,
min_temp,string,
station_name,string,
type,string,
serialid,string,
date_new,date,


In [0]:
# python
df.printSchema()

In [0]:
# summary Statistics

In [0]:
# python
display(df.summary())

summary,degrees_from_mean,id,longitude,latitude,max_temp,min_temp,station_name,type,serialid
count,3196832.0,3196832,3196832.0,3196832.0,3196832.0,3196832.0,3196832,3196832,3196832.0
mean,-0.2012911720102908,,-97.81746762297796,39.28313621372672,18.556500497993,5.600243741303888,,,1598416.5
stddev,14.618175581996065,,17.388856738132166,6.431865683932281,14.696107656099484,14.00032951095022,,,922846.052214561
min,-0.01,GQW00041415,-100.0356,13.4836,-0.5,-0.5,ABERDEEN,Strong Cold,1.0
25%,-11.32,,-109.4847,34.9894,10.0,-2.2,,,799048.0
50%,5.47,,-95.975,39.5092,21.1,7.8,,,1598175.0
75%,10.76,,-85.1483,43.5706,30.0,16.7,,,2397459.0
max,92.76,VQW00011640,144.7961,71.2833,9.4,97.2,ZUNI,Weak Hot,999999.0


In [0]:
# Visualization

In [0]:
# Create a new tale which contains data grouped by station names
!ls /dbfs/mnt/weatherdata

In [0]:
%sql

-- Group your data using station_name, type
SELECT
  station_name,
  type as weather_type,
  avg(min_temp) as avg_min,
  avg(max_temp) as avg_max
FROM
  weather_part
GROUP BY
  station_name,
  type
  LIMIT 10

station_name,weather_type,avg_min,avg_max
WICKENBURG,Weak Cold,4.151999999999997,21.34247619047619
PORTLAND INTL AP,Strong Cold,-7.107692307692307,-0.8076923076923075
JENNINGS,Strong Cold,0.5129032258064516,10.110752688172044
WALNUT CREEK,Strong Cold,-15.580851063829789,2.080851063829788
ANACORTES,Strong Cold,-6.852702702702703,0.158108108108108
COLD BAY AP,Weak Hot,5.372919605077575,11.448660084626235
STOCKTON 3 NNE,Weak Hot,13.2390990990991,24.957837837837847
FARMLAND 5 NNW,Weak Hot,14.33352272727273,26.40587121212121
EAST JORDAN,Weak Hot,10.231411530815114,23.02326043737576
SAGINAW MBS INTL AP,Weak Hot,12.661554621848737,24.235504201680676


In [0]:
%sql 
-- We need grouping data frequently, spo create a new delta table which stores the grouped data
-- use CTAS (Create Table AS) statement
CREATE TABLE IF NOT EXISTS weather_grouped USING DELTA LOCATION '/mnt/weather_grouped' AS (
  SELECT
    station_name,
    type as weather_type,
    avg(min_temp) as avg_min,
    avg(max_temp) as avg_max
  FROM
    weather_part
  GROUP BY
    station_name,
    type
)

num_affected_rows,num_inserted_rows


In [0]:
# To visualize run the query and at the bottom level of query output you can see a visualize button. Click that and you can see
# many plot options & chart types.

In [0]:
%sql
SELECT
  *
FROM
  weather_grouped
LIMIT 3000;

station_name,weather_type,avg_min,avg_max
WACO RGNL AP,Weak Hot,20.190688775510203,32.55829081632653
VERNAL 2SW,Weak Hot,7.213333333333333,22.39587301587302
ROOSEVELT ROADS,Weak Cold,20.561832061068703,27.73333333333333
YOSEMITE PARK HQ,Weak Hot,8.05852459016394,25.996885245901623
ANTIOCH PUMPING PLANT #3,Weak Hot,13.959823008849554,28.992035398230087
DODGE,Weak Cold,-10.369483568075117,5.212676056338027
SOLON SPRINGS,Weak Cold,-10.95478036175711,2.222997416020673
HAYFIELD PUMPING PLT,Strong Cold,3.4287878787878774,18.322727272727263
WOODBURY 1 WNW,Weak Cold,-0.3932525951557094,13.435121107266433
COLQUITT 2 W,Weak Cold,6.498507462686565,19.53522388059702


In [0]:
%sql
SELECT
  *
FROM
  weather_grouped
  Limit 3000


station_name,weather_type,avg_min,avg_max
WACO RGNL AP,Weak Hot,20.190688775510203,32.55829081632653
VERNAL 2SW,Weak Hot,7.213333333333333,22.39587301587302
ROOSEVELT ROADS,Weak Cold,20.561832061068703,27.73333333333333
YOSEMITE PARK HQ,Weak Hot,8.05852459016394,25.996885245901623
ANTIOCH PUMPING PLANT #3,Weak Hot,13.959823008849554,28.992035398230087
DODGE,Weak Cold,-10.369483568075117,5.212676056338027
SOLON SPRINGS,Weak Cold,-10.95478036175711,2.222997416020673
HAYFIELD PUMPING PLT,Strong Cold,3.4287878787878774,18.322727272727263
WOODBURY 1 WNW,Weak Cold,-0.3932525951557094,13.435121107266433
COLQUITT 2 W,Weak Cold,6.498507462686565,19.53522388059702


In [0]:
%sql
SELECT
  *
FROM
  weather_grouped
  Limit 3000


station_name,weather_type,avg_min,avg_max
WACO RGNL AP,Weak Hot,20.190688775510203,32.55829081632653
VERNAL 2SW,Weak Hot,7.213333333333333,22.39587301587302
ROOSEVELT ROADS,Weak Cold,20.561832061068703,27.73333333333333
YOSEMITE PARK HQ,Weak Hot,8.05852459016394,25.996885245901623
ANTIOCH PUMPING PLANT #3,Weak Hot,13.959823008849554,28.992035398230087
DODGE,Weak Cold,-10.369483568075117,5.212676056338027
SOLON SPRINGS,Weak Cold,-10.95478036175711,2.222997416020673
HAYFIELD PUMPING PLT,Strong Cold,3.4287878787878774,18.322727272727263
WOODBURY 1 WNW,Weak Cold,-0.3932525951557094,13.435121107266433
COLQUITT 2 W,Weak Cold,6.498507462686565,19.53522388059702


In [0]:
# The main purpose of this notebook is to show how to plot data. So if data plots does not make any sense to you, it is ok. We will use
# another data in future to undertsand more about charts and data relationship.
# Thank you