# The first section begins with using spark, sql and hive context to create pandas data frames saved as .csv files from the parquet file on the cluster.

The following cells will all execute in order on the cluster. 

In [None]:
import findspark

In [None]:
try:
    findspark.init(spark_home="/opt/cloudera/parcels/SPARK2/lib/spark2")
except:
    # or if you have it somewhere else
    findspark.init(spark_home="/Users/danielacuna/Downloads/spark-2.0.0-bin-hadoop2.7")

In [None]:
import pyspark
import numpy as np

In [None]:
conf = pyspark.SparkConf().\
    set('spark.port.maxRetries', 60).\
    setMaster('local[2]')

In [None]:
from pyspark.sql import SQLContext, HiveContext
sc = pyspark.SparkContext(conf=conf)
sqlContext = HiveContext(sc)

In [None]:
# read the parquet file
mapRed_parquet = sqlContext.read.parquet("/user/alain/IST718/data/mapRed.parquet")

In [None]:
# register it as a temporary table
mapRed_parquet.registerTempTable("weather")

In [None]:
# View the column names in temporary table
mapRed_parquet.columns

In [None]:
# The following query selects the precipitation and max temperature elements from the weather table for US only and groups 
# these by year, month and element in ascending time order. 
t = sqlContext.sql('''select Year, Month, Element, AVG(MonthAverage) as AverageValue 
                    from weather
                    where Element IN ("PRCP", "TMAX") AND CountryCode == "US"
                    group by Month, Year, Element
                    order by Year ASC, Month ASC''')

# Check the results
t.count(), t.show(20) 

In [None]:
# Save t into a CSV file on the cluster
t.toPandas().to_csv('/users/alain/mam/US_Prcp_Tmax_byMonth.csv', index = False) 

In [None]:
# Save the df as a .csv in this notebook
US_Prcp_Tmax_byMonth_DF.to_csv('US_Prcp_Tmax_byMonth_DF.csv', index = False)

In [None]:
# The following query selects the precipitation and max temperature elements from the weather table for the globe and groups 
# these by year and element in ascending time order. 
u = sqlContext.sql('''select Year, Element, AVG(MonthAverage) as AverageValue 
                    from weather
                    where Element IN ("PRCP", "TMAX") 
                    group by Year, Element
                    order by Year ASC''')

# Check the results
u.count(), u.show(20) 

In [None]:
# Save u into a CSV file on the cluster
u.toPandas().to_csv('/users/alain/mam/Global_Prcp_Tmax_byYear.csv', index = False) 

In [None]:
# Save the df as a .csv in this notebook
Global_Prcp_Tmax_byYear_DF.to_csv('Global_Prcp_Tmax_byYear_DF.csv', index = False)

# This second section begins with the two .csv files that were downloaded from the cluster and will now be used to create visualizations on a local machine using pandas and matplotlib.

The following cells will all execute in order from your local machine; if you would like the two .csv files, I can send them to you.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the .csv file from the notebooks' home directory
US_Prcp_Tmax_byMonth_DF = pd.read_csv('US_Prcp_Tmax_byMonth_DF.csv')

In [3]:
del US_Prcp_Tmax_byMonth_DF['Time']

In [4]:
US_Prcp_Tmax_byMonth_DF.head(5)

Unnamed: 0,Year,Month,Element,AverageValue
0,1836,7,PRCP,48.777778
1,1836,8,PRCP,15.387097
2,1836,9,PRCP,80.833333
3,1836,10,PRCP,9.580645
4,1836,11,PRCP,22.633333


In [5]:
# Here, I will create four bubble plots for the US precipitation values mapped by season for each year from 1900 to 2016.
################################################################################################################################

In [6]:
# This new average value will be properly scaled to Fehrenheit and mm as the original average value was tenths of degrees celsius
# for max temperature and tenths of mm for precipitation. 
AverageValue2 = []
for i in US_Prcp_Tmax_byMonth_DF.index:
    if (US_Prcp_Tmax_byMonth_DF['Element'][i] == 'PRCP'):
        AverageValue2.append(US_Prcp_Tmax_byMonth_DF['AverageValue'][i]/10)
    else:
        AverageValue2.append((9*US_Prcp_Tmax_byMonth_DF['AverageValue'][i]/50)+32) 

In [7]:
US_Prcp_Tmax_byMonth_DF['AverageValue2_F_mm'] = AverageValue2

In [8]:
US_Prcp_Tmax_byMonth_DF.head(5)

Unnamed: 0,Year,Month,Element,AverageValue,AverageValue2_F_mm
0,1836,7,PRCP,48.777778,4.877778
1,1836,8,PRCP,15.387097,1.53871
2,1836,9,PRCP,80.833333,8.083333
3,1836,10,PRCP,9.580645,0.958065
4,1836,11,PRCP,22.633333,2.263333


In [9]:
# Select only the precipitation data.
US_Prcp_byMonth_DF = US_Prcp_Tmax_byMonth_DF[US_Prcp_Tmax_byMonth_DF.Element != 'TMAX']

In [10]:
# Function to add seasons
def season_type(df):
    m = int(df['Month'])
    spring = range(3, 6)
    summer = range(6, 9)
    fall = range(9,12)
    if m in spring:
        season = "Spring"
    elif m in summer:
        season = "Summer"
    elif m in fall:
        season = "Fall"
    else:
        season = "Winter"
    return season

# applying the Function to the dataframe
US_Prcp_byMonth_DF['Season'] = US_Prcp_byMonth_DF.apply(season_type, axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [11]:
US_Prcp_byMonth_DF.head(5)

Unnamed: 0,Year,Month,Element,AverageValue,AverageValue2_F_mm,Season
0,1836,7,PRCP,48.777778,4.877778,Summer
1,1836,8,PRCP,15.387097,1.53871,Summer
2,1836,9,PRCP,80.833333,8.083333,Fall
3,1836,10,PRCP,9.580645,0.958065,Fall
4,1836,11,PRCP,22.633333,2.263333,Fall


In [12]:
# Select only the data for years 1900 - 2016.
US_Prcp_byMonth_DF = US_Prcp_byMonth_DF[US_Prcp_byMonth_DF.Year > 1899]

In [13]:
US_Prcp_byMonth_DF.head(5)

Unnamed: 0,Year,Month,Element,AverageValue,AverageValue2_F_mm,Season
1155,1900,1,PRCP,19.453504,1.94535,Winter
1158,1900,2,PRCP,28.566417,2.856642,Winter
1160,1900,3,PRCP,23.952534,2.395253,Spring
1162,1900,4,PRCP,27.880507,2.788051,Spring
1163,1900,5,PRCP,23.350901,2.33509,Spring


In [14]:
# Select only the summer data.
US_Prcp_Summer_byMonth_DF = US_Prcp_byMonth_DF[US_Prcp_byMonth_DF.Season == 'Summer']

In [15]:
# Plot the summer bubble plot.
x = US_Prcp_Summer_byMonth_DF['Year']
y = US_Prcp_Summer_byMonth_DF['AverageValue2_F_mm'] 
s = 60*US_Prcp_Summer_byMonth_DF['AverageValue2_F_mm']**4
Plot1 = plt.scatter(x,y,s=s, c='orange', edgecolor='orange', alpha=.5)
plt.tick_params(axis='both', which='major', labelsize=10)
plt.title("US Precipitation by Year for Summer Months Only\n (June,July,Aug)",fontsize=30)
plt.xlabel("Time By Year",fontsize=25)
plt.ylabel("Seasonal Average Precipitation (mm)",fontsize=25)
plt.show()

In [16]:
# Select only the spring data.
US_Prcp_Spring_byMonth_DF = US_Prcp_byMonth_DF[US_Prcp_byMonth_DF.Season == 'Spring']

In [17]:
# Plot the spring bubble plot.
x = US_Prcp_Spring_byMonth_DF['Year']
y = US_Prcp_Spring_byMonth_DF['AverageValue2_F_mm'] 
s = 60*US_Prcp_Spring_byMonth_DF['AverageValue2_F_mm']**4
Plot1 = plt.scatter(x,y,s=s, c='green', edgecolor='green', alpha=.5)
plt.tick_params(axis='both', which='major', labelsize=10)
plt.title("US Precipitation by Year for Spring Months Only\n (March, April, May)",fontsize=30)
plt.xlabel("Time By Year",fontsize=25)
plt.ylabel("Seasonal Average Precipitation (mm)",fontsize=25)
plt.show()

In [18]:
# Select only the fall data.
US_Prcp_Fall_byMonth_DF = US_Prcp_byMonth_DF[US_Prcp_byMonth_DF.Season == 'Fall']

In [19]:
# Plot the fall bubble plot>
x = US_Prcp_Fall_byMonth_DF['Year']
y = US_Prcp_Fall_byMonth_DF['AverageValue2_F_mm'] 
s = 60*US_Prcp_Fall_byMonth_DF['AverageValue2_F_mm']**4
Plot1 = plt.scatter(x,y,s=s, c='red', edgecolor='red', alpha=.5)
plt.tick_params(axis='both', which='major', labelsize=10)
plt.title("US Precipitation by Year for Fall Months Only\n (Sept,Oct,Nov)",fontsize=30)
plt.xlabel("Time By Year",fontsize=25)
plt.ylabel("Seasonal Average Precipitation (mm)",fontsize=25)
plt.show()

In [20]:
US_Prcp_Winter_byMonth_DF = US_Prcp_byMonth_DF[US_Prcp_byMonth_DF.Season == 'Winter']

In [21]:
x = US_Prcp_Winter_byMonth_DF['Year']
y = US_Prcp_Winter_byMonth_DF['AverageValue2_F_mm'] 
s = 60*US_Prcp_Winter_byMonth_DF['AverageValue2_F_mm']**4
Plot1 = plt.scatter(x,y,s=s, c='blue', edgecolor='blue', alpha=.5)
plt.tick_params(axis='both', which='major', labelsize=10)
plt.title("US Precipitation by Year for Winter Months Only\n (Dec,Jan,Feb)",fontsize=30)
plt.xlabel("Time By Year",fontsize=25)
plt.ylabel("Seasonal Average Precipitation (mm)",fontsize=25)
plt.show()

In [22]:
# Now, I will use the same US dataset for both precipitation and max temperature for all time graphed by month and create a 
# scatter plot with different colors for precipitation and max temperature.
#########################################################################################################################

In [23]:
# I will be graphing this by month over different years so the first step is to create a date time variable to use as the
# index.
US_Prcp_Tmax_byMonth_DF.head(5)

Unnamed: 0,Year,Month,Element,AverageValue,AverageValue2_F_mm
0,1836,7,PRCP,48.777778,4.877778
1,1836,8,PRCP,15.387097,1.53871
2,1836,9,PRCP,80.833333,8.083333
3,1836,10,PRCP,9.580645,0.958065
4,1836,11,PRCP,22.633333,2.263333


In [24]:
# Create a data frame of only the time values needed for the date time variable with '1' as the day for every month since the day 
# is not specified in our data. 
time_DF = US_Prcp_Tmax_byMonth_DF[['Year', 'Month']]
time_DF['Day'] = [1]*len(time_DF)
time_DF.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Year,Month,Day
0,1836,7,1
1,1836,8,1
2,1836,9,1
3,1836,10,1
4,1836,11,1


In [25]:
# Now, create the color variable for the graph which is different for precipitation and max temperature.
Weather = []
for i in US_Prcp_Tmax_byMonth_DF.index:
    if (US_Prcp_Tmax_byMonth_DF['Element'][i] == 'PRCP'):
        Weather.append([1,0,0])
    else:
        Weather.append([0,1,0])

In [26]:
US_Prcp_Tmax_byMonth_DF['Element_Colors'] = Weather

In [27]:
# Now use the date time variable as the index.
US_Prcp_Tmax_byMonth_DF.index = pd.to_datetime(time_DF)
US_Prcp_Tmax_byMonth_DF.head(5)

Unnamed: 0,Year,Month,Element,AverageValue,AverageValue2_F_mm,Element_Colors
1836-07-01,1836,7,PRCP,48.777778,4.877778,"[1, 0, 0]"
1836-08-01,1836,8,PRCP,15.387097,1.53871,"[1, 0, 0]"
1836-09-01,1836,9,PRCP,80.833333,8.083333,"[1, 0, 0]"
1836-10-01,1836,10,PRCP,9.580645,0.958065,"[1, 0, 0]"
1836-11-01,1836,11,PRCP,22.633333,2.263333,"[1, 0, 0]"


In [28]:
# Here I will creat rectangles for the legend.
import matplotlib.patches as mpatches

classes = ['Precipitation (mm)','Maximum Temperature (°F)']
class_colours = [[1,0,0],[0,1,0]]
recs = []
for i in range(0,len(class_colours)):
    recs.append(mpatches.Rectangle((0,0),1,1,fc=class_colours[i]))

In [29]:
# Plot the US precipitation and max temperature values monthly for all time with a color scheme for the points that is in the 
# legend.
x = US_Prcp_Tmax_byMonth_DF.index
y = US_Prcp_Tmax_byMonth_DF['AverageValue2_F_mm'] 
c = US_Prcp_Tmax_byMonth_DF['Element_Colors']
Plot1 = plt.scatter(x,y,s=75, c=c, edgecolor=c)
plt.tick_params(axis='both', which='major', labelsize=10)
plt.title("United States Monthly Average Precipitation \n and Maximum Temperature",fontsize=30)
plt.xlabel("Time By Month (July 1836 - November 2016)",fontsize=25)
plt.ylabel("Monthly Average Value",fontsize=25)
plt.legend(recs,classes,loc=4,fontsize=25)
plt.show()

In [30]:
# Now, I will do a similar visualization as the last which was for the US only; this one is for the World
# with precipitation and max temperature values and is mapped by year instead of month.
###############################################################################################################

In [31]:
# Read the .csv file.
Global_Prcp_Tmax_byYear_DF = pd.read_csv('Global_Prcp_Tmax_byYear_DF.csv')
Global_Prcp_Tmax_byYear_DF.head(5)

Unnamed: 0,Year,Element,AverageValue
0,1763,TMAX,147.524027
1,1764,TMAX,153.414176
2,1765,TMAX,149.41851
3,1766,TMAX,148.438518
4,1767,TMAX,141.643689


In [32]:
# Create the color variable based on the emement type; different colors for precipitation and max temperature.
Weather2 = []
for i in Global_Prcp_Tmax_byYear_DF.index:
    if (Global_Prcp_Tmax_byYear_DF['Element'][i] == 'TMAX'):
        Weather2.append([0,1,0])
    else:
        Weather2.append([0,0,1])

In [33]:
Global_Prcp_Tmax_byYear_DF['Element_Colors'] = Weather2

In [34]:
# This new average value will be properly scaled to Fehrenheit and MM. 
AverageValue2 = []
for i in Global_Prcp_Tmax_byYear_DF.index:
    if (Global_Prcp_Tmax_byYear_DF['Element'][i] == 'PRCP'):
        AverageValue2.append(Global_Prcp_Tmax_byYear_DF['AverageValue'][i]/10)
    else:
        AverageValue2.append((9*Global_Prcp_Tmax_byYear_DF['AverageValue'][i]/50)+32) 

In [35]:
Global_Prcp_Tmax_byYear_DF['AverageValue2_F_mm'] = AverageValue2

In [36]:
Global_Prcp_Tmax_byYear_DF.head(5)

Unnamed: 0,Year,Element,AverageValue,Element_Colors,AverageValue2_F_mm
0,1763,TMAX,147.524027,"[0, 1, 0]",58.554325
1,1764,TMAX,153.414176,"[0, 1, 0]",59.614552
2,1765,TMAX,149.41851,"[0, 1, 0]",58.895332
3,1766,TMAX,148.438518,"[0, 1, 0]",58.718933
4,1767,TMAX,141.643689,"[0, 1, 0]",57.495864


In [37]:
# Now create the rectangles for the legend.
classes = ['Maximum Temperature (°F)', 'Precipitation (mm)']
class_colours = [[0,1,0],[0,0,1]]
recs = []
for i in range(0,len(class_colours)):
    recs.append(mpatches.Rectangle((0,0),1,1,fc=class_colours[i]))

In [38]:
# Now, plot the global average max temperatures and precipitation for all yearly periods of time.
x = Global_Prcp_Tmax_byYear_DF['Year']
y = Global_Prcp_Tmax_byYear_DF['AverageValue2_F_mm'] 
c = Global_Prcp_Tmax_byYear_DF['Element_Colors']
Plot1 = plt.scatter(x,y,s=75, c=c, edgecolor=c)
plt.tick_params(axis='both', which='major', labelsize=10)
plt.title("Global Yearly Average Precipitation \n and Maximum Temperature",fontsize=30)
plt.xlabel("Time By Year (1763 - 2016)",fontsize=25)
plt.ylabel("Yearly Average Value",fontsize=25)
plt.ylim(ymax = 70, ymin = 0)
plt.xlim(xmax = 2020, xmin = 1760)
plt.legend(recs,classes,loc=1,fontsize=20)
plt.show()