## Additional Examples
Using pyshp (shapefile) and pandas

Data is downloaded from http://www.bom.gov.au/water/groundwater/explorer/map.shtml

In [None]:
#Load the modules
import shapefile

In [None]:
# Print the built-in help text of the pyshp module (imported as `shapefile`)
help(shapefile)

In [None]:
# Path of the bore-line shapefile
boreshape = '../data/shp_torrens_river/NGIS_BoreLine.shp'

# Open the shapefile with pyshp
shapeRead = shapefile.Reader(boreshape)

# Keep the parts of the shapefile that the following cells inspect
fields = shapeRead.fields      # field definitions
shapes = shapeRead.shapes()    # all shapes (geometry)
recs = shapeRead.records()     # all records
Nshp = len(shapes)             # number of shapes in the file

In [None]:
print(Nshp)  # number of shapes read from the shapefile (Nshp = len(shapes))

In [None]:
fields[:]  # display all of the field definitions

In [None]:
recs[0]  # display the first record; it is list-like, so it can be subscripted further

In [None]:
shapes[0].points  # display the point values of the first shape

Shapefiles are not a native Python format, but the community has developed tools for exploring them. The package used here, "pyshp" (imported under the name "shapefile", which confusingly differs from the package name), is one example of a tool for working with shapefiles. Alternatives exist.

## More table manipulation

In [None]:
#Import the module
import pandas

In [None]:
# Read the lithology log into a DataFrame.
# The file is awkward to parse automatically because free-text comments use
# quotation marks to signify inches, so some lines are skipped by number
# (0-indexed; presumably the lines with the stray quotes - TODO confirm).
lines_to_skip = [453, 456, 458, 460, 689, 697, 720, 723,
                 726, 839, 880, 884, 885, 890, 898, 934]

log_data = pandas.read_csv(
    "../data/shp_torrens_river/NGIS_LithologyLog.csv",
    header=0,
    sep=',',
    skipinitialspace=True,
    quotechar='"',
    usecols=list(range(0, 13)),   # only the first 13 columns
    skiprows=lines_to_skip,
)

In [None]:
# Display the frame. For a long frame pandas shows only the first and last rows;
# how many depends on the pandas version and the display.max_rows /
# display.min_rows options.
log_data

In [None]:
# New column computed from two existing ones: Thickness = ToDepth - FromDepth
log_data['Thickness'] = log_data['ToDepth'] - log_data['FromDepth']

In [None]:
type(log_data)     # show the Python type of log_data (a pandas DataFrame)

In [None]:
log_data.head(3)    # display the first 3 rows

In [None]:
log_data.index     # "the index" (a.k.a. "the labels") of the rows
# pandas works well with time-series data, where the index can hold the timestamps

In [None]:
log_data.columns   # column names (these are also held in an Index object)

In [None]:
log_data.dtypes    # data type of each column

In [None]:
log_data.shape     # (number of rows, number of columns)

In [None]:
# The frame's data as a NumPy array; pandas builds on NumPy arrays for efficiency.
# NOTE(review): the pandas documentation recommends DataFrame.to_numpy() over .values in new code.
log_data.values

In [None]:
#log_data['MajorLithCode']         # select one column
##Equivalent to 
#log_data.MajorLithCode 
##and
#log_data.iloc[:,9]

In [None]:
type(log_data['MajorLithCode'])   # show the type of a single selected column (a pandas Series)

In [None]:
# Summary statistics for the data frame; include='all' summarises every column,
# not just the numeric ones
log_data.describe(include='all')

In [None]:
# Summarise a single pandas Series
log_data.FromDepth.describe()   # describe a single column

In [None]:
# Mean of the column at zero-based position 5, i.e. the sixth column
# (expected to be "FromDepth" - verify against log_data.columns)
log_data.iloc[:,5].mean()

In [None]:
# Alternative: select the FromDepth column by its label rather than by its
# position, then take the mean
log_data["FromDepth"].mean()

In [None]:
# Count how many times each major lithology code occurs in the log
# (one entry per distinct code, with its number of rows as the value)
lithCounts = log_data['MajorLithCode'].value_counts()

In [None]:
# Display the counts per code; lithCounts.index holds the lithology codes and
# lithCounts.values holds the counts
lithCounts

In [None]:
# Bar chart of how often each major lithology code occurs.
# Series.plot.bar returns the matplotlib Axes; it is kept so the axes can be
# labelled and the figure can be read on its own.
ax = lithCounts.plot.bar(rot=90, figsize=(15, 5))
ax.set_xlabel('Major lithology code')
ax.set_ylabel('Number of log entries');  # trailing ';' hides the text repr in the notebook

In [None]:
# Bar chart restricted to the rarer lithologies: codes that occur fewer than
# `rare_cutoff` times in the log.
rare_cutoff = 50
rareLithCounts = lithCounts[lithCounts < rare_cutoff]

# Series.plot.bar returns the matplotlib Axes, used here to label the figure
ax = rareLithCounts.plot.bar(rot=90, figsize=(15, 5))
ax.set_xlabel('Major lithology code')
ax.set_ylabel('Number of log entries')
ax.set_title('Lithology codes with fewer than {} entries'.format(rare_cutoff));

In [None]:
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
 
# Thickness values as a NumPy array
x = log_data['Thickness'].values

# Summary statistics over ALL thickness values, not only those inside the
# plotted 0-3 range. nanmean/nanstd skip missing values, so one NaN thickness
# does not turn the statistics (and the plot title) into "nan"; without NaNs
# the results equal np.mean/np.std.
mu = np.nanmean(x)     # mean of distribution
sigma = np.nanstd(x)   # standard deviation of distribution (not used by the plot)

# Bin edges every 0.25 from 0 to 3 (13 edges, 12 bins)
bin_edges = np.linspace(0.0, 3.0, 13)

# the histogram of the data, drawn on an explicit Axes
fig, ax = plt.subplots()
ax.hist(x, bins=bin_edges, alpha=0.5)
ax.set_xlabel('Thickness (m)')
ax.set_ylabel('Count')
# Round the mean so the title stays readable
mystring = "Histogram with a mean of {:.2f}".format(mu)
ax.set_title(mystring)

# Tweak spacing to prevent clipping of ylabel
#plt.subplots_adjust(left=0.15)
plt.show()




In [None]:
# NOTE(review): everything in this cell is commented-out draft code and does
# nothing when run. It needs finishing before it can be enabled: `value[]`
# below is an incomplete expression, the bare `except:` hides every error, and
# `print(recs)` prints the whole record list each time a match is found.

# import numpy as np
# cmap = plt.get_cmap('viridis')
# colors = cmap(np.linspace(0, 1, len(lithCounts.index)))
# colors

# for row in log_data.itertuples():
#     boreid=row[3]
#     for ind,value in enumerate(recs):  
#         try:
#             value.index(boreid)
#             print(recs)
#         except:
#             continue
#     #(row[3])



# for ind, value in enumerate(recs):
#     #Get the lat lon value
#     lon=value[18]
#     lat=value[17]
#     #Get the Lithology unit
#     value[]
    
#     #Now plot it
#     plt.plot(lon,lat,"|")