# 1. Read and pre-process the data

## 1.1 Read data

This first part of the notebook checks and cleans the data. The data are taken from the Alberta Water Quality Data Portal, with a focus on the water matrix (0) and on long-term and tributary monitoring stations.
Source for getting data: https://environment.extranet.gov.ab.ca/apps/WaterQuality/dataportal/DataDownload/Index/

In [1]:
import numpy as np
import pandas as pd
import os
import sys
# add src folder to path
sys.path.append(os.path.abspath('../src'))

# import funcs from src folder in higher directory
from funcs import unit_harm

In [2]:
# Date column(s) of the file. NOTE: this list is not handed to read_csv below;
# SampleDateTime is read as a plain string.
parse_dates = ['SampleDateTime']

# Tokens that should be read as missing values
missing_tokens = ['', 'NaN', 'NULL', 'N/A', 'NA', 'null', '******', 'NaT', 'nan']

# Columns grouped by the dtype they are read with
categorical_columns = [
    'ProjectNumber', 'SampleNumber', 'ContinentalRiverBasinCode', 'RiverBasinCode',
    'RiverSubBasinCode', 'StationTypeCode', 'StationNumber', 'Station',
    'SampleMatrixCode', 'SampleTypeCode', 'CollectionCode', 'QCSampleFlag',
    'VmvCode', 'VariableCode', 'VariableName', 'MeasurementFlag', 'UnitCode',
    'MeasurementQualifier', 'MethodCode', 'LabCode',
]
float_columns = [
    'LatitudeDecimalDegrees', 'LongitudeDecimalDegrees',
    'MeasurementValue', 'MethodDetectionLimit',
]
text_columns = [
    'SampleComment', 'SampleDateTime', 'SampleDetectLimit', 'MeasurementComment',
    'MeasurementQualifierDescription', 'MeasurementQualifierComment',
]

column_dtypes = {
    **dict.fromkeys(categorical_columns, 'category'),
    **dict.fromkeys(float_columns, 'float64'),
    **dict.fromkeys(text_columns, 'string'),
}

# Now read the data
data = pd.read_csv(
    '../data/Water Quality-2025-03-08 172848.csv',
    dtype=column_dtypes,
    na_values=missing_tokens,
)

In [3]:
# Parse the sample timestamp; entries that do not match the format become NaT
data['SampleDateTime'] = pd.to_datetime(
    data['SampleDateTime'],
    format='%m/%d/%Y %H:%M:%S',
    errors='coerce',
)

# Detection limits are read as text; entries that are not numbers become NaN
data['SampleDetectLimit'] = pd.to_numeric(data['SampleDetectLimit'], errors='coerce')

# Title-case the variable and station names
for text_column in ['VariableName', 'Station']:
    data[text_column] = data[text_column].str.title()

# Trim leading/trailing whitespace from the variable names
data['VariableName'] = data['VariableName'].str.strip()

# Units are compared case-insensitively from here on: store them in lowercase
data['UnitCode'] = data['UnitCode'].str.lower()

## 1.2 Filter the data

In [4]:
# Count the missing values in each column, listed from fewest to most missing
missing_per_column = data.isna().sum()
missing_per_column.sort_values()

ProjectNumber                            0
VmvCode                                  0
SampleDateTime                           0
MeasurementValue                         0
QCSampleFlag                             0
MethodCode                               0
SampleTypeCode                           0
SampleMatrixCode                         0
VariableCode                             0
LongitudeDecimalDegrees                  0
Station                                  0
StationNumber                            0
StationTypeCode                          0
RiverSubBasinCode                        0
RiverBasinCode                           0
ContinentalRiverBasinCode                0
SampleNumber                             0
LatitudeDecimalDegrees                   0
VariableName                             0
LabCode                                288
UnitCode                             73616
CollectionCode                      200683
MethodDetectionLimit                386180
SampleComme

**Note:** it is the user's choice whether to keep data without unit codes. I chose to eliminate them, as they create real ambiguity about how usable the data are. As a more general rule, rows with a missing SampleDateTime, MeasurementValue, StationNumber, or VariableName are dropped as well.

In [5]:
# A row is kept only if it has a value in every one of these key columns
required_columns = ['UnitCode', 'MeasurementValue', 'SampleDateTime', 'StationNumber', 'VariableName']

data = (
    data
    .dropna(subset=required_columns)
    .dropna(how='all')  # also discard rows in which every element is NaN
)

In [6]:
# Columns that together identify one measurement record
duplicate_key = ['SampleDateTime', 'StationNumber', 'VariableName', 'MeasurementValue',
                 'UnitCode', 'VmvCode', 'SampleNumber', 'LabCode']

# Boolean mask; keep=False marks every row that shares its key with another row
duplicates = data.duplicated(subset=duplicate_key, keep=False)

# write duplicate rows to csv
# data[duplicates].to_csv('../output/duplicate_rows.csv', index=False)

Based on this duplicate check, the data contain no duplicate rows.

## 1.3 Harmonize the units
Harmonize the units so that each parameter has only one unit. This is helpful for creating plots and comparing data against other stations or time-windows.

In [7]:
# Run the project helper `unit_harm` (imported from `funcs` in ../src) on the
# cleaned frame; its return value replaces `data`.
# NOTE(review): the list of variable names shown in this cell's output is
# presumably printed by unit_harm itself - confirm against src/funcs.
data = unit_harm(data)

1,1,1,2-Tetrachloroethane
1,1,1-Trichloroethane
1,1,2,2-Tetrachloroethane
1,1,2-Trichloroethane
1,1-Dichloroethane
1,1-Dichloroethylene
1,1-Dichloropropylene
1,2,3-Trichlorobenzene
1,2,3-Trichloropropane
1,2,4-Trichlorobenzene
1,2,4-Trimethylbenzene
1,2,6-Trimethylphenanthrene
1,2-Dibromo-3-Chloropropane
1,2-Dibromoethane
1,2-Dichlorobenzene
1,2-Dichloroethane
1,2-Dichloroethene-Cis
1,2-Dichloroethene-Trans
1,2-Dichloropropane
1,2-Dimethylnaphthalene
1,2-Diphenylhydrazine
1,2-Xylene
1,3,5-Trichlorobenzene
1,3,5-Trimethylbenzene
1,3-Dichlorobenzene
1,3-Dichloropropane
1,3-Dichloropropene-Cis
1,3-Dichloropropene-Trans
1,4,6,7-Tetramethylnaphthalene
1,4-Dichloro-2-Butene-Cis
1,4-Dichloro-2-Butene-Trans
1,4-Dichlorobenzene
1,4-Dioxane
1,7-Alpha-Ethynylestradiol
1,7-Dimethylfluorene
1,7-Dimethylphenanthrene
1,8-Dimethylphenanthrene
1-Methylchrysene
1-Methylnaphthalene
1-Methylphenanthrene
12,14-Dichlorodehydroabietic Acid
12-Chlorodehydroabietic Acid
14-Chlorodehydroabietic Acid
2,2-Dichlor

In [8]:
# Number of distinct units recorded for every variable
unit_counts = (
    data.groupby('VariableName')['UnitCode']
    .nunique()
    .reset_index()
)

# Variables that are still reported in more than one unit
has_multiple_units = unit_counts['UnitCode'] > 1
pars_with_multiple_units = unit_counts.loc[has_multiple_units, 'VariableName'].tolist()

# For each of them, print how many measurements use each unit
for variable_name in pars_with_multiple_units:
    variable_rows = data[data['VariableName'] == variable_name]
    print(variable_name)
    print(variable_rows.groupby('UnitCode')['UnitCode'].count())
    print('-----------------------------------------')

2,3,6-Trichlorophenol (Surrogate)
UnitCode
%       53
ug/l    46
Name: UnitCode, dtype: int64
-----------------------------------------
Coliforms Fecal
UnitCode
cfu/100 ml      134
no/100 ml     21085
Name: UnitCode, dtype: int64
-----------------------------------------
Coliforms Total
UnitCode
mpn/100 ml       3
no/100 ml     3061
Name: UnitCode, dtype: int64
-----------------------------------------
Discharge Daily Mean
UnitCode
cfs     212
m3/s     49
Name: UnitCode, dtype: int64
-----------------------------------------
Discharge Instantaneous
UnitCode
cfs      31
m3/s    219
Name: UnitCode, dtype: int64
-----------------------------------------
Escherichia Coli
UnitCode
cfu/100 ml      405
mpn/100 ml        3
no/100 ml     18098
Name: UnitCode, dtype: int64
-----------------------------------------
Fluorescent Dissolved Organic Matter-Fdom (Field)
UnitCode
ppb qsu    209
rfu        209
Name: UnitCode, dtype: int64
-----------------------------------------
Sampling Distance From L

## 1.4 Handling censored data and adding date stamps

In [9]:
# Add censored column.
# A measurement is censored when it carries the 'L' flag, or when its value is
# below the sample detection limit. A comparison against a missing detection
# limit (NaN) evaluates to False, so rows without a limit are censored only
# through the flag. The column is computed with whole-column operations rather
# than a row-wise apply, which is very slow on a frame with millions of rows.
flagged_less_than = data['MeasurementFlag'] == 'L'
below_detect_limit = data['MeasurementValue'] < data['SampleDetectLimit']
data['Censored'] = flagged_less_than | below_detect_limit

# Extract year, month, day columns from SampleDateTime:
sample_time = data['SampleDateTime']
data['Year'] = sample_time.dt.year
data['Month'] = sample_time.dt.month
data['Day'] = sample_time.dt.day

# Add month names as an ordered categorical (calendar order, not alphabetical)
month_names = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
data['MonthCategory'] = pd.Categorical(sample_time.dt.month_name(),
                                       categories=month_names,
                                       ordered=True)

# Add month short names as an ordered categorical (calendar order)
month_short_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
data['MonthCategoryShort'] = pd.Categorical(sample_time.dt.strftime("%b"),
                                            categories=month_short_names,
                                            ordered=True)

## 1.5 EDA

In [None]:
# Keep an independent snapshot of the cleaned data. A plain assignment would
# only create a second name for the same DataFrame, so any in-place change to
# `data` (for example adding a column) would also change the "backup".
data_backup = data.copy()

In [11]:
# Continue with a copy of the backup so that later changes to `data` do not
# touch `data_backup`.
data = data_backup.copy()
# Summary statistics (count, mean, min, quartiles, max, std) of the numeric and
# datetime columns.
data.describe()

Unnamed: 0,LatitudeDecimalDegrees,LongitudeDecimalDegrees,SampleDateTime,MeasurementValue,SampleDetectLimit,MethodDetectionLimit,Year,Month,Day
count,2209871.0,2209871.0,2209871,2209871.0,1676151.0,1897307.0,2209871.0,2209871.0,2209871.0
mean,52.52154,-113.6532,2012-09-21 09:48:36.623111680,395.3181,11.246677,13.37832,2012.236,6.401583,14.78128
min,49.02673,-118.8047,1959-02-20 14:00:00,-213.0,0.0,0.0,1959.0,1.0,1.0
25%,50.3531,-114.4871,2007-02-20 14:30:00,0.0391,0.005,0.006,2007.0,4.0,9.0
50%,52.08902,-113.4421,2016-02-08 14:15:00,0.841,0.05,0.05,2016.0,7.0,15.0
75%,54.2012,-112.4759,2020-07-21 10:05:00,13.67,0.3,0.8,2020.0,9.0,20.0
max,58.44722,-110.0297,2024-12-12 14:45:00,1330000.0,80000.0,10000.0,2024.0,12.0,31.0
std,2.512963,1.996642,,6866.319,149.601908,166.8303,10.28575,3.088282,7.027856


## 1.6 Join with site IDs for upstream to downstream plotting

In [12]:
# Attach the site table to every measurement; the left join keeps all rows of `data`
siteIDs = pd.read_csv('../data/site_IDs.csv')
data = pd.merge(data, siteIDs, on='StationNumber', how='left')

# Order the rows by basin, then by PLOT_CODE2 within each basin
# NOTE(review): PLOT_CODE2 presumably encodes the upstream-to-downstream position - confirm
data = data.sort_values(by=["Basin", "PLOT_CODE2"])

In [13]:
# Turn the station identifier columns into ordered categoricals whose category
# order is the order of first appearance in the (already sorted) frame.
for column in ['StationNumber', 'SITE_ID', 'PLOT_CODE', 'Station']:
    # Missing values (e.g. a station without a match in the site table after the
    # left join) must not be listed as categories: pd.Categorical raises a
    # ValueError when the categories contain NaN. The values themselves stay NaN.
    appearance_order = data[column].dropna().unique()
    data[column] = pd.Categorical(data[column], ordered=True, categories=appearance_order)

# check values to make sure the order is correct:
print(list(data['SITE_ID'].cat.categories))

['SAR', 'MTR', 'AR1', 'AR2', 'BER', 'SAK', 'MCL', 'AR3', 'PBR', 'LSR', 'AR4', 'AR5', 'AR6', 'AR7', 'AR8', 'BTR1', 'BTR2', 'BVR1', 'BVR2', 'BVR3', 'GR', 'WSC', 'JPC', 'BR1', 'ER', 'NC', 'FC', 'PC', 'SR1', 'SR2', 'HR1', 'HR2', 'BR2', 'WAC', 'EAC', 'BR3', 'CFC', 'CC', 'NWC', 'TMC', 'BR4', 'NMR', 'MLK1', 'MLK2', 'RC', 'VC', 'MSC', 'MLK3', 'NSR1', 'SFR', 'CLNR', 'BGHR', 'NSR2', 'RMR', 'NSR3', 'CLWR2', 'CLWR1', 'NSR4', 'BPTR', 'NR', 'BRZR', 'RSC', 'MDC', 'TC', 'STRW', 'WDC', 'CNJC', 'NSR5', 'WMC', 'STUR', 'RDWR', 'NSR6', 'AC', 'VR', 'OMR1', 'PC1', 'PC2', 'BRC', 'WK', 'WNR', 'BYR1', 'BYR2', 'SMR', 'OMR2', 'MQC', 'LBR1', 'LBR2', 'LBR3', 'LBR4', 'OMR3', 'EC', 'WR1', 'WR2', 'SYR', 'PR1', 'PR2', 'PR3', 'FTC', 'RDR1', 'BBC', 'JR', 'RR', 'LRDR', 'MR', 'RDR2', 'WOC', 'BMR', 'RDR3', 'RDR4', 'THC1', 'THC2', 'KC1', 'KC2', 'MC1', 'MC2', 'RBR1', 'RBR2', 'MTC', 'BRYC', 'RDR5', 'SSR-1', 'SPC', 'RK', 'SSR-2']


# Appendix - Sandbox

## A.1 Practicing grouping

In [None]:
# Summary for every (Station, VariableName) pair that occurs in the data:
# mean / std / count of the measured values, first and last sampling time, and
# the number of censored results.
# The result keeps Station and VariableName as its row index. Unstacking both
# index levels would collapse the whole table into a single long Series, which
# is not the summary table this cell is meant to display.
df = data.groupby(['Station', 'VariableName'], observed=True).agg({
    'MeasurementValue': ['mean', 'std', 'count'],
    'SampleDateTime': ['min', 'max'],
    'Censored': ['sum']
})

df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,MeasurementValue,MeasurementValue,MeasurementValue,SampleDateTime,SampleDateTime,Censored
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,count,min,max,sum
Station,VariableName,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
"Sunwapta River, At Wsc Gauge 0.1 Km D/S Of Sunwapta Lake",Alkalinity Phenolphthalein Caco3,0.857143,0.229175,35,2016-05-11 08:45:00,2024-09-11 12:20:00,35
"Sunwapta River, At Wsc Gauge 0.1 Km D/S Of Sunwapta Lake",Alkalinity Total Caco3,67.0,19.064905,35,2016-05-11 08:45:00,2024-09-11 12:20:00,0
"Sunwapta River, At Wsc Gauge 0.1 Km D/S Of Sunwapta Lake",Aluminum Dissolved,6.470571,3.845316,35,2016-05-11 08:45:00,2024-09-11 12:20:00,0
"Sunwapta River, At Wsc Gauge 0.1 Km D/S Of Sunwapta Lake",Aluminum Total Recoverable,999.762857,1505.076885,35,2016-05-11 08:45:00,2024-09-11 12:20:00,0
"Sunwapta River, At Wsc Gauge 0.1 Km D/S Of Sunwapta Lake",Ammonia Total,0.024,0.016666,35,2016-05-11 08:45:00,2024-09-11 12:20:00,30


## A.2 Filtering

Find out which mainstem stations have selenium total more than its PAL guideline (2 ug/l)

In [15]:
# PAL guideline for total selenium (ug/l)
SELENIUM_PAL_GUIDELINE = 2

# Uncensored total recoverable selenium results at mainstem ('MS') sites that
# exceed the guideline
selenium_exceedances = data[
    (data['VariableName'] == 'Selenium Total Recoverable') &
    (data['MeasurementValue'] > SELENIUM_PAL_GUIDELINE) &
    (~ data['Censored']) &
    (data['SITE_TYPE2'] == 'MS')
]

# Double check the unit. Only the last expression of a cell is displayed, so
# the units have to be printed explicitly to be visible.
print(selenium_exceedances['UnitCode'].unique())
selenium_exceedances

Unnamed: 0,ProjectNumber,SampleNumber,ContinentalRiverBasinCode,RiverBasinCode,RiverSubBasinCode,StationTypeCode,StationNumber,Station,LatitudeDecimalDegrees,LongitudeDecimalDegrees,...,Year,Month,Day,MonthCategory,MonthCategoryShort,SITE_TYPE2,Basin,SITE_ID,PLOT_CODE,PLOT_CODE2
1741470,ABSB34,11SWE00201,SAS,BAT,05FA,0,AB05FA0060,"Battle River, Approx 2 Km D/S Hwy 53",52.65881,-113.67508,...,2011,1,18,January,Jan,MS,Battle River,BTR1,N4,4
1741635,ABSB34,09SWE00024,SAS,BAT,05FA,0,AB05FA0340,"Battle River, At North End Of Driedmeat Lake",52.93736,-112.84861,...,2009,1,15,January,Jan,MS,Battle River,BTR2,N6,6
1741639,ABSB34,10SWE00035,SAS,BAT,05FA,0,AB05FA0340,"Battle River, At North End Of Driedmeat Lake",52.93736,-112.84861,...,2010,1,20,January,Jan,MS,Battle River,BTR2,N6,6
1745831,ABSM34,17SWL10204,MIS,MIL,11AA,0,AB11AA0070,"Milk River, At Hwy 880",49.14541,-111.30734,...,2017,2,13,February,Feb,MS,Milk River,MLK3,M7,7
1748247,ABSP34,17SWG00020,MAC,PEA,07FD,0,AB07FD0135,"Peace River, U/S Smoky River Near Shaftesbury ...",56.09319,-117.56608,...,2017,5,15,May,May,MS,Peace River,PR1,P4,4
1748282,ABSP34,20SWG00016,MAC,PEA,07FD,0,AB07FD0135,"Peace River, U/S Smoky River Near Shaftesbury ...",56.09319,-117.56608,...,2020,6,16,June,Jun,MS,Peace River,PR1,P4,4
1748003,ABSP34,04SWE01259,MAC,PEA,07HF,0,AB07HF0010,"Peace River, At Fort Vermilion - Centre",58.40444,-116.12806,...,2004,7,15,July,Jul,MS,Peace River,PR3,P6,6
1749274,ABSR34,23SWE11846,SAS,RED,05CD,0,AB05CD0375,"Red Deer River, At Morrin Bridge - Transect",51.65056,-112.90306,...,2023,6,19,June,Jun,MS,Red Deer River,RDR4,R12,12
1749737,ABSR34,20SWC80604,SAS,RED,05CJ,0,AB05CJ0070,"Red Deer River, D/S Dinosaur Prov Park At Hwy ...",50.83861,-111.17667,...,2020,6,25,June,Jun,MS,Red Deer River,RDR5,R23,23
1750899,ABSS34,22SWC20606,SAS,SSA,05AJ,0,AB05AJ0060,"South Saskatchewan River, Above Medicine Hat",50.04292,-110.72608,...,2022,6,16,June,Jun,MS,South Saskatchewan River,SSR-1,O17,17


Now find how many samples in each basin exceeded the PAL guideline:

In [16]:
# Count the distinct samples (SampleNumber) with an exceedance in each basin
samples_by_basin = selenium_exceedances.groupby('Basin')['SampleNumber']
samples_by_basin.nunique().reset_index()

Unnamed: 0,Basin,SampleNumber
0,Battle River,3
1,Milk River,1
2,Peace River,3
3,Red Deer River,2
4,South Saskatchewan River,1


## A.3 Transforming

Examine mercury data: rank the measurements within each basin and compute the mean rank for each station.

In [17]:
# Uncensored total mercury results, copied so that columns can be added safely
is_total_mercury = data['VariableName'] == "Mercury Total"
is_uncensored = ~data['Censored']
mercury_data = data[is_total_mercury & is_uncensored].copy()

# checking the unit
mercury_data['UnitCode'].unique()

array(['ng/l'], dtype=object)

In [18]:
# Dense rank of every measurement within its basin (rank 1 = highest value)
rank_within_basin = mercury_data.groupby('Basin')['MeasurementValue'].rank(method='dense', ascending=False)
mercury_data['rank'] = rank_within_basin

# Mean rank of each station within its basin, repeated on every row of the station
station_groups = mercury_data.groupby(['Basin', 'Station'], observed=True)
mercury_data['rank_mean'] = station_groups['rank'].transform('mean')

# One row per station / mean rank, lowest mean rank first
station_ranking = mercury_data[['Station', 'rank_mean']].drop_duplicates()
station_ranking.sort_values(by='rank_mean', ascending=True)

Unnamed: 0,Station,rank_mean
1221258,"Battle River, At North End Of Driedmeat Lake",91.530864
1229973,"Michichi Creek, Near The Mouth",100.481481
1221285,"Beaver River, At Highway 892 Bridge North Of A...",106.923077
1235834,"Verdigris Creek, At Hwy 501",108.000000
1226242,"Beaver River, At Gravel Pit, 6 Km U/S Of Ab-Sk...",108.778846
...,...,...
1226825,"Berry Creek, Near Mouth",682.792208
1229766,"Matzhiwin Creek, At Hwy 36",688.714286
1221713,"Red Deer River, 1 Km U/S Hwy 2 Bridge",690.052326
1233776,"Red Deer River, At Sundre",731.562044


The output of the cell above is a risk indicator for each station: the lower the mean rank, the higher the risk.

## A.4 Pivot, Stack, Unstack

This will be beneficial for calculating guidelines that are modified by toxicity factors. An example is total ammonia, where the guideline value depends on pH and water temperature.

In [64]:
# Uncensored field pH and water temperature measurements, in long format
toxicity_modifiers = ["Ph (Field)", "Temperature Water"]
is_modifier = data['VariableName'].isin(toxicity_modifiers)
data_pivot = data.loc[
    is_modifier & ~data['Censored'],
    ['SampleNumber', 'VariableName', 'MeasurementValue']
].reset_index(drop=True).copy()

# Group by and summarize to avoid duplicate sample measurements.
# reset_index() turns the two group keys back into the columns SampleNumber and
# VariableName (their index names), next to the averaged MeasurementValue.
data_pivot = (
    data_pivot
    .groupby(['SampleNumber', 'VariableName'], observed=True)
    .agg({'MeasurementValue': 'mean'})
    .reset_index()
)

# Now pivot the data: one row per sample, one column per variable
data_pivot = data_pivot.pivot(index='SampleNumber', columns='VariableName', values='MeasurementValue')#.reset_index()

# Filter pH and temperature based on Alberta guidelines table (2018 document).
# between() is inclusive on both ends; samples missing either value are dropped.
data_pivot = data_pivot[
    data_pivot['Ph (Field)'].between(6, 10) &
    data_pivot['Temperature Water'].between(0, 30)
]

# Add the guideline column now! use this formula:
# 0.019/(1/(10**(0.0901821 + 2729.92/(Temperature Water + 273.15) - pH Field) + 1))*0.8224
exponent = 0.0901821 + 2729.92 / (data_pivot["Temperature Water"] + 273.15) - data_pivot["Ph (Field)"]
fraction_term = 1 / (10 ** exponent + 1)
data_pivot["Guideline_AmmoniaT_PAL"] = 0.019 / fraction_term * 0.8224
data_pivot

VariableName,Ph (Field),Temperature Water,Guideline_AmmoniaT_PAL
SampleNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
03SWE00753,8.72,8.90,0.190558
03SWE01033,8.16,13.69,0.453391
03SWE01309,8.15,13.13,0.483208
03SWE01533,8.01,9.26,0.887657
03SWE02079,7.99,9.06,0.943272
...,...,...,...
99SWL10610,8.50,13.32,0.221467
99SWL10710,8.41,15.71,0.226822
99SWL10810,8.50,21.60,0.126756
99SWL10910,8.42,12.93,0.270616


In [63]:
# Inspect the row index of the pivoted frame.
# NOTE(review): this cell's execution count (63) is lower than that of the pivot
# cell above (64), and the saved output shows an integer index, so the output
# appears to come from an earlier version of `data_pivot` - re-run to confirm.
data_pivot.index

Index([    0,     1,     2,     3,     4,     7,    10,    11,    12,    13,
       ...
       23157, 23158, 23159, 23161, 23162, 23163, 23164, 23165, 23166, 23167],
      dtype='int64', length=18619)

Alternatively, use pivot_table. I've added station number just to practice stacking. 

In [65]:
# Uncensored field pH and water temperature, this time keeping the station number
is_ph_or_temperature = data['VariableName'].isin(["Ph (Field)", "Temperature Water"])
is_uncensored = ~data['Censored']
selected_columns = ['StationNumber', 'SampleNumber', 'VariableName', 'MeasurementValue']
data_pivot_table = data[is_ph_or_temperature & is_uncensored][selected_columns].copy()

# Now use pivot table instead: samples as rows, (station, variable) pairs as
# columns, repeated measurements averaged.
# I know using stack here is redundant. But it's still practice.
data_pivot_table = pd.pivot_table(
    data_pivot_table,
    values='MeasurementValue',
    index='SampleNumber',
    columns=['StationNumber', 'VariableName'],
    aggfunc="mean",
    observed=True,
)

# Stack: move the StationNumber column level into the row index
data_pivot_table = data_pivot_table.stack(level=['StationNumber'])

# Show the stacked data. It looks like the way the table is displayed has been renewed.
data_pivot_table.head()

  data_pivot_table = data_pivot_table.stack(level=['StationNumber'])


Unnamed: 0_level_0,VariableName,Ph (Field),Temperature Water
SampleNumber,StationNumber,Unnamed: 2_level_1,Unnamed: 3_level_1
03SWE00753,AB07AD0100,8.72,8.9
03SWE01033,AB07AD0100,8.16,13.69
03SWE01309,AB07AD0100,8.15,13.13
03SWE01533,AB07AD0100,8.01,9.26
03SWE02079,AB07AD0100,7.99,9.06


Now unstack:

In [66]:
# Move the StationNumber index level back into the columns, reversing the
# stack() applied to `data_pivot_table` above.
data_pivot_table_unstack = data_pivot_table.unstack(level=['StationNumber'])
# Preview the first rows of the wide result
data_pivot_table_unstack.head()

VariableName,Ph (Field),Ph (Field),Ph (Field),Ph (Field),Ph (Field),Ph (Field),Ph (Field),Ph (Field),Ph (Field),Ph (Field),...,Temperature Water,Temperature Water,Temperature Water,Temperature Water,Temperature Water,Temperature Water,Temperature Water,Temperature Water,Temperature Water,Temperature Water
StationNumber,AB07AA0005,AB07AA0007,AB07AD0100,AB07AD0110,AB07AC0015,AB07AH0005,AB07AG0345,AB07BD0010,AB07BC0025,AB07BK0125,...,AB05CE0700,AB05CE0090,AB05CE0100,AB05CJ0030,AB05CH0120,AB05CJ0070,AB05AJ0060,AB05AH0050,AB05AH0020,AB05AK0990
SampleNumber,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
03SWE00753,,,8.72,,,,,,,,...,,,,,,,,,,
03SWE01033,,,8.16,,,,,,,,...,,,,,,,,,,
03SWE01309,,,8.15,,,,,,,,...,,,,,,,,,,
03SWE01533,,,8.01,,,,,,,,...,,,,,,,,,,
03SWE02079,,,7.99,,,,,,,,...,,,,,,,,,,


## A.4.1 Melt

In [73]:
# Melt the wide `data_pivot` table back to long format: SampleNumber stays as the
# identifier, every other column (including Guideline_AmmoniaT_PAL) becomes a
# VariableName / MeasurementValue pair.
data_pivot.reset_index().melt(id_vars="SampleNumber", var_name="VariableName", value_name="MeasurementValue")

Unnamed: 0,SampleNumber,VariableName,MeasurementValue
0,03SWE00753,Ph (Field),8.720000
1,03SWE01033,Ph (Field),8.160000
2,03SWE01309,Ph (Field),8.150000
3,03SWE01533,Ph (Field),8.010000
4,03SWE02079,Ph (Field),7.990000
...,...,...,...
55843,99SWL10610,Guideline_AmmoniaT_PAL,0.221467
55844,99SWL10710,Guideline_AmmoniaT_PAL,0.226822
55845,99SWL10810,Guideline_AmmoniaT_PAL,0.126756
55846,99SWL10910,Guideline_AmmoniaT_PAL,0.270616


cross-tab:

In [78]:
# One row per distinct (Basin, SITE_TYPE2, StationNumber) combination
station_columns = ['Basin', 'SITE_TYPE2', 'StationNumber']
unique_stations = data.drop_duplicates(subset=station_columns)[station_columns]

# Number of stations of each site type in every basin
pd.crosstab(index=unique_stations['Basin'], columns=unique_stations['SITE_TYPE2'])

SITE_TYPE2,MS,TRIB
Basin,Unnamed: 1_level_1,Unnamed: 2_level_1
Athabasca River,8,7
Battle River,2,0
Beaver River,3,0
Bow River,4,17
Milk River,3,4
North Saskatchewan River,5,21
Oldman River,3,14
Peace River,3,3
Red Deer River,5,18
South Saskatchewan River,2,2


qcut:

In [81]:
# Assign every mercury measurement to one of four quantile-based bins (quartiles)
pd.qcut(mercury_data['MeasurementValue'], q = 4)

1235574      (0.69, 1.23]
1235575      (1.23, 2.72]
1235576      (0.69, 1.23]
1235577     (0.059, 0.69]
1235578      (1.23, 2.72]
                ...      
1235338      (0.69, 1.23]
1235339    (2.72, 7500.0]
1235340      (1.23, 2.72]
1235341      (0.69, 1.23]
1235342      (0.69, 1.23]
Name: MeasurementValue, Length: 12041, dtype: category
Categories (4, interval[float64, right]): [(0.059, 0.69] < (0.69, 1.23] < (1.23, 2.72] < (2.72, 7500.0]]