# Water Usage Analysis with SAS

## Download and preview the XLS data

In [2]:
/* Force imported column names to be valid V7 SAS names
   (e.g. "Usage (in Gallons)" becomes Usage__in_Gallons) */
options validvarname=v7;

/* Point a fileref at the raw XLS workbook hosted on GitHub */
filename xlfile url 'https://github.com/pestyld/data_projects/raw/master/water_usage_analysis/data/AMI_METER_READS-METER_INFO_HOURLY.xls';

/* Read the workbook into WORK.WATER_USAGE, overwriting any existing copy */
proc import datafile=xlfile dbms=xls out=work.water_usage replace;
run;

/* Show the first rows of the imported table */
proc print data=work.water_usage (obs=5);
run;

/* List only the variable attributes (name, type, length, format) */
ods select variables;
proc contents data=work.water_usage;
run;

Obs,Service,Read_Date_Time,Usage__in_Gallons
1,Water,11/15/23 12:00 AM,0
2,Water,11/14/23 11:00 PM,0
3,Water,11/14/23 10:00 PM,0
4,Water,11/14/23 9:00 PM,0
5,Water,11/14/23 8:00 PM,0
6,Water,11/14/23 7:00 PM,0
7,Water,11/14/23 6:00 PM,0
8,Water,11/14/23 5:00 PM,0
9,Water,11/14/23 4:00 PM,0
10,Water,11/14/23 3:00 PM,0

Alphabetic List of Variables and Attributes,Alphabetic List of Variables and Attributes,Alphabetic List of Variables and Attributes,Alphabetic List of Variables and Attributes,Alphabetic List of Variables and Attributes,Alphabetic List of Variables and Attributes,Alphabetic List of Variables and Attributes
#,Variable,Type,Len,Format,Informat,Label
2,Read_Date_Time,Char,17,$17.,$17.,Read Date/Time
1,Service,Char,12,$12.,$12.,Service
3,Usage__in_Gallons,Char,17,$17.,$17.,Usage in Gallons


## Prepare Data - Create Final Hourly Data
- Modify the **Read_Date_Time** character column to a valid date value
- Rename Usage in gallons column, convert to numeric
- Create **Date** column
- Create **Time** column
- Format all columns accordingly
- Add labels
- Drop unnecessary columns

In [4]:
/* Build WORK.WATER_CLEAN: the hourly readings with a numeric usage column,
   a true SAS datetime, and derived Date / Time / Month / Year / MonthYear columns. */
data water_clean;
    /* The imported usage column is character; rename it on the way in so the
       numeric replacement can reuse the clean name */
    set water_usage (rename=(Usage__in_Gallons = Usage_in_Gallons_char));
    
    /* Convert usage to numeric.
       The source column is 17 characters wide, so read the full width; the
       COMMA informat also accepts values written with thousands separators
       (e.g. "1,290"), which a plain 8. informat would reject or truncate. */
    usage_in_gallons = input(Usage_in_Gallons_char, comma17.);
    
    /* Convert the character timestamp (e.g. "11/15/23 12:00 AM") to a SAS datetime */
    read_date = input(Read_Date_Time, mdyampm23.);
    
    /* Derive date parts. Month and MonthYear hold the full date value and
       rely on their formats (below) to display only the month / month-year. */
    Date = datepart(read_date);
    Time = timepart(read_date);
    Month = Date;
    Year = year(Date);
    MonthYear = Date;
    
    /* Display formats */
    format 
        read_date mdyampm23.
        Date date9.
        Time timeampm.
        Month monname.
        MonthYear monyy7.
        usage_in_gallons comma15.
    ;
    
    /* Labels */
    label
        read_date = 'Read Date'
        usage_in_gallons = 'Total Gallons'
    ;
    
    /* Drop the original character columns and the constant Service column */
    drop 
        Service 
        Read_Date_Time
        Usage_in_Gallons_char
    ;
run;

/* Preview the cleaned data */
proc print data=water_clean(obs=5);
run;


/* View column metadata */
ods select variables;
proc contents data=water_clean;
run;

Obs,usage_in_gallons,read_date,Date,Time,Month,Year,MonthYear
1,0,11/15/2023 12:00 AM,15NOV2023,12:00:00 AM,November,2023,NOV2023
2,0,11/14/2023 11:00 PM,14NOV2023,11:00:00 PM,November,2023,NOV2023
3,0,11/14/2023 10:00 PM,14NOV2023,10:00:00 PM,November,2023,NOV2023
4,0,11/14/2023 9:00 PM,14NOV2023,9:00:00 PM,November,2023,NOV2023
5,0,11/14/2023 8:00 PM,14NOV2023,8:00:00 PM,November,2023,NOV2023

Alphabetic List of Variables and Attributes,Alphabetic List of Variables and Attributes,Alphabetic List of Variables and Attributes,Alphabetic List of Variables and Attributes,Alphabetic List of Variables and Attributes,Alphabetic List of Variables and Attributes
#,Variable,Type,Len,Format,Label
3,Date,Num,8,DATE9.,
5,Month,Num,8,MONNAME.,
7,MonthYear,Num,8,MONYY7.,
4,Time,Num,8,TIMEAMPM.,
6,Year,Num,8,,
2,read_date,Num,8,MDYAMPM23.,Read Date
1,usage_in_gallons,Num,8,COMMA15.,Total Gallons


## Explore Data

### Overall min, mean and max of hourly data

In [5]:
/* Overall summary of the hourly data: max / mean / min gallons per reading and
   the first and last reading dates. NOPRINT is used because the result is
   written to WORK.DATA_SUMMARY and printed separately below. */
proc means data=water_clean noprint;
    var usage_in_gallons date;
    output out=data_summary(drop=_TYPE_) 
        /* MinGal (previously misspelled MinGal4) now matches MaxGal / MeanGal */
        max(usage_in_gallons)=MaxGal mean(usage_in_gallons)=MeanGal min(usage_in_gallons)=MinGal
        max(date)=MaxDate min(date)=MinDate
        ;
run;

/* _FREQ_ in the printed table is the total number of hourly observations */
title "Total obs, Max, Mean and Min usage and date by Hour";
proc print data=data_summary;
run;
title;

Obs,_FREQ_,MaxGal,MeanGal,MinGal4,MaxDate,MinDate
1,9796,290,5,0,15NOV2023,01OCT2022


## Visualizations

### 1. Analyzing Water Usage Monthly

In [7]:
/* Peek at the first five cleaned hourly rows before summarising by month */
proc print data=work.water_clean (obs=5);
run;

Obs,usage_in_gallons,read_date,Date,Time,Month,Year,MonthYear
1,0,11/15/2023 12:00 AM,15NOV2023,12:00:00 AM,November,2023,NOV2023
2,0,11/14/2023 11:00 PM,14NOV2023,11:00:00 PM,November,2023,NOV2023
3,0,11/14/2023 10:00 PM,14NOV2023,10:00:00 PM,November,2023,NOV2023
4,0,11/14/2023 9:00 PM,14NOV2023,9:00:00 PM,November,2023,NOV2023
5,0,11/14/2023 8:00 PM,14NOV2023,8:00:00 PM,November,2023,NOV2023


#### Data Prep

In [173]:
/* Aggregate the hourly readings to one row per MonthYear.
   The PROC MEANS "Summary" ODS table (N and Sum of usage) is captured
   as WORK.MONTHLY_SUMMARY. */
ods output Summary=monthly_summary;
proc means data=work.water_clean n sum;
    class MonthYear;
    var usage_in_gallons;
run;


/* Enrich the monthly table with meter status, a per-day average, and
   separate plotting columns for working vs. broken-meter months. */
data work.monthly_summary;
    length MeterStatus $7;
    set work.monthly_summary;

    /* Months from August 2023 onward are treated as broken-meter months */
    if MonthYear >= '01AUG2023'd then MeterStatus = 'Broken';
    else MeterStatus = 'Working';

    /* Calendar days in the month, then average gallons per day (rounded) */
    num_days_in_month = day(intnx('month', MonthYear, 0, 'end'));
    avg_gallons_per_day = round(usage_in_gallons_Sum / num_days_in_month);

    /* Route the values into status-specific columns:
       *_broken columns feed the red broken-meter lines,
       *_labels columns supply data labels for working months only */
    select (MeterStatus);
        when ('Broken') do;
            usage_in_gallons_broken = usage_in_gallons_Sum;
            usage_in_gallons_avg_broken = avg_gallons_per_day;
        end;
        otherwise do;
            usage_in_gallons_sum_labels = usage_in_gallons_Sum;
            usage_in_gallons_avg_labels = avg_gallons_per_day;
        end;
    end;

    /* Display formats */
    format MonthYear monyy7.
           usage_in_gallons_Sum usage_in_gallons_sum_labels comma16.
    ;
run;

/* Show the finished monthly table */
proc print data=work.monthly_summary;
run;

Analysis Variable : usage_in_gallons Total Gallons,Analysis Variable : usage_in_gallons Total Gallons,Analysis Variable : usage_in_gallons Total Gallons,Analysis Variable : usage_in_gallons Total Gallons
MonthYear,N Obs,N,Sum
OCT22,743,743,5720.0
NOV22,720,720,5000.0
DEC22,744,744,3840.0
JAN23,744,744,4550.0
FEB23,672,672,3890.0
MAR23,743,743,4050.0
APR23,720,720,5280.0
MAY23,744,744,3820.0
JUN23,720,720,3420.0
JUL23,744,744,4650.0

Obs,MeterStatus,MonthYear,NObs,usage_in_gallons_N,usage_in_gallons_Sum,num_days_in_month,avg_gallons_per_day,usage_in_gallons_broken,usage_in_gallons_avg_broken,usage_in_gallons_sum_labels,usage_in_gallons_avg_labels
1,Working,OCT2022,743,743,5720,31,185,.,.,5720,185
2,Working,NOV2022,720,720,5000,30,167,.,.,5000,167
3,Working,DEC2022,744,744,3840,31,124,.,.,3840,124
4,Working,JAN2023,744,744,4550,31,147,.,.,4550,147
5,Working,FEB2023,672,672,3890,28,139,.,.,3890,139
6,Working,MAR2023,743,743,4050,31,131,.,.,4050,131
7,Working,APR2023,720,720,5280,30,176,.,.,5280,176
8,Working,MAY2023,744,744,3820,31,123,.,.,3820,123
9,Working,JUN2023,720,720,3420,30,114,.,.,3420,114
10,Working,JUL2023,744,744,4650,31,150,.,.,4650,150


#### Default Visual

In [210]:
/* Baseline chart with SGPLOT defaults: monthly total and per-day average
   drawn as two line series on the same axis, for comparison with the
   styled version that follows. (Title typo "Useage" corrected.) */
title "DEFAULT GRAPH: Monthly Water Usage from October 2022 to November 2023";
proc sgplot data=monthly_summary;
    vline MonthYear / response=usage_in_gallons_Sum;
    vline MonthYear / response=avg_gallons_per_day;
run;
title;

#### Final Visual

Create macro variables for specific settings

In [167]:
/* Set default colors */
%let textColor = CX3D444F;
%let myBlue = CX0766D1;
%let lightGray = CXC1C7C9;


/* Create the max y value for the graph: increase the largest monthly total
   by 25% and round to the nearest 1,000. The result is stored in the
   macro variable maxYValue. */
proc sql noprint;
    select round(max(usage_in_gallons_Sum) * 1.25, 1000)
        into :maxYValue trimmed
        from monthly_summary;   /* terminating semicolon was missing, which made QUIT a table alias */
quit;
%put &=maxYValue;


339                                                        The SAS System                          09:09 Thursday, November 23, 2023

6611       ods listing close;ods html5 (id=saspy_internal) file=_tomods1 options(bitmap_mode='inline') device=svg style=HTMLBlue;
6611     ! ods graphics on / outputfmt=png;
[38;5;21mNOTE: Writing HTML5(SASPY_INTERNAL) Body file: _TOMODS1[0m
6612       
6613       /* Set default colors */
6614       %let textColor = CX3D444F;
6615       %let myBlue = CX0766D1;
6616       %let lightGray = CXC1C7C9;
6617       
6618       
6619       /* Create max y value for the graph by increasing the max value by %25  and rounding to the nearest 1,000 */
6620       proc sql noprint;
6621           select round(max(usage_in_gallons_Sum) * 1.25, 1000)
6622               into :maxYValue trimmed
6623               from monthly_summary
6624       quit;
6625       %put &=maxYValue;
MAXYVALUE=7000
6626       
6627       
6628       ods html5 (id=saspy_internal) close;ods li

Create my annotation table to add annotations to the visual.

In [168]:
/* Import the annotation macros (%sgtext, %sgrectangle, ...) */
%SGANNO

/* Create annotation data set for the graph.
   Each macro call below writes one row to WORK.MYANNO; the data set is
   later passed to PROC SGPLOT through the SGANNO= option.
   Requires the macro variables lightGray, textColor and maxYValue to exist. */
data myAnno;
    /* 2022 and 2023 labels: light-gray year markers anchored at the first
       plotted month of each year, just above the bottom of the y axis (y1=2) */
    %sgtext(drawspace='DATAVALUE',x1='01Oct2022'd, y1=2, label="2022", width = 10, justify="left", textcolor = "&lightGray", textSize=16, anchor='bottomleft');
    %sgtext(drawspace='DATAVALUE',x1='01Jan2023'd, y1=2, label="2023", width = 10, justify="left", textcolor = "&lightGray", textSize=16, anchor='bottomleft');
    /* Bad water meter text and shading: explanatory note placed at the top of
       the y range (y1=&maxYValue) starting at August 2023 */
    %sgtext(drawspace='DATAVALUE', x1='01Aug2023'd, y1=&maxYValue, label="Our home water meter broke on August 19, 2023, and has not been repaired.", width = 25, justify="center", 
            textcolor = "&textColor", textSize=11, anchor='topleft', discreteoffset=+.15);
    /* Nearly transparent red rectangle from y=0 up to &maxYValue starting at
       August 2023. reset='all' is passed so the text attributes set by the
       earlier calls are blank on this row (see the printed table).
       NOTE(review): width is given as a date literal (an absolute date value,
       23284 in the printed table) rather than a span in data units - confirm
       the resulting rectangle extent is what is intended. */
    %sgrectangle(drawspace='datavalue', 
                 x1='01Aug2023'd , widthunit='data', width='01Oct2023'd,
                 y1=0, heightunit='data', height=&maxYValue,
                 display = 'fill', filltransparency=.95, fillcolor='red', anchor='bottomleft',reset='all');
run;

/* View the data */
proc print data=myAnno;
run;

Obs,ANCHOR,DISPLAY,DRAWSPACE,FILLCOLOR,FUNCTION,HEIGHTUNIT,JUSTIFY,LABEL,TEXTCOLOR,WIDTHUNIT,TEXTSIZE,WIDTH,X1,Y1,DISCRETEOFFSET,FILLTRANSPARENCY,HEIGHT
1,bottomleft,,DATAVALUE,,TEXT,,left,2022,CXC1C7C9,,16,10,22919,2,.,.,.
2,bottomleft,,DATAVALUE,,TEXT,,left,2023,CXC1C7C9,,16,10,23011,2,.,.,.
3,topleft,,DATAVALUE,,TEXT,,center,"Our home water meter broke on August 19, 2023, and has not been repaired.",CX3D444F,,11,25,23223,7000,0.15,.,.
4,bottomleft,fill,datavalue,red,RECTANGLE,data,,,,data,.,23284,23223,0,.,0.95,7000


Create final visualization

In [208]:
/* Final styled monthly chart.
   Requires: WORK.MONTHLY_SUMMARY, WORK.MYANNO, and the macro variables
   textColor, myBlue and maxYValue created in the earlier cells. */
title justify = left color = &textColor height=14pt "Flowing Through Time: A Visual Journey of My Family's Monthly Water Usage (Gallons)";
title2 justify = left color = &textColor height=12pt  "October 2022 - November 2023";
/* Footnote corrected: the Avg series plots avg_gallons_per_day, i.e. a per-day average */
footnote justify = left color = &textColor height=8pt italic "Total is total usage per month, Avg is average usage per day in each month";
footnote2 justify = left color = &textColor height=8pt italic "Created on November 11, 2023";

ods graphics / width = 9in height = 4in;
proc sgplot data = monthly_summary
            noborder 
            nowall
            sganno = myAnno
            noautolegend;
    /* Refline for new year */
    refline 'Jan2023' / axis=x labelpos=min labelloc=inside lineattrs=(color=lightgray);
    
    /* Sum gallons lines (working and broken).
       The blue line covers every month; data labels come from the *_labels
       column, which is populated for working-meter months only. */
    vline MonthYear / 
        response=usage_in_gallons_Sum
        lineattrs=(thickness=3 color=&myBlue)
        markers markerattrs=(symbol=CircleFilled size=10 color=&myBlue)
        datalabel=usage_in_gallons_sum_labels datalabelattrs=(color=&textColor)
        dataskin=none 
        curvelabel='Total' curvelabelpos=min curvelabelattrs=(color=&myBlue)
    ;
    /* Red overlay for broken-meter months; markers are now explicitly red so
       they match the line and the Avg broken series below */
    vline MonthYear /
        response=usage_in_gallons_broken
        lineattrs=(thickness=3 color=red)
        markers markerattrs=(color=red symbol=CircleFilled size=10)
        dataskin=none
    ;
    /* Avg gallons lines (working and broken), drawn against the Y2 axis */
    vline MonthYear / 
        response=avg_gallons_per_day 
        y2axis
        lineattrs=(color=&myBlue)
        datalabel=usage_in_gallons_avg_labels datalabelattrs=(color=&textColor)
        dataskin=none
        markers markerattrs=(color=&myBlue symbol=CircleFilled size=6)
        curvelabel='Avg' curvelabelpos=min curvelabelattrs=(color=&myBlue)
    ;
    vline MonthYear /
        response=usage_in_gallons_avg_broken
        y2axis
        lineattrs=(color=red)
        markers markerattrs=(color=red symbol=CircleFilled size=6)
        dataskin=none
    ;
    /* Axis attributes: both y axes are hidden; values are read from data labels */
    xaxis display=(NOLABEL NOTICKS)
          valueattrs=(color=&textColor size=10pt);
    yaxis display=NONE 
          offsetmin=0
          max=&maxYValue
          label='' 
          labelattrs=(color=&textColor size=12pt);
    
    y2axis display=NONE 
          offsetmin=0
          max=800
          labelattrs=(color=&textColor size=12pt);
run;

/* Restore defaults so titles, footnotes and graph size do not leak into later cells */
title;
footnote;
ods graphics / reset;

### 2. Analyzing Water Usage by Hour

In [264]:
/* Peek at the first five cleaned hourly rows before charting usage by hour */
proc print data=work.water_clean (obs=5);
run;

Obs,usage_in_gallons,read_date,Date,Time,Month,MonthYear
1,0,11/15/2023 12:00 AM,15NOV2023,12:00:00 AM,November,NOV2023
2,0,11/14/2023 11:00 PM,14NOV2023,11:00:00 PM,November,NOV2023
3,0,11/14/2023 10:00 PM,14NOV2023,10:00:00 PM,November,NOV2023
4,0,11/14/2023 9:00 PM,14NOV2023,9:00:00 PM,November,NOV2023
5,0,11/14/2023 8:00 PM,14NOV2023,8:00:00 PM,November,NOV2023


In [116]:
/* Default bar chart of gallons by hour of day, one graph per MonthYear BY group.
   NOTSORTED is specified because the rows are not in ascending MonthYear order
   (the data starts with the most recent reading). */
proc sgplot data=work.water_clean;
    by MonthYear notsorted;
    vbar Time / response=usage_in_gallons;
run;

In [125]:
/* Styled bar chart of total gallons by hour of day, one graph per MonthYear.
   Requires the macro variables textColor and myBlue from the earlier settings cell. */
ods graphics / width = 8in height = 4in;

/* Suppress the default BY line; the BY value is shown in the title via #BYVAL1 */
options nobyline;

title justify = left height = 16pt color=&textColor "#BYVAL1 - Total Water Usage in Gallons by Hour";

proc sgplot data = water_clean
            noborder
            nowall;
    by MonthYear notsorted;
    vbar Time / 
        response = usage_in_gallons
        stat=sum
        nooutline
        barwidth = .5
        fillattrs = (color = &myBlue)
        datalabels
    ;
    /* Fixed y maximum so every month's graph shares the same scale */
    yaxis max = 1250 display=NONE;
    xaxis display=(nolabel);
run;

/* Restore defaults: clear the #BYVAL1 title so it does not carry over to
   later output, and reset the BY line and graph size */
title;
options byline;
ods graphics / reset;

In [54]:
/* Sum, maximum, mean and minimum gallons for each hour of the day */
proc means data=work.water_clean sum max mean min;
    class Time;
    var usage_in_gallons;
run;

Analysis Variable : usage_in_gallons,Analysis Variable : usage_in_gallons,Analysis Variable : usage_in_gallons,Analysis Variable : usage_in_gallons,Analysis Variable : usage_in_gallons,Analysis Variable : usage_in_gallons
Time,N Obs,Sum,Maximum,Mean,Minimum
12:00:00 AM,409,610.0,60.0,1.4914425,0
1:00:00 AM,408,350.0,30.0,0.8578431,0
2:00:00 AM,407,50.0,10.0,0.1228501,0
3:00:00 AM,408,40.0,10.0,0.0980392,0
4:00:00 AM,408,20.0,10.0,0.0490196,0
5:00:00 AM,408,80.0,20.0,0.1960784,0
6:00:00 AM,408,250.0,30.0,0.6127451,0
7:00:00 AM,408,430.0,50.0,1.0539216,0
8:00:00 AM,408,1290.0,40.0,3.1617647,0
9:00:00 AM,408,2250.0,70.0,5.5147059,0


### Total gallons used per month and year

In [47]:
/* Total gallons and observation count per month.
   Date is grouped by its MONYY. formatted value, so each class level is one month.
   The ODS "Summary" table is captured as WORK.TEST.

   Changes from the earlier version:
   - NOPRINT removed: with NOPRINT no ODS tables are produced, so
     ODS OUTPUT Summary= had nothing to capture.
   - OUTPUT OUT=monthly_summary removed: it overwrote the WORK.MONTHLY_SUMMARY
     table that the monthly visualizations are built from. */
ods output Summary=test;
proc means data=water_clean sum n;
    var usage_in_gallons;
    class date;
    format date monyy.;
run;

proc print data=test;
run;

Obs,Date,NObs,usage_in_gallons_Sum,usage_in_gallons_N
1,OCT22,743,5720,743
2,NOV22,720,5000,720
3,DEC22,744,3840,744
4,JAN23,744,4550,744
5,FEB23,672,3890,672
6,MAR23,743,4050,743
7,APR23,720,5280,720
8,MAY23,744,3820,744
9,JUN23,720,3420,720
10,JUL23,744,4650,744
