In [1]:
# Load dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as sts
import numpy as np
from sqlalchemy import create_engine
engine = create_engine("sqlite:///master_db.sqlite",echo=False)

In [2]:
exhist = pd.read_sql_query('select * from exhist',con=engine)

In [3]:
exhist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 689982 entries, 0 to 689981
Data columns (total 13 columns):
extract_point            689982 non-null object
extraction_tons          689982 non-null int64
datestring               689982 non-null int64
total_extraction_tons    689982 non-null int64
reserves                 689982 non-null float64
region                   689982 non-null object
avg_sA                   689982 non-null float64
exhaustion               689982 non-null float64
zone                     689982 non-null object
sector                   689982 non-null object
daily_total_extracted    689982 non-null int64
ep_daily_pct             689982 non-null float64
ep_datestring            689982 non-null object
dtypes: float64(4), int64(4), object(5)
memory usage: 68.4+ MB


In [4]:
date_month = []
date_year = []
datestrings = exhist.datestring.astype(str)
for i in range(len(datestrings)):
    date_month.append(datestrings[i][4:6])
    date_year.append(datestrings[i][:4])
exhist['month'] = date_month
exhist['year'] = date_year
exhist['year_month'] = exhist['year']+exhist['month']
exhist.head()

Unnamed: 0,extract_point,extraction_tons,datestring,total_extraction_tons,reserves,region,avg_sA,exhaustion,zone,sector,daily_total_extracted,ep_daily_pct,ep_datestring,month,year,year_month
0,611703,405,20151231,405,76860.0,7700SW,0.202857,0.526932,611,703,34288,1.181171,61170320151231,12,2015,201512
1,611703,424,20160105,829,76860.0,7700SW,0.202857,1.078584,611,703,28457,1.489967,61170320160105,1,2016,201601
2,611703,463,20160128,1292,76860.0,7700SW,0.202857,1.680978,611,703,22431,2.064108,61170320160128,1,2016,201601
3,611703,81,20160223,1373,76860.0,7700SW,0.202857,1.786365,611,703,20923,0.387134,61170320160223,2,2016,201602
4,611703,54,20160224,1427,76860.0,7700SW,0.202857,1.856622,611,703,20941,0.257867,61170320160224,2,2016,201602


In [5]:
ep_region = []
for i in range(len(exhist)):
    if exhist['region'][i] == '7210A':
        ep_region.append('rU')
    elif exhist['region'][i] == '7210M':
        ep_region.append('rV')
    elif exhist['region'][i] == '7210N':
        ep_region.append('rW')
    elif exhist['region'][i] == '7700NW':
        ep_region.append('rY')
    elif exhist['region'][i] == '7700SW':
        ep_region.append('rZ')
    else:
        ep_region.append('rX')
len(ep_region)

689982

In [6]:
exhist['ep_region'] = ep_region
exhist.head()

Unnamed: 0,extract_point,extraction_tons,datestring,total_extraction_tons,reserves,region,avg_sA,exhaustion,zone,sector,daily_total_extracted,ep_daily_pct,ep_datestring,month,year,year_month,ep_region
0,611703,405,20151231,405,76860.0,7700SW,0.202857,0.526932,611,703,34288,1.181171,61170320151231,12,2015,201512,rZ
1,611703,424,20160105,829,76860.0,7700SW,0.202857,1.078584,611,703,28457,1.489967,61170320160105,1,2016,201601,rZ
2,611703,463,20160128,1292,76860.0,7700SW,0.202857,1.680978,611,703,22431,2.064108,61170320160128,1,2016,201601,rZ
3,611703,81,20160223,1373,76860.0,7700SW,0.202857,1.786365,611,703,20923,0.387134,61170320160223,2,2016,201602,rZ
4,611703,54,20160224,1427,76860.0,7700SW,0.202857,1.856622,611,703,20941,0.257867,61170320160224,2,2016,201602,rZ


In [7]:
# adding wtd averages for summations in grouping
exhist['average_exhaustion'] = exhist['exhaustion']*exhist['extraction_tons']/exhist['daily_total_extracted']
exhist['average_sA'] = exhist['avg_sA']*exhist['extraction_tons']/exhist['daily_total_extracted']

In [8]:
def ep_daily_agg(x):
    column_names = {
        'total_extracted': x['extraction_tons'].sum(),
        'number_extract_points': x['ep_datestring'].count(),
        'average_exhaustion': x['average_exhaustion'].sum(),
        'average_sA': x['average_sA'].sum(),
        'average_percent_extracted_per_point': x['ep_daily_pct'].mean()
    }
    return pd.Series(column_names,index=['total_extracted','number_extract_points','average_exhaustion','average_sA','average_percent_extracted_per_point'])
daily_df = exhist.groupby('datestring').apply(ep_daily_agg).reset_index(drop=False)
daily_df

Unnamed: 0,datestring,total_extracted,number_extract_points,average_exhaustion,average_sA,average_percent_extracted_per_point
0,20010602,3050.0,4.0,0.763969,0.198464,25.000000
1,20010604,23217.0,41.0,0.810689,0.177191,2.439024
2,20010605,46802.0,58.0,1.398749,0.209976,1.724138
3,20010606,45967.0,54.0,1.185720,0.220088,1.851852
4,20010607,7310.0,20.0,0.554372,0.168266,5.000000
...,...,...,...,...,...,...
4856,20200401,24015.0,201.0,88.742560,0.185467,0.497512
4857,20200402,20823.0,191.0,88.294063,0.187480,0.523560
4858,20200403,22208.0,183.0,87.768005,0.190363,0.546448
4859,20200406,21967.0,177.0,87.789035,0.186220,0.564972


In [9]:
date_month2 = []
date_year2 = []
datestrings2 = daily_df.datestring.astype(str)
for i in range(len(datestrings2)):
    date_month2.append(datestrings2[i][4:6])
    date_year2.append(datestrings2[i][:4])
daily_df['month'] = date_month2
daily_df['year'] = date_year2
daily_df['year_month'] = daily_df['year']+daily_df['month']
daily_df.head()

Unnamed: 0,datestring,total_extracted,number_extract_points,average_exhaustion,average_sA,average_percent_extracted_per_point,month,year,year_month
0,20010602,3050.0,4.0,0.763969,0.198464,25.0,6,2001,200106
1,20010604,23217.0,41.0,0.810689,0.177191,2.439024,6,2001,200106
2,20010605,46802.0,58.0,1.398749,0.209976,1.724138,6,2001,200106
3,20010606,45967.0,54.0,1.18572,0.220088,1.851852,6,2001,200106
4,20010607,7310.0,20.0,0.554372,0.168266,5.0,6,2001,200106


In [23]:
monthly_df = daily_df.groupby('year_month').mean()
monthly_df = monthly_df.drop(columns={'datestring'})
monthly_df

Unnamed: 0_level_0,total_extracted,number_extract_points,average_exhaustion,average_sA,average_percent_extracted_per_point
year_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
200106,24282.000000,32.823529,2.263424,0.202696,6.190463
200107,23310.875000,29.437500,4.863413,0.200524,17.024241
200108,22716.625000,58.750000,8.820039,0.193687,3.122817
200109,21996.894737,77.210526,12.339667,0.199420,1.460313
200110,21640.130435,76.304348,15.530007,0.208329,1.403304
...,...,...,...,...,...
201912,13234.357143,94.857143,98.231137,0.184536,1.208324
202001,20024.200000,114.266667,95.303671,0.182888,0.895536
202002,19642.714286,121.928571,99.560971,0.174601,0.927806
202003,23802.000000,145.750000,95.019486,0.183683,0.707284


In [24]:
run_days = exhist.groupby('year_month')['datestring'].nunique()
monthly_df['run_days'] = run_days
monthly_df['average_extracted_per_day'] = (monthly_df['total_extracted']/monthly_df['run_days'])
monthly_df = monthly_df.round({'total_extracted':0,'number_extract_points':0,'average_exhaustion':2,
                              'average_sA':3,'average_percent_extracted_per_point':2,
                              'average_extracted_per_day':0})
monthly_df.reset_index(level=0,inplace=True)
monthly_df

Unnamed: 0,year_month,total_extracted,number_extract_points,average_exhaustion,average_sA,average_percent_extracted_per_point,run_days,average_extracted_per_day
0,200106,24282.0,33.0,2.26,0.203,6.19,17,1428.0
1,200107,23311.0,29.0,4.86,0.201,17.02,16,1457.0
2,200108,22717.0,59.0,8.82,0.194,3.12,16,1420.0
3,200109,21997.0,77.0,12.34,0.199,1.46,19,1158.0
4,200110,21640.0,76.0,15.53,0.208,1.40,23,941.0
...,...,...,...,...,...,...,...,...
222,201912,13234.0,95.0,98.23,0.185,1.21,14,945.0
223,202001,20024.0,114.0,95.30,0.183,0.90,15,1335.0
224,202002,19643.0,122.0,99.56,0.175,0.93,14,1403.0
225,202003,23802.0,146.0,95.02,0.184,0.71,16,1488.0


In [32]:
# Import assay data
daily_assays = pd.read_csv('../Data-Analytics-Final_Project/Resources/daily_assays.csv')
month_actuals = pd.read_csv('../Data-Analytics-Final_Project/Resources/month_actuals.csv')

In [30]:
monthly_df['year_month'] = monthly_df['year_month'].astype(int)
monthly_data = month_actuals.merge(monthly_df,on='year_month')
monthly_data

Unnamed: 0,year_month,actual_pA,month,year,total_extracted,number_extract_points,average_exhaustion,average_sA,average_percent_extracted_per_point,run_days,average_extracted_per_day
0,200106,0.187,6,2001,24282.0,33.0,2.26,0.203,6.19,17,1428.0
1,200107,0.178,7,2001,23311.0,29.0,4.86,0.201,17.02,16,1457.0
2,200108,0.194,8,2001,22717.0,59.0,8.82,0.194,3.12,16,1420.0
3,200109,0.202,9,2001,21997.0,77.0,12.34,0.199,1.46,19,1158.0
4,200110,0.190,10,2001,21640.0,76.0,15.53,0.208,1.40,23,941.0
...,...,...,...,...,...,...,...,...,...,...,...
220,201911,0.128,11,2019,14984.0,106.0,90.60,0.180,1.16,16,936.0
221,201912,0.137,12,2019,13234.0,95.0,98.23,0.185,1.21,14,945.0
222,202001,0.141,1,2020,20024.0,114.0,95.30,0.183,0.90,15,1335.0
223,202002,0.150,2,2020,19643.0,122.0,99.56,0.175,0.93,14,1403.0


In [33]:
daily_assays.head()

Unnamed: 0,date,pA,pB,pD,pC,month,year,year_month
0,6/1/2001,0.184,,,,6,2001,200106
1,6/2/2001,0.182,,,,6,2001,200106
2,6/4/2001,0.177,,,,6,2001,200106
3,6/5/2001,0.195,,,,6,2001,200106
4,6/6/2001,0.18,,,,6,2001,200106


In [37]:
month_contaminants = daily_assays.groupby('year_month').mean()
month_contaminants = month_contaminants.drop(columns={'pA','month','year'})
month_contaminants = month_contaminants.rename(columns={'pB':'average_pB','pD':'average_pD','pC':'average_pC'})
month_contaminants

Unnamed: 0_level_0,average_pB,average_pD,average_pC
year_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
200106,,,
200107,,,
200108,,,
200109,,,
200110,,,
...,...,...,...
201911,2.838515,0.007970,0.001930
201912,3.022407,0.008481,0.002070
202001,3.074622,0.009533,0.002242
202002,2.754897,0.010231,0.001921
