In [None]:
# Packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import precision_score, recall_score, f1_score
from itertools import combinations
from statistics import mean
from sklearn.neighbors._base import _get_weights
from sklearn.utils.extmath import weighted_mode
from sklearn.naive_bayes import GaussianNB

In [None]:
import os
# Show the kernel's current working directory; the relative CSV path read in the
# next cell is resolved against it.
os.getcwd()
# If the data lives elsewhere, change directory first (machine-specific example kept from the author):
#%cd "C:\\Users\\graduate\\OneDrive - USU\\Desktop\p6"

In [None]:
# Load the baseflow table from the working directory.
df = pd.read_csv('RRCA_baseflow.csv')

# Shift the serial day number in `Date` by a fixed offset.
# NOTE(review): the meaning of 693963 is not shown in this notebook — presumably a
# change of day-count epoch; confirm against the data source.
df['S_Date'] = df['Date'].sub(693963)

df.head()

In [None]:
# Derive approximate calendar fields from the serial day number in `Date`.
import datetime  # not used in this cell; kept in case later cells rely on it

DAYS_PER_YEAR = 365.2422    # average year length used for the conversion
DAYS_PER_MONTH = 30.43685   # one twelfth of DAYS_PER_YEAR

# Fractional day within the (average-length) year, in [0, DAYS_PER_YEAR).
df['dayoyear'] = df['Date'].mod(DAYS_PER_YEAR)

# Whole number of average years elapsed since day 0 of the `Date` scale.
df['year'] = df['Date'].div(DAYS_PER_YEAR).astype(int)

# Adding half a month before truncating rounds to the nearest month boundary,
# so the result can take any value from 0 through 12 (13 distinct values).
df['month'] = df['dayoyear'].add(DAYS_PER_MONTH / 2).div(DAYS_PER_MONTH).astype(int)

df.head()


In [None]:
# Distinct values of the derived `year` column, printed in ascending order.
df1 = pd.unique(df['year'])
years_ascending = sorted(df1)
print(years_ascending)

In [None]:
# One-hot encode the segment identifier: one indicator column per distinct
# Segment_id, with column names prefixed by "s".
seg_hot = pd.get_dummies(df['Segment_id'], prefix='s')

# Attach the indicator columns to the main frame (aligned on the row index).
# Re-running this cell on an already-extended `df` raises because the columns overlap.
df = df.join(seg_hot)

In [None]:
# Restrict the analysis to recent years; the unfiltered frame stays reachable as `odf`.
# Rationale from the author: years before 1950 had fewer samples and significantly
# higher Observed values.
# NOTE(review): `> 1950` also drops the year 1950 itself, although the rationale says
# "before 1950" — confirm whether `>= 1950` was intended.
# NOTE: re-running this cell rebinds `odf` to the already-filtered frame.
odf = df

# Take an explicit copy: later cells add columns to `df`, and doing that on a
# filtered slice of `odf` triggers pandas' SettingWithCopyWarning.
df = df[df.year > 1950].copy()

In [None]:
# Months are re-ranked instead of being used numerically: they are correlated in
# time, but not linearly (more like a sine wave). A month's position in MONTH_ORDER
# becomes its `adjmonth` value.
MONTH_ORDER = [7, 6, 8, 9, 5, 10, 11, 4, 0, 1, 3, 2]
month_rank = {month: rank for rank, month in enumerate(MONTH_ORDER)}

# `month` is obtained by rounding to the nearest month boundary, so it can be 12 as
# well as 0; both denote the turn of the year. Give 12 the rank of 0 rather than
# failing with ValueError as a plain list.index(12) lookup would.
month_rank[12] = month_rank[0]

# Vectorised dictionary lookup instead of a per-row Python lambda.
adjmonth = df['month'].map(month_rank)

# Any value outside 0..12 is still an error (same exception type as list.index).
if adjmonth.isna().any():
    unknown = sorted(df.loc[adjmonth.isna(), 'month'].unique())
    raise ValueError(f"month values without a rank: {unknown}")

# assign() returns a new frame, so no column is written into a filtered slice
# (avoids SettingWithCopyWarning) and re-running the cell is harmless.
df = df.assign(adjmonth=adjmonth.astype('int64'))
df.head()

In [None]:
# Bar chart of Observed grouped by the derived month column.
sns.barplot(data=df, y="Observed", x="month")

In [None]:
# For every segment, draw Observed against each of the three drivers side by side
# on a shared y-axis; each panel is titled with the segment id.
for seg in df.Segment_id.unique():
    dfa = df[df['Segment_id'] == seg]
    fig, axs = plt.subplots(1, 3, sharey=True, figsize=(16, 8))
    for ax, driver in zip(axs, ['Evapotranspiration', 'Precipitation', 'Irrigation_pumping']):
        dfa.plot(kind='scatter', x=driver, y='Observed', ax=ax, title=seg)

In [None]:
# Scatter of the `x` and `y` columns (the author's "location" plot).
df.plot.scatter(x='x', y='y')

In [None]:
# Observed vs. year, restricted to rows whose Irrigation_pumping value is negative.
negative_pumping = df.loc[df['Irrigation_pumping'].lt(0)]
negative_pumping.plot.scatter(x='year', y='Observed')

In [None]:
# Ascending array of the distinct years present in `df`.
x = np.sort(df.year.unique())
x

In [None]:
# Observed baseflow by year over the whole dataset (`odf`), to contrast with the
# frame restricted to the years after 1950.
sns.lineplot(data=odf, y="Observed", x="year")


In [None]:
# Observed baseflow by year for the filtered frame `df`.
sns.lineplot(data=df, y="Observed", x="year")

In [None]:
# Ascending array of the distinct segment ids present in `df`.
uniquesegments = np.sort(df.Segment_id.unique())
uniquesegments


In [None]:
# How Observed flow is distributed across segment ids.
sns.lineplot(data=df, y="Observed", x="Segment_id")

In [None]:
# Isolate a single segment (id 178) for a closer look.
# NOTE(review): the earlier comment here named segment 51 as the minimum-flow
# segment, but the code filters on 178 — confirm which one was intended.
segments178 = df.loc[df['Segment_id'].eq(178)]
display(segments178.head())

# Distinct `y` values within this segment (author's note: each segment is just
# the part where the station is located).
segments178['y'].unique()

In [None]:
# Graphs for segment 178: Observed against each driver, on a shared y-axis.
# (The earlier heading said "Segment 96"; the data plotted is `segments178`.)
fig, axs = plt.subplots(1, 3, sharey=True, figsize=(16, 8))
ax_et, ax_precip, ax_pumping = axs
segments178.plot.scatter(x='Evapotranspiration', y='Observed', ax=ax_et, title=178)
segments178.plot.scatter(x='Precipitation', y='Observed', ax=ax_precip, title=178)
segments178.plot.scatter(x='Irrigation_pumping', y='Observed', ax=ax_pumping, title=178)

### Fit linear regression models for the selected segment and inspect their coefficients

In [None]:
# scikit-learn linear regression for the selected segment:
# Observed explained by Precipitation alone.
from sklearn.linear_model import LinearRegression

# Predictors (X) and response (y)
feature_cols = ['Precipitation']
X = segments178[feature_cols]
y = segments178['Observed']

# fit() returns the estimator itself, so construction and fitting can be chained
lm = LinearRegression().fit(X, y)

# Pair each feature name with its fitted coefficient
print(list(zip(feature_cols, lm.coef_)))

In [None]:
# scikit-learn linear regression for the selected segment:
# Observed explained by Evapotranspiration and Irrigation_pumping.
from sklearn.linear_model import LinearRegression

# Predictors (X) and response (y)
feature_cols = ['Evapotranspiration', 'Irrigation_pumping']
X = segments178[feature_cols]
y = segments178['Observed']

# fit() returns the estimator itself, so construction and fitting can be chained
lm = LinearRegression().fit(X, y)

# Pair each feature name with its fitted coefficient
print(list(zip(feature_cols, lm.coef_)))

In [None]:
# scikit-learn linear regression for the selected segment:
# Observed explained by all three drivers.
from sklearn.linear_model import LinearRegression

# Predictors (X) and response (y)
feature_cols = ['Evapotranspiration', 'Precipitation', 'Irrigation_pumping']
X = segments178[feature_cols]
y = segments178['Observed']

# fit() returns the estimator itself, so construction and fitting can be chained
lm = LinearRegression().fit(X, y)

# Pair each feature name with its fitted coefficient
print(list(zip(feature_cols, lm.coef_)))

### See the effect of each independent variable in predicting observed baseflow for the selected segment

In [None]:
import statsmodels.formula.api as smf

# OLS through the formula interface, one predictor: Irrigation_pumping.
# Note: `lm` is rebound here to a statsmodels results object.
ols_spec = smf.ols('Observed ~ Irrigation_pumping', data=segments178)
lm = ols_spec.fit()

# Fitted parameters of the model
display(lm.params)

# Summary of the fitted model, shown as the cell output
lm.summary()

In [None]:
import statsmodels.formula.api as smf

# OLS through the formula interface, two predictors:
# Evapotranspiration and Irrigation_pumping.
ols_spec = smf.ols('Observed ~ Evapotranspiration + Irrigation_pumping', data=segments178)
lm = ols_spec.fit()

# Fitted parameters of the model
display(lm.params)

# Summary of the fitted model, shown as the cell output
lm.summary()

In [None]:
import statsmodels.formula.api as smf

# OLS through the formula interface with all three predictors:
# Evapotranspiration, Irrigation_pumping and Precipitation.
ols_spec = smf.ols('Observed ~ Evapotranspiration + Irrigation_pumping + Precipitation', data=segments178)
lm = ols_spec.fit()

# Fitted parameters of the model
display(lm.params)

# Summary of the fitted model, shown as the cell output
lm.summary()