In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats

In [None]:
# Load the observations from the CSV (path is relative to the working
# directory) and show the column names as the cell output.
hydro_data = pd.read_csv(filepath_or_buffer='RRCA_baseflow.csv')
hydro_data.columns

In [None]:
# Preview the first five rows of the loaded frame.
hydro_data.head(n=5)

In [None]:
# Print the per-column dtype / non-null summary of the loaded frame.
hydro_data.info()

In [None]:
# Keep only the rows that have a value in every column.
hydro_data = hydro_data.dropna(axis='index', how='any')

In [None]:
from datetime import datetime, timedelta, date

# Offset subtracted from the raw 'Date' day count so that it becomes
# "days since start_date".
# NOTE(review): presumably the file stores a day-number serial - confirm the
# offset against the dataset documentation.
DATE_OFFSET_DAYS = 693963
start_date = date(1900, 1, 1)

# Replace the raw day count with the calendar month (1-12).
# The guard makes the cell safe to re-run: once converted, 'Date' holds month
# numbers (<= 12), and subtracting the offset a second time would push the
# dates out of range. Raw day counts are far larger than 12.
if hydro_data['Date'].max() > 12:
    days_since_start = pd.to_timedelta(hydro_data['Date'] - DATE_OFFSET_DAYS, unit='D')
    # Vectorised equivalent of (start_date + timedelta(days=x)).month per row;
    # cast back to int64 so the column dtype matches the row-wise version.
    hydro_data['Date'] = (pd.Timestamp(start_date) + days_since_start).dt.month.astype('int64')

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns that are rescaled to zero mean / unit variance (the target
# 'Observed' is included).
cols = ['Evapotranspiration', 'Precipitation', 'Irrigation_pumping', 'Observed']

# NOTE(review): the scaler is fitted on the full frame, i.e. before the
# train/test split performed later in the notebook.
scaler = StandardScaler()
scaler.fit(hydro_data[cols])
hydro_data[cols] = scaler.transform(hydro_data[cols])

In [None]:
# Preview the first five rows after the preceding transformations.
hydro_data.head(n=5)

In [None]:
# Bar chart of the mean 'Observed' value for each month.
month_flow = (
    hydro_data
    .groupby('Date', as_index=False)['Observed']
    .mean()
    .rename(columns={'Observed': 'mo_BF_mean'})
)
sns.barplot(data=month_flow, x='Date', y='mo_BF_mean')

In [None]:
# One row of three panels sharing the y-axis: 'Observed' against each
# predictor, using every row of the frame.
fig, axs = plt.subplots(nrows=1, ncols=3, sharey=True)
hydro_data.plot.scatter(x='Evapotranspiration', y='Observed', ax=axs[0], figsize=(16, 8))
hydro_data.plot.scatter(x='Precipitation', y='Observed', ax=axs[1])
hydro_data.plot.scatter(x='Irrigation_pumping', y='Observed', ax=axs[2])

In [None]:
# Draw the same three-panel scatter figure once per segment, titled with the
# segment id. sort=False keeps the segments in order of first appearance.
for seg, df in hydro_data.groupby('Segment_id', sort=False):
    fig, axs = plt.subplots(nrows=1, ncols=3, sharey=True)
    for panel, predictor in zip(axs, ('Evapotranspiration', 'Precipitation', 'Irrigation_pumping')):
        df.plot.scatter(x=predictor, y='Observed', ax=panel, figsize=(16, 8), title=seg)

In [None]:
# Per-month mean of each predictor and of 'Observed' (stored as BF_mean).
means = (
    hydro_data
    .groupby('Date')
    .agg(
        Evapotranspiration_mean=('Evapotranspiration', 'mean'),
        Precipitation_mean=('Precipitation', 'mean'),
        Irrigation_pumping_mean=('Irrigation_pumping', 'mean'),
        BF_mean=('Observed', 'mean'),
    )
    .reset_index()
)
means.head()

In [None]:
import matplotlib.pyplot as plt

# (column in `means`, line colour, legend label) for each monthly-mean curve,
# listed in drawing order.
monthly_lines = (
    ('Evapotranspiration_mean', 'r', 'Evapotranspiration'),
    ('Precipitation_mean', 'g', 'Precipitation'),
    ('Irrigation_pumping_mean', 'y', 'Irrigation_pumping'),
    ('BF_mean', 'b', 'BF'),
)
for column, colour, legend_label in monthly_lines:
    plt.plot(means['Date'], means[column], color=colour, label=legend_label)

plt.xlabel("Month")
plt.ylabel("Values")
plt.title("Values over months")
plt.legend()
plt.show()

In [None]:
# Rows that belong to segment 144 only.
hydro_data144 = hydro_data.loc[hydro_data['Segment_id'] == 144]

In [None]:
# Display the segment-144 subset as the cell output.
hydro_data144

In [None]:

# Pearson correlation (statistic and p-value) between 'Observed' and each
# predictor, restricted to segment 144.
for predictor in ('Evapotranspiration', 'Precipitation', 'Irrigation_pumping'):
    display(stats.pearsonr(hydro_data144['Observed'], hydro_data144[predictor]))

In [None]:
# Per-segment mean of each predictor and of 'Observed' (stored as BF_mean).
means_seg = (
    hydro_data
    .groupby('Segment_id')
    .agg(
        Evapotranspiration_mean=('Evapotranspiration', 'mean'),
        Precipitation_mean=('Precipitation', 'mean'),
        Irrigation_pumping_mean=('Irrigation_pumping', 'mean'),
        BF_mean=('Observed', 'mean'),
    )
    .reset_index()
)

means_seg.head()

In [None]:
#sns.scatterplot(data=hydro_data, x='Evapotranspiration', y='Observed')
# Scatter of 'Observed' against 'Evapotranspiration' over all rows, with a
# fitted regression line and no confidence band (ci=None).
# x and y are passed by keyword: recent seaborn releases make them
# keyword-only, so the positional form raises a TypeError there.
sns.regplot(data=hydro_data, x='Evapotranspiration', y='Observed', ci=None, marker='o', color='red',
            scatter_kws={'s': 10}, line_kws={'color': 'blue'})



In [None]:
# Segment 144: 'Observed' against 'Precipitation' with a fitted regression
# line and no confidence band (ci=None).
# x and y are passed by keyword: recent seaborn releases make them
# keyword-only, so the positional form raises a TypeError there.
sns.regplot(data=hydro_data144, x='Precipitation', y='Observed', ci=None, marker='o', color='red',
            scatter_kws={'s': 10}, line_kws={'color': 'blue'})

In [None]:
# Segment 144: 'Observed' against 'Irrigation_pumping' with a fitted
# regression line and seaborn's default confidence band.
# x and y are passed by keyword: recent seaborn releases make them
# keyword-only, so the positional form raises a TypeError there.
sns.regplot(data=hydro_data144, x='Irrigation_pumping', y='Observed', marker='o', color='red',
            scatter_kws={'s': 10}, line_kws={'color': 'blue'})

In [None]:
# Segment 144: 'Observed' against 'Evapotranspiration' with a fitted
# regression line and seaborn's default confidence band.
# x and y are passed by keyword: recent seaborn releases make them
# keyword-only, so the positional form raises a TypeError there.
sns.regplot(data=hydro_data144, x='Evapotranspiration', y='Observed', marker='o', color='red',
            scatter_kws={'s': 10}, line_kws={'color': 'blue'})

In [None]:
# Segment 144: 'Observed' against 'Precipitation' with a fitted regression
# line and seaborn's default confidence band.
# x and y are passed by keyword: recent seaborn releases make them
# keyword-only, so the positional form raises a TypeError there.
sns.regplot(data=hydro_data144, x='Precipitation', y='Observed', marker='o', color='red',
            scatter_kws={'s': 10}, line_kws={'color': 'blue'})

In [None]:
# Wide bar chart of the mean 'Observed' value (BF_mean) per segment.
# The trailing semicolon keeps the cell from echoing the Axes repr.
ax = plt.figure(figsize=(16, 8)).gca()
sns.barplot(x='Segment_id', y='BF_mean', data=means_seg, ax=ax);

In [None]:
# Order the segments from highest to lowest mean baseflow and renumber the
# rows. drop=True discards the old positional index instead of inserting it
# as a stray 'index' column, which also keeps the cell safe to re-run.
means_seg = means_seg.sort_values(by='BF_mean', ascending=False).reset_index(drop=True)

In [None]:
# Keep the segments whose mean baseflow is at least the average across the
# segments currently in means_seg.
means_seg = means_seg.loc[means_seg['BF_mean'].ge(means_seg['BF_mean'].mean())]

In [None]:
# Display the current contents of means_seg as the cell output.
means_seg

In [None]:
# hydro_data = hydro_data[hydro_data['Segment_id'].isin(means_seg['Segment_id'].tolist())]

In [None]:
# One-hot encode the segment id (columns prefixed 's_') and the month
# (columns prefixed 'd_'), then append both sets of indicators to the frame.
seg_hot = pd.get_dummies(hydro_data['Segment_id'], prefix='s')
seg_hot1 = pd.get_dummies(hydro_data['Date'], prefix='d')
hydro_data = hydro_data.join(seg_hot).join(seg_hot1)

In [None]:
# from sklearn.preprocessing import StandardScaler
# cols = ['Evapotranspiration', 'Precipitation', 'Irrigation_pumping', 'Observed']
# scaler = StandardScaler()
# hydro_data[cols] = scaler.fit_transform(hydro_data[cols])

In [None]:
# The raw 'Date' and 'Segment_id' columns are no longer needed once their
# indicator columns exist; remove them and preview the result.
hydro_data = hydro_data.drop(columns=['Date', 'Segment_id'])
hydro_data.head()

In [None]:
# List the columns currently present in hydro_data.
hydro_data.columns

In [None]:
import statsmodels.formula.api as smf

# Ordinary least squares of 'Observed' on the three continuous predictors.
lm = smf.ols(
    'Observed ~ Irrigation_pumping + Evapotranspiration + Precipitation',
    hydro_data,
).fit()

# Show the fitted coefficients first, then the full regression summary as
# the cell output.
display(lm.params)
lm.summary()

In [None]:
import statsmodels.formula.api as smf

# Two-predictor fit: 'Irrigation_pumping' is left out of this model.
lm = smf.ols(
    'Observed ~ Evapotranspiration + Precipitation ',
    hydro_data,
).fit()

# Show the full regression summary.
display(lm.summary())

In [None]:
import statsmodels.formula.api as smf

# OLS of 'Observed' on the three continuous predictors plus month indicators.
# d_1 (month 1) is deliberately left out as the reference month: the formula
# adds an intercept, and an intercept together with all twelve month
# indicators is perfectly collinear (the indicators sum to 1 on every row),
# which makes the individual coefficients and their p-values unidentifiable.
# Each remaining d_k coefficient is the offset of month k relative to month 1.
lm = smf.ols(
    formula=(
        'Observed ~ Irrigation_pumping + Precipitation + Evapotranspiration'
        ' + d_2 + d_3 + d_4 + d_5 + d_6 + d_7 + d_8 + d_9 + d_10 + d_11 + d_12'
    ),
    data=hydro_data,
).fit()

# Show the fitted coefficients first, then the full regression summary as
# the cell output.
display(lm.params)
lm.summary()

In [None]:
import statsmodels.formula.api as smf

# OLS of 'Observed' on the three continuous predictors plus indicators for
# five selected segments.
lm = smf.ols(
    'Observed ~ Irrigation_pumping + Precipitation + Evapotranspiration + s_188 + s_194 + s_205 + s_239 + s_256',
    hydro_data,
).fit()

# Show the fitted coefficients first, then the full regression summary as
# the cell output.
display(lm.params)
lm.summary()

In [None]:
import statsmodels.formula.api as smf

# OLS of 'Observed' on the three continuous predictors, indicators for five
# selected segments, and month indicators.
# d_1 (month 1) is deliberately left out as the reference month: the formula
# adds an intercept, and an intercept together with all twelve month
# indicators is perfectly collinear (the indicators sum to 1 on every row),
# which makes the individual coefficients and their p-values unidentifiable.
# Each remaining d_k coefficient is the offset of month k relative to month 1.
lm = smf.ols(
    formula=(
        'Observed ~ Irrigation_pumping + Precipitation + Evapotranspiration'
        ' + s_188 + s_194 + s_205 + s_239 + s_256'
        ' + d_2 + d_3 + d_4 + d_5 + d_6 + d_7 + d_8 + d_9 + d_10 + d_11 + d_12'
    ),
    data=hydro_data,
).fit()

# Show the fitted coefficients first, then the full regression summary as
# the cell output.
display(lm.params)
lm.summary()

In [None]:
# follow the usual sklearn pattern: import, instantiate, fit
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score

# Predictors: the three continuous variables, five segment indicators and the
# month indicators. d_1 (month 1) is left out as the reference month because
# LinearRegression fits an intercept by default, and an intercept plus all
# twelve month indicators is perfectly collinear; with d_1 included the
# printed coefficients are not uniquely determined. The fitted predictions
# and R^2 scores span the same model either way.
feature_cols = [
    'Irrigation_pumping', 'Precipitation', 'Evapotranspiration',
    's_188', 's_194', 's_205', 's_239', 's_256',
    'd_2', 'd_3', 'd_4', 'd_5', 'd_6', 'd_7', 'd_8', 'd_9', 'd_10', 'd_11', 'd_12',
]
X = hydro_data[feature_cols]
y = hydro_data.Observed

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
lm = LinearRegression()
lm.fit(X_train, y_train)

# print intercept and coefficients (coefficients follow feature_cols order),
# then R^2 on the training and held-out rows
print(lm.intercept_)
print(lm.coef_)
print(lm.score(X_train, y_train))
print(lm.score(X_test, y_test))