<a href="https://colab.research.google.com/github/9characters/Artificial-Intelligence/blob/master/Linear_Regression_Annual_Discharge_SA_River.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Libraries

In [2]:
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
import plotly.graph_objects as go
# To install the package: pip install plotly
# P.S. Don't run the line if interactive plot below is not required

# Reading and processing the dataset

In [19]:
# Load the daily discharge file: the first 37 lines are skipped, fields are
# ';'-separated, and only columns 0 and 2 are kept (used below as "Date" and "Value").
orig_dataset = pd.read_csv(
    "Q_Day_San_Antonio_River.csv",
    sep=";",
    skiprows=37,
    names=["Date", "Time", "Value"],
    usecols=[0, 2],
)

In [20]:
# Rows whose Value equals -999.000 (presumably a missing-data marker) are
# removed from orig_dataset in place.
is_placeholder = orig_dataset['Value'].eq(-999.000)
drop_indices = orig_dataset.index[is_placeholder.to_numpy()]
orig_dataset.drop(index=drop_indices, inplace=True)

In [21]:
# Build a separate frame whose 'Date' column is parsed to datetimes;
# orig_dataset itself is left untouched.
dataset = orig_dataset.assign(Date=pd.to_datetime(orig_dataset['Date']))

# Linear Regression Analysis with respect to Mean Annual Discharge (MQ)

In [None]:
# Yearly mean discharge (MQ): resample by year on 'Date', turn the bin label
# into a plain year number, call that column "Year", and drop empty years.
MQ = (
    dataset.resample('Y', on='Date')
    .mean()
    .reset_index()
    .assign(Date=lambda yearly: pd.DatetimeIndex(yearly['Date']).year)
    .rename(columns={"Date": "Year"})
    .dropna()
)

In [None]:
# Column 0 of MQ (Year) is the independent variable, column 1 the dependent one.
# X gets a trailing axis so it is a 2-D (n_years, 1) array for scikit-learn.
y = MQ.iloc[:, 1].values
X = MQ.iloc[:, 0].values[:, np.newaxis]

In [None]:
# Fit a straight line (Year -> MQ) with scikit-learn and evaluate it on the
# same years it was trained on.
# fit() returns the estimator itself, so MQ_history is the fitted MQ_LR_model.
MQ_LR_model = LinearRegression()
MQ_history = MQ_LR_model.fit(X=X, y=y)
y_pred = MQ_LR_model.predict(X=X)

In [None]:
# Report the fitted slope and intercept, rounded to three decimals
slope, intercept = MQ_history.coef_[0], MQ_history.intercept_
print(f"Coefficient: {round(slope,3)}")
print(f"Intercept: {round(intercept,3)}")

# Observed yearly values as blue points with the fitted line drawn in red;
# the figure is written to MQ_Curve.png before being displayed.
fig, ax = plt.subplots()
ax.scatter(X, y, color='blue')
ax.plot(X, y_pred, color='red')
ax.set_title("Mean Annual Discharge(MQ) of San Antonio River")
ax.set_xlabel("Year")
ax.set_ylabel("Discharge 'Cum/s' ")
fig.savefig('MQ_Curve.png', dpi=150)
plt.show()

In [None]:
# Interactive scatter: actual MQ on the x axis, LR-predicted MQ on the y axis.
# Each marker is an (Actual, Predicted) pair and carries its year as text.
fig = go.Figure(
    data=[go.Scatter(x=y, y=y_pred, mode='markers', text=X[:, 0])]
)
fig.update_layout(title_text="Mean Annual Discharge")
fig.show()

In [None]:
# Write year, actual MQ and predicted MQ (both rounded to 3 decimals) to a CSV file
df = pd.DataFrame(
    {
        "Year": X.flatten().astype(int),
        "Actual MQ": np.round(y, 3),
        "Predicted MQ": np.round(y_pred, 3),
    }
)
df.to_csv("MQ Actual vs Predicted.csv", index=False)

# Linear Regression Analysis with respect to Maximum Annual Discharge (maxQ)

In [None]:
# Processing the dataset with respect to maxQ
# Only the 'Value' column is aggregated and the resample bin label is kept as
# the 'Date' column. The previous reset_index(drop="True") threw that label
# away, so the year lookup below depended on pandas also returning the `on`
# column inside the aggregate (and raised KeyError when it did not).
maxQ = dataset.resample('Y', on='Date')['Value'].max().reset_index()
# Replace each yearly bin label by its year number
maxQ['Date'] = pd.DatetimeIndex(maxQ['Date']).year
maxQ.rename(columns = {"Date": "Year"}, inplace = True)
# Years without any data have a NaN maximum and are removed
maxQ = maxQ.dropna()

In [None]:
# Independent variable X: first column of maxQ (Year), kept 2-D with shape (n, 1).
# Dependent variable y: second column of maxQ (the yearly maxima).
X = maxQ.iloc[:, [0]].values
y = maxQ.iloc[:, 1].values

In [None]:
# Linear regression of maxQ on Year, predicted back on the training years.
# maxQ_history is whatever fit() returns, i.e. the fitted estimator itself.
maxQ_LR_model = LinearRegression()
maxQ_history = maxQ_LR_model.fit(X=X, y=y)
y_pred = maxQ_LR_model.predict(X=X)

In [None]:
# Display Coefficient and Intercept of the maxQ regression (3 decimals)
print(f"Coefficient: {round(maxQ_history.coef_[0],3)}")
print(f"Intercept: {round(maxQ_history.intercept_,3)}")

# Scatter-Line Plot: observed yearly maxima (blue) and the fitted line (red)
plt.scatter(X, y, color='blue')
plt.plot(X, y_pred, color='red')
plt.title("Maximum Annual Discharge(maxQ) of San Antonio River")
plt.xlabel("Year")
# Axis label spelling fixed ("Dishcharge" -> "Discharge") to match the MQ plot
plt.ylabel("Discharge 'Cum/s' ")
# Save before show() so the written PNG contains the drawn figure
plt.savefig('maxQ_Curve.png', dpi=150)
plt.show()

In [None]:
# Interactive plot of actual maxQ (x) versus LR-predicted maxQ (y).
# Every marker is one (Actual, Predicted) pair labelled with its year.
actual_vs_predicted = go.Scatter(x=y, y=y_pred, mode='markers', text=X[:, 0])
fig = go.Figure()
fig.add_trace(actual_vs_predicted)
fig.update_layout(title_text="Maximum Annual Discharge")
fig.show()

In [None]:
# Export year, actual maxQ and predicted maxQ (3-decimal rounding) as CSV
export_columns = {
    "Year": X.flatten().astype(int),
    "Actual maxQ": np.round(y, 3),
    "Predicted maxQ": np.round(y_pred, 3),
}
df = pd.DataFrame(export_columns)
df.to_csv("maxQ Actual vs Predicted.csv", index=False)

# Linear Regression Analysis with respect to Minimum Annual Discharge (minQ)

In [None]:
# Processing the dataset with respect to minQ
# Only the 'Value' column is aggregated and the resample bin label is kept as
# the 'Date' column. The previous reset_index(drop="True") threw that label
# away, so the year lookup below depended on pandas also returning the `on`
# column inside the aggregate (and raised KeyError when it did not).
minQ = dataset.resample('Y', on='Date')['Value'].min().reset_index()
# Replace each yearly bin label by its year number
minQ['Date'] = pd.DatetimeIndex(minQ['Date']).year
minQ.rename(columns = {"Date": "Year"}, inplace = True)
# Years without any data have a NaN minimum and are removed
minQ = minQ.dropna()

In [None]:
# y: yearly minima (second column of minQ); X: the years (first column),
# reshaped into a single-feature column matrix.
year_values = minQ.iloc[:, 0].values
y = minQ.iloc[:, 1].values
X = year_values.reshape(-1, 1)

In [None]:
# Linear regression of minQ on Year; predictions are made for the same years.
# fit() returns the estimator, so minQ_history refers to the fitted model.
minQ_LR_model = LinearRegression()
minQ_history = minQ_LR_model.fit(X=X, y=y)
y_pred = minQ_LR_model.predict(X=X)

In [None]:
# Display Coefficient and Intercept of the minQ regression (3 decimals)
print(f"Coefficient: {round(minQ_history.coef_[0],3)}")
print(f"Intercept: {round(minQ_history.intercept_,3)}")

# Scatter-Line Plot: observed yearly minima (blue) and the fitted line (red)
plt.scatter(X, y, color='blue')
plt.plot(X, y_pred, color='red')
# Title and axis label spelling fixed ("Minumum", "Dishcharge")
plt.title("Minimum Annual Discharge(minQ) of San Antonio River")
plt.xlabel("Year")
plt.ylabel("Discharge 'Cum/s' ")
# Save before show() so the written PNG contains the drawn figure
plt.savefig('minQ_Curve.png', dpi=150)
plt.show()

In [None]:
'''
Interactive Plot showing the actual minQ and predicted minQ using LR in the given year
Format: (Actual, Predicted)
         Year
'''
# x axis: actual minQ, y axis: predicted minQ, marker text: the year
fig = go.Figure()
fig.add_trace(go.Scatter(x=y, y=y_pred, mode='markers', text=X[:,0]))
# Title was the placeholder "LR"; named like the MQ and maxQ interactive plots
fig.update_layout(title_text = "Minimum Annual Discharge")
fig.show()

In [None]:
# Store year, actual minQ and predicted minQ (rounded to 3 decimals) in a CSV file
column_names = ["Year", "Actual minQ", "Predicted minQ"]
column_data = [X.flatten().astype(int), np.round(y, 3), np.round(y_pred, 3)]
df = pd.DataFrame(dict(zip(column_names, column_data)))
df.to_csv("minQ Actual vs Predicted.csv", index=False)

# To get the date of maximum flow

In [None]:
import pandas as pd
import numpy as np  # was aliased as `npf`, which left `np` dependent on an earlier cell

# Reading the dataset
orig_dataset = pd.read_csv("Q_Day_San_Antonio_River.csv", skiprows=37, names=["Date", "Time", "Value"], delimiter=";", usecols=[0,2])

# Removing the rows with Values: -999.000
drop_indices = orig_dataset[orig_dataset['Value'] == -999.000].index
orig_dataset.drop(drop_indices, inplace = True)

# Daily records without NaN rows. 'Date' stays in its original string form so
# the dates written to the output CSV look exactly like the input.
daily = orig_dataset.dropna()

# Plain Python lists of the daily values and their date strings
values = list(daily['Value'])
dates = list(daily['Date'])

# Yearly means are computed only to find out which years still contain data
# (some years might be eliminated while removing -999.000)
test = daily.copy()
test['Date'] = pd.to_datetime(test['Date'])
test = test.resample('Y', on='Date').mean().reset_index()
test['Date'] = pd.DatetimeIndex(test['Date']).year
test.rename(columns = {"Date": "Year"}, inplace = True)
test = test.dropna()

# Converting into python list
years = list(test['Year'])

# Year of every daily row, taken from the text before the first '-'
# (assumes the date strings start with the year, e.g. YYYY-MM-DD)
row_years = daily['Date'].str.split('-').str[0].astype(int)

# Row label of the first maximum 'Value' within each year, found in one grouped
# pass instead of rescanning every daily row once per year
peak_labels = daily.groupby(row_years)['Value'].idxmax()

# List to store the (max_date, max_value) combination
data_list = list()

# Looping over the years
for year in years:

    # Look up the date and value of that year's maximum
    peak = peak_labels.loc[year]
    data_list.append((daily.at[peak, 'Date'], daily.at[peak, 'Value']))
    print(f"Year {year}: Done")

# Creating a pandas dataframe and adding the data_list into a CSV File
max_df = pd.DataFrame(data_list, columns=['Max Date', 'Max Value'])
max_df.to_csv("Date with max Values.csv", index=False)

# To get the number of days per year exceeding MHQ or falling below MLQ

First, we need to find the MHQ and MLQ.

MHQ: Mean of the annual maximum discharge values over the time series

MLQ: Mean of the annual minimum discharge values over the time series

In [None]:
import pandas as pd
import numpy as np

orig_dataset = pd.read_csv("Q_Day_San_Antonio_River.csv", skiprows=37, names=["Date", "Time", "Value"], delimiter=";", usecols=[0,2])

# Removing the rows with Values: -999.000
drop_indices = orig_dataset[orig_dataset['Value'] == -999.000].index
orig_dataset.drop(drop_indices, inplace = True)

# Daily records (string dates, NaN rows removed) used for the per-day counting
# below. They are built here so this cell no longer relies on the `dates` and
# `values` lists left behind by a previously executed cell.
daily = orig_dataset.dropna()

# Making the copy of the original dataset
test = orig_dataset.copy()

# Converting to pandas understandable datetime format
test['Date'] = pd.to_datetime(test['Date'])
test = test.dropna()

# Getting the maximum and minimum values for each year
# (drop is passed as a real boolean instead of the string "True")
maxQ = (test.resample('Y', on='Date').max().reset_index(drop=True)).dropna()
minQ = (test.resample('Y', on='Date').min().reset_index(drop=True)).dropna()

# Getting the MHQ and MLQ values: means of the yearly maxima / minima
MHQ = np.mean(maxQ['Value'])
MLQ = np.mean(minQ['Value'])

# Now we also need years to compare so extracting the years -> Some years might be eliminated while removing -999.000
# ('Date' is already datetime at this point, so it is not converted again)
test = test.resample('Y', on='Date').mean().reset_index()
test['Date'] = pd.DatetimeIndex(test['Date']).year
test.rename(columns = {"Date": "Year"}, inplace = True)
test = test.dropna()

# Converting into python list
years = list(test['Year'])

# Year of every daily row, taken from the text before the first '-'
# (assumes the date strings start with the year, e.g. YYYY-MM-DD)
row_years = daily['Date'].str.split('-').str[0].astype(int)

# True for days whose value lies above MHQ or below MLQ
outside_range = (daily['Value'] > MHQ) | (daily['Value'] < MLQ)

# Number of such days per year, computed in one grouped pass
days_per_year = outside_range.groupby(row_years).sum()

# List to store the (Year, No of days) combination
data_list = list()

# Looping over the years
for year in years:

    # A year without any matching daily rows counts as 0
    counter = int(days_per_year.get(year, 0))

    data_list.append((year, counter))
    print(f"Year {year}: Done")

# Creating a pandas dataframe and adding the data_list into a CSV File
max_df = pd.DataFrame(data_list, columns=['Year', 'No of Days'])
max_df.to_csv("MHQ MLQ range count.csv", index=False)