# A Brief Introduction to Python
## Infrastrukturen Engineering
## HafenCity Universität Hamburg (HCU)
### The University Of The Built Environment And Metropolitan Development

## Modul: BIW-M-211-100 Energie-Infrastruktur
## Instructors:
### Prof. Dr.-Ing. Ingo Weidlich
### Pakdad Pourbozorgi Langroudi

In [None]:
# Detect whether this notebook runs on Google Colab: the `google.colab`
# package can only be imported there.
try:
  from google.colab import drive
except ImportError:
  # Only a missing package means "not on Colab"; a bare `except:` would also
  # hide unrelated problems such as KeyboardInterrupt.
  IN_COLAB = False
else:
  IN_COLAB = True

# Colab-only setup: mount Google Drive, move into the course folder and
# fetch (or update) the course repository. Nothing below runs outside Colab.
if IN_COLAB:
  print("We're running Colab")

if IN_COLAB:
  # Mount the Google Drive at mount
  mount='/content/gdrive'
  print("Colab: mounting Google drive on ", mount)

  drive.mount(mount)

  # Switch to the directory on the Google Drive that you want to use
  import os
  drive_root = mount + "/MyDrive/MyCourses"
  
  # Create drive_root if it doesn't exist
  create_drive_root = True
  if create_drive_root:
    print("\nColab: making sure ", drive_root, " exists.")
    os.makedirs(drive_root, exist_ok=True)
  
  # Change to the directory
  # (%cd is an IPython magic and `!` runs a shell command; both only work
  # inside a notebook, not in a plain Python script.)
  print("\nColab: Changing directory to ", drive_root)
  %cd $drive_root
  if os.path.isdir(drive_root+'/Energieinfrastruktur'):
    # The repository folder already exists: enter it and update it.
    drive_root = drive_root+'/Energieinfrastruktur'
    %cd $drive_root
    ! git pull
  else:
    # First run: clone the repository, then enter the new folder.
    ! git clone https://github.com/pakdad/Energieinfrastruktur.git
    drive_root = drive_root+'/Energieinfrastruktur'
    %cd $drive_root

# Regression Analysis
## Using Numpy, Pandas, Matplotlib, and Sklearn Libraries

In [None]:
# Import the necessary libraries.
# Importing numpy as np, pandas as pd and pyplot as plt is a convention,
# and it is recommended to keep these aliases in your own code.
from statistics import mean
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style

# Use the 'fivethirtyeight' style sheet for the plots in this notebook.
style.use('fivethirtyeight')
#%matplotlib inline

In [None]:
# Read our data file with pandas; the first CSV column becomes the row index.
df = pd.read_csv('Data/regression.csv', index_col=0)
df.head() # the head method returns the first five rows of the dataframe
# To see the last five rows, the tail method can be used instead:
#df.tail()
# To see a different number of rows, pass that number to head or tail, e.g.:
#df.head(10)

In [None]:
# The built-in function len returns the number of items in a list, array,
# dataframe, ...; for a dataframe that is the number of rows.
len(df)

In [None]:
# Show summary statistics that describe the dataframe's columns.
df.describe()

In [None]:
# Print a concise summary of the dataframe (columns, non-null counts, dtypes).
df.info()

The Linear Equation Is:
Mathematically, a linear relationship is one that satisfies the equation:

\begin{aligned} &y = mx + b \\ &\textbf{where:}\\ &m=\text{slope}\\ &b=\text{y-intercept}\\ \end{aligned}

“x” and “y” are two variables which are related by the parameters “m” and “b”. Graphically, y = mx + b plots in the x-y plane as a line with slope “m” and y-intercept “b.” The y-intercept “b” is simply the value of “y” when x=0. The slope “m” is calculated from any two individual points (x1, y1) and (x2, y2) as:

\begin{aligned}m = \frac{(y_2 - y_1)}{(x_2 - x_1)}
\end{aligned}

In [None]:
# PEMDAS is the mnemonic for the order in which arithmetic operators are
# evaluated. A bare, undefined name would raise NameError when the cell runs,
# so the mnemonic is stored as a string and shown as the cell output.
PEMDAS = "Parentheses, Exponents, Multiplication/Division, Addition/Subtraction"
PEMDAS

In [None]:
# Multiplication is evaluated before subtraction, so this prints 5 - (2*3) = -1.
print(5-2*3)

In [None]:
# A small sample list used to try out the mean function.
list_1 = [1, 2, 3, 5]

In [None]:
# mean (imported from the statistics module) returns the arithmetic mean.
mean(list_1)

In [None]:
def best_fit_slope(xs, ys):
    ''' Fit the least-squares regression line y = m*x + b through the points.

    Parameters
    ----------
    xs, ys : array-like of numbers (list, numpy array, pandas Series, ...)
        One-dimensional x and y values of equal length. Values are paired
        by position.

    Returns
    -------
    (m, b) : the slope and the y-intercept of the fitted line.

    Raises
    ------
    ValueError
        If the inputs are empty, not one-dimensional, of different lengths,
        or if all x values are identical (the slope is undefined then).
    '''
    # Converting to float arrays lets plain lists work too and keeps the
    # arithmetic vectorized instead of looping in pure Python.
    x = np.asarray(xs, dtype=float)
    y = np.asarray(ys, dtype=float)

    if x.ndim != 1 or x.shape != y.shape:
        raise ValueError("xs and ys must be one-dimensional and of equal length")
    if x.size == 0:
        raise ValueError("xs and ys must not be empty")
    if x.max() == x.min():
        # Zero spread in x makes the denominator below zero.
        raise ValueError("all xs are identical; the slope is undefined")

    x_mean = x.mean()
    y_mean = y.mean()
    # m = (mean(x)*mean(y) - mean(x*y)) / (mean(x)^2 - mean(x^2))
    m = (x_mean * y_mean - (x * y).mean()) / (x_mean**2 - (x**2).mean())
    # The fitted line always passes through the point (mean(x), mean(y)).
    b = y_mean - m * x_mean
    return m, b

def se(ys, regression):
    ''' Return the sum of squared errors between ys and regression.

    Parameters
    ----------
    ys : array-like of numbers
        The observed values.
    regression : array-like of numbers, or a single number
        The predicted values, paired with ys by position. A single number
        is compared against every observation.

    Returns
    -------
    The sum over all points of (ys - regression)**2.
    '''
    residuals = np.asarray(ys, dtype=float) - np.asarray(regression, dtype=float)
    return np.sum(residuals**2)


def cod(ys, regression):
    ''' Return the coefficient of determination (r^2) of a regression.

    r^2 = 1 - SE(regression) / SE(mean of ys): the share of the variation in
    ys that the regression explains. 1 means a perfect fit, 0 means the fit
    is no better than always predicting the mean of ys.

    Parameters
    ----------
    ys : array-like of numbers
        The observed values.
    regression : array-like of numbers
        The predicted values, paired with ys by position.

    Raises
    ------
    ValueError
        If ys is empty or all ys are identical (r^2 is undefined then).
    '''
    observed = np.asarray(ys, dtype=float)
    if observed.size == 0:
        raise ValueError("ys must not be empty")

    reg_se = se(observed, regression)
    # The mean is computed once and compared against every observation,
    # instead of being recomputed for each element.
    ys_se = se(observed, observed.mean())
    if ys_se == 0:
        raise ValueError("all ys are identical; r^2 is undefined")
    return 1 - (reg_se / ys_se)

In [None]:
# find the slope and intecept of regression line
m, b = best_fit_slope(df['xs'], df['ys'])
regression = m*df['xs']+b

In [None]:
# Visualize our data and regression line
plt.scatter(df['xs'], df['ys'])
plt.plot(df['xs'], regression, )
plt.xlabel('Temperature (°C)')
plt.ylabel('Service-Life (Year)')
plt.show()

In [None]:
# Display the slope of the fitted line.
m

In [None]:
# Display the intercept of the fitted line.
b

In [None]:
# predict a service life of a pipe if we operate it on 95°c
predict_x = 95
predict_y = (m*predict_x)+b
print(predict_y) # our prediction
print(cod(df['ys'], regression)) # the accuracy

In [None]:
# Visualize our data and regression line including the predicted point
plt.scatter(df['xs'], df['ys'])
plt.scatter(predict_x, predict_y, s=100, color='red')
plt.plot(df['xs'], regression)
plt.xlabel('Temperature (°C)')
plt.ylabel('Year')
plt.show()

In [None]:
# we can check the standard deviation
np.std(df['ys'])

The Formula for Multiple Linear Regression Is
\begin{aligned} &y_i = \beta_0 + \beta _1 x_{i1} + \beta _2 x_{i2} + ... + \beta _p x_{ip} + \epsilon\\ &\textbf{where, for } i = n \textbf{ observations:}\\ &y_i=\text{dependent variable}\\ &x_i=\text{explanatory variables}\\ &\beta_0=\text{y-intercept (constant term)}\\ &\beta_p=\text{slope coefficients for each explanatory variable}\\ &\epsilon=\text{the model's error term (also known as the residuals)}\\ \end{aligned}

In [None]:
# reading a dataframe for Multiple Linear Regression
df = pd.read_csv('Data/multi_regression.csv', delimiter=",")
df

In [None]:
# drop the unnamed: 0 column
df = df.drop(['Unnamed: 0'], axis=1)
df.head()

In [None]:
# check the linearity with our defined single linearity
# regression function for the columns age and temperature
m, b = best_fit_slope(df['Age'], df['Temperature'])
regression = m*df['Age']+b

In [None]:
# visualize our regression fit
plt.scatter(df['Age'], df['Temperature'])
plt.plot(df['Age'], regression)
plt.xlabel('Age')
plt.ylabel('Temperature (°C)')
plt.show()

In [None]:
# we can apply the same principle for the columns age and flow rate
m, b = best_fit_slope(df['Age'], df['Flow_Rate'])
regression = m*df['Age']+b

In [None]:
# visualize our regression fit
plt.scatter(df['Age'], df['Flow_Rate'])
plt.plot(df['Age'], regression)
plt.xlabel('Age')
plt.ylabel('Flow Rate')
plt.show()

In [None]:
# read two more data frames
df2 = pd.read_csv('Data/multi_regression2.csv', delimiter=",")
df3 = pd.read_csv('Data/multi_regression3.csv', delimiter=",")
# concatanate the dataframes to form a single large dataframe
frames = [df, df2, df3]
df = pd.concat(frames)
df.describe() # this method describes the dataframe characteristics

In [None]:
# Preview the first five rows of the combined dataframe.
df.head()

In [None]:
# drop the non relevant columns and shuffle it to imitate a real-world dataset
df = df.drop(['Unnamed: 0'], axis=1)
df = df.sample(frac = 1).reset_index()
df = df.drop(['index'], axis=1)
df.head()

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# Features are X and lables are y
X = np.array(df.drop(['Age'], 1))
y = np.array(df['Age'])

X = preprocessing.scale(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

clf = LinearRegression()
clf.fit(X_train, y_train)
accuracy = round(clf.score(X_test, y_test), 4)*100
print(f"The accuracy of the trained model with LinearRegression is : {accuracy}%")


In [None]:
# Predict a value for every row in X with the fitted model.
forecast_set = clf.predict(X)
#print(forecast_set, accuracy)
# Store the predictions next to the data in a new 'Forecast' column.
df = df.assign(Forecast = forecast_set)

In [None]:
# Display the dataframe, which now includes the Forecast column.
df