# Exercises

Here the dataset: bit.ly/3C8JzrM

#### Ex. 1
#### Perform a simple linear regression to find the $m$ and $b$ values that minimizes the loss (sum of squares).



In [11]:
import pandas as pd
from sklearn.linear_model import LinearRegression

# DF
df = pd.read_csv("https://bit.ly/3C8JzrM", delimiter=",", header=0)
df.head()

# Extract input vars - all rows and the first column
X = df.values[:, :-1]
# Extract output - all rows and the last column
Y = df.values[:, -1]

print("X:", X)
print("\n")
print("Y:",Y)

# Istantiate the linear regression
fit = LinearRegression().fit(X,Y)

# Results
m = fit.coef_.flatten()  # Slope
b = fit.intercept_.flatten()  # Intercept
print("Slope:", m)
print("Intercept:", b)

X: [[ 1.]
 [ 2.]
 [ 3.]
 [ 4.]
 [ 5.]
 [ 6.]
 [ 7.]
 [ 8.]
 [ 9.]
 [10.]
 [11.]
 [12.]
 [13.]
 [14.]
 [15.]
 [16.]
 [17.]
 [18.]
 [19.]
 [20.]
 [21.]
 [22.]
 [23.]
 [24.]
 [25.]
 [26.]
 [27.]
 [28.]
 [29.]
 [30.]
 [31.]
 [32.]
 [33.]
 [34.]
 [35.]
 [36.]
 [37.]
 [38.]
 [39.]
 [40.]
 [41.]
 [42.]
 [43.]
 [44.]
 [45.]
 [46.]
 [47.]
 [48.]
 [49.]
 [50.]
 [51.]
 [52.]
 [53.]
 [54.]
 [55.]
 [56.]
 [57.]
 [58.]
 [59.]
 [60.]
 [61.]
 [62.]
 [63.]
 [64.]
 [65.]
 [66.]
 [67.]
 [68.]
 [69.]
 [70.]
 [71.]
 [72.]
 [73.]
 [74.]
 [75.]
 [76.]
 [77.]
 [78.]
 [79.]
 [80.]
 [81.]
 [82.]
 [83.]
 [84.]
 [85.]
 [86.]
 [87.]
 [88.]
 [89.]
 [90.]
 [91.]
 [92.]
 [93.]
 [94.]
 [95.]
 [96.]
 [97.]
 [98.]
 [99.]]


Y: [-13.11584315  25.80654738  -5.01728537  20.25641536   4.07500348
  -3.53026008  24.04599852  22.11256639   5.96859103  43.39233851
  32.22464253  14.66614212  17.9661405   -2.75471825  25.15684021
  20.18287021  22.28192946  16.75744707  54.21957466  60.56415065
  84.36604948  51.24113209   6.359

#### Ex. 2
#### Calculate the correlation coefficient and statistical significance of this data (95% confidence). Is the correlation useful?

In [40]:
import pandas as pd
from scipy.stats import t
from numpy import sqrt

# DF
df = pd.read_csv("https://bit.ly/3C8JzrM", delimiter=",", header=0)
df.head()

# Positive and strong correlation
correlations = df.corr()
print(correlations)
r = 0.92421

# Statistical significance - if our t value is outside the critical 
# value range we reject H_0 meaning the correlation did not happen by chance
n = len(df)
lower_cv = t(n-2).ppf(.025)
higher_cv = t(n-2).ppf(.975)
# T-test value
t_test_value = r / sqrt((1- r**2) / (n - 2))
# Result
if t_test_value > higher_cv or t_test_value < lower_cv:
    print("We reject H_0.")
else:
    print("We cannot reject H_0.")

         x        y
x  1.00000  0.92421
y  0.92421  1.00000
We reject H_0.


#### Ex. 3
#### If I predict where $x = 50$, what is the 95% prediction interval for the predicted value of y?

We just need to add/subtract the $margin$ of $error$ to $y$ $hat$:
- $E = t_{0.25} * SEE * \sqrt{1 + \dfrac{1}{n} + \dfrac{n(x_0 - \bar{x})^2}{n(\sum{x^2}) - (\sum{x^2})}}$

In [47]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from scipy.stats import t
from math import sqrt

# DF
df = pd.read_csv("https://bit.ly/3C8JzrM", delimiter=",", header=0)
points = list(pd.read_csv("https://bit.ly/3C8JzrM", delimiter=",").itertuples())

# Extract input vars - all rows and the first column
X = df.values[:, :-1]
# Extract output - all rows and the last column
Y = df.values[:, -1]

# Linear regression
fit = LinearRegression().fit(X,Y)
m = fit.coef_.flatten()  # Slope
b = fit.intercept_.flatten()  # Intercept

# y_hat
n = len(points)
x_mean = sum(point.x for point in points) / n
x_0 = 50
y_hat = m*x_0 + b
print("y_hat:", y_hat)

# t value at 0.025
t_value = t(n-2).ppf(0.025)

# Margin of error
E = t_value * SEE * \
    sqrt(1 + (1 / n) + (n*(x_0 - x_mean)**2 / \
    (n*sum(point.x**2 for point in points) - \
    sum(point.x for point in points)**2)))

# SEE
SEE = sqrt((sum((point.y - (m*point.x + b))**2 for point in points)) / (n - 2))

# E, margin of error
E = t_value * SEE * \
    sqrt(1 + (1 / n) + (n*(x_0 - x_mean)**2 / \
    (n*sum(point.x**2 for point in points) - \
    sum(point.x for point in points)**2)))

# CI 95% around y_hat
lower_CI = y_hat - E
higher_CI = y_hat + E
print(lower_CI, higher_CI)

y_hat: [92.65325395]
[134.5144215] [50.7920864]


#### Ex. 4
#### Start your regression over and do a train/test split. Feel free to experiment with cross-validation and random-fold validation. Does the linear regression perform well and consistently on the testing data? Why or why not?

In [46]:
# Simple train/test split
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score

# DF
df = pd.read_csv("https://bit.ly/3C8JzrM", delimiter=",", header=0)

# Extract input vars - all rows and the first column
X = df.values[:, :-1]
# Extract output - all rows and the last column
Y = df.values[:, -1]

# Train 8/10, test 2/10
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=1/5)

# Set up model
model = LinearRegression()
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("R^2:", result)

R^2: 0.8440626048414025


In [50]:
# Three-fold cross-validatiion for a linear regression
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score

# DF
df = pd.read_csv("https://bit.ly/3C8JzrM", delimiter=",", header=0)

# Extract input vars - all rows and the first column
X = df.values[:, :-1]
# Extract output - all rows and the last column
Y = df.values[:, -1]

# Set up model with 3 fold cv and perform linear regression with MSE and STD metrics
kfold = KFold(n_splits=3, random_state=7, shuffle=True)
model = LinearRegression()
results = cross_val_score(model, X, Y, cv=kfold)
print("MSE:", results.mean(), "STD:", results.std())

MSE: 0.8336375693565597 STD: 0.03628076503574302


In [49]:
# Random fold cross-validatiion for a linear regression
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import ShuffleSplit, cross_val_score

# DF
df = pd.read_csv("https://bit.ly/3C8JzrM", delimiter=",", header=0)

# Extract input vars - all rows and the first column
X = df.values[:, :-1]
# Extract output - all rows and the last column
Y = df.values[:, -1]

# Set up model and perform linear regression with MSE and STD metrics
# 100 iterations, each iterations has a test data of 1/3
kfold = ShuffleSplit(n_splits=100, test_size=.33, random_state=7)
model = LinearRegression()
results = cross_val_score(model, X, Y, cv=kfold)
print("MSE:", results.mean(), "STD:", results.std())

MSE: 0.8373221715266962 STD: 0.03777483738122684
