# Unit 8 Linear Regression
## Activity: Fitting a linear regression line using an intuitive visual approach.


This script allows you to interact with the figures and adjust the intercept and slope
of the line. 

Example data are taken from Collaborative Statistics Book Fig. 12.7:

**Third exam scores and final exam scores**

In [None]:
# -*- coding: utf-8 -*-
"""

@author: Oliver Elison Timm

History:
2022-04-27 removed all interactive widgets and converted into 
           regular notebook activity. 

"""

import numpy as np
import matplotlib.pyplot as plt
plt.style.use("default")
from scipy.stats import linregress
import pandas as pd
%matplotlib inline

### Example data: Third exam scores and final exam scores

In [None]:
##############################################################
# Example data set: one (third exam, final exam) score pair
# per student. x collects the third exam scores, y the final
# exam scores, in the same student order.
##############################################################
exam_scores = [
    (65, 175), (67, 133), (71, 185), (71, 163), (66, 126), (75, 198),
    (67, 153), (70, 163), (71, 159), (69, 151), (69, 159),
]
third_exam, final_exam = zip(*exam_scores)
x = np.array(third_exam)
y = np.array(final_exam)


    

In [None]:
def SSE(x,y,slope,intercept=0):
    """Calculate the sum of squared errors (SSE) for a guessed regression line.

    The guessed line is ``yhat = intercept + slope*x``; the SSE is the sum
    of the squared differences ``(y - yhat)**2`` over all data points.

    Input parameter:
        x,y (1d array-like):
            the data points; numpy arrays, lists and tuples are accepted
            and converted to float arrays
        slope (float):
            real value for the regression line slope

        intercept (float): intercept of the line (default 0)
    Returns:
        The SSE value (float); 0.0 for empty input
    """
    # np.asarray lets plain Python sequences work too: with a list,
    # slope*x would otherwise fail or repeat the list instead of scaling it.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    error = y - (intercept + slope*x)
    # vectorized sum of squares instead of an element-by-element loop
    return float(np.sum(error**2))


   

## Task 1: Enter slope and intercept and calculate the fitted line

#### Calculates the SSE

Which line looks like the best fit? Adjust the intercept and slope until the line visually fits the data best. Here we work with the raw (uncentered) scores; further below we will work with centered data.


### Enter your guess values for the line parameters: intercept (a) and slope (b)


#### Example first-guess values: y = 50 + 1.5*x


In [None]:
def _read_float(prompt):
    """Ask for a number with input() and repeat until float() can parse it."""
    while True:
        try:
            return float(input(prompt))
        except ValueError:
            # a typo such as "1,5" should not abort the cell with a traceback
            print("Please enter a number, for example 1.5")


# guessed line parameters for Task 1: intercept a and slope b
a = _read_float("Enter intercept a =")
b = _read_float("Enter slope b = ")

In [None]:
# final exam scores predicted by the guessed line (x holds the third exam scores)
yguess = a + b*x
# collect data information to show as data table
print("Estimated final grades with regression line")
# '.2f' prints two decimals for both parameters; a bare '.2' would mean
# two significant digits and can switch to scientific notation.
print(f"yguess= {a:.2f} + {b:.2f}*x")

dfout = pd.DataFrame({'third_exam': x, 'final_exam': y,
                      'regression_line_final_exam': np.round(yguess, 2)})
# per-point squared error, computed from the unrounded predictions
dfout['squared error'] = (y - yguess)**2

sse_guessed = SSE(x, y, slope=b, intercept=a)
print(f"SSE for this guessed regression line: SSE = {sse_guessed:.2f}")

print(40*"=")
# last expression of the cell: the notebook displays the table
dfout

In [None]:
# Cross-check: the per-point squared errors stored in the table's
# 'squared error' column add up to the SSE.
squared_errors = dfout['squared error']
squared_errors.sum()

### Make scatterplot and the guessed linear regression line.

In [None]:
# Axis limits: the integer data range padded by one unit on each side.
xmin = int(np.min(x)) - 1
xmax = int(np.max(x)) + 1
ymin = int(np.min(y)) - 1
ymax = int(np.max(y)) + 1
# evenly spaced x values spanning the full axis range, used to draw the line
x0 = np.linspace(xmin, xmax, 10)

fig, ax = plt.subplots(1, 1, figsize=(6, 6))

ax.plot(x, y, '+', markersize=12, color='black', label='data points')
# draw the guessed line a + b*x over x0 so it reaches both plot edges
ax.plot(x0, a + b*x0, lw=2, color='red', label='guessed fitted line')
ax.set_xlim([xmin, xmax])
ax.set_ylim([ymin, ymax])
ax.set_xlabel("third exam score")
ax.set_ylabel("final exam score")
ax.set_title('Final exam score vs third exam score')

ax.legend()
ax.grid()
plt.show()




## Task 2: Work with centered data
## Enter a slope (intercept fixed at zero) and calculate the fitted line

#### Calculates the SSE


Here, we work with the centered data. The intercept we set to zero and 
just test various slope values.

Note: Ordinary least-squares regression lines for centered data pass through the origin, i.e. the intercept is a=0.


In [None]:
# centered data: the intercept is fixed at zero, only the slope is guessed
a = 0
while True:
    try:
        b = float(input("Enter slope b = "))
        break
    except ValueError:
        # a typo such as "4,5" should not abort the cell with a traceback
        print("Please enter a number, for example 4.5")


In [None]:
## We center the data around their mean values
# for illustrative purposes
# that allows us to work with intercept 0
# and focus on finding the best fitting slope
# NOTE: x and y are overwritten here, so every cell run after this one
# sees the centered data. Re-running this cell is harmless: centering
# already-centered data changes nothing beyond floating-point rounding.
x = x - np.mean(x)
y = y - np.mean(y)

# centered final exam scores predicted by the guessed line
# (x holds the centered third exam scores)
yguess = a + b*x
# collect data information to show as data table
print("Estimated final grades with regression line")
# '.2f' prints two decimals for both parameters; a bare '.2' would mean
# two significant digits and can switch to scientific notation.
print(f"yfit= {a:.2f} + {b:.2f}*x")
dfout = pd.DataFrame({'third_exam': x, 'final_exam': y,
                      'regression_line_final_exam': np.round(yguess, 2)})
# per-point squared error, computed from the unrounded predictions
dfout['squared error'] = (y - yguess)**2

sse_guessed = SSE(x, y, slope=b, intercept=a)
print(f"SSE for this guessed regression line: SSE = {sse_guessed:.2f}")
# uncomment the next line to display the data table
#dfout

In [None]:
# Axis limits: the integer data range padded by one unit on each side.
xmin = int(np.min(x)) - 1
xmax = int(np.max(x)) + 1
ymin = int(np.min(y)) - 1
ymax = int(np.max(y)) + 1
# evenly spaced x values spanning the full axis range, used to draw the line
x0 = np.linspace(xmin, xmax, 10)

fig, ax = plt.subplots(1, 1, figsize=(6, 6))

ax.plot(x, y, '+', markersize=12, color='black', label='data points')
# draw the guessed line a + b*x over x0 so it reaches both plot edges
ax.plot(x0, a + b*x0, lw=2, color='red', label='guessed fitted line')
ax.set_xlim([xmin, xmax])
ax.set_ylim([ymin, ymax])
ax.set_xlabel("third exam score (centered)")
ax.set_ylabel("final exam score (centered)")
ax.set_title('Final exam score vs third exam score (anomalies)')

ax.legend()
ax.grid()
plt.show()




### Task 3: Test multiple values for the slope b and collect the resulting SSE
#### Collect the slope and corresponding SSE values in lists, then plot the SSE over the slope values.


In [None]:
# start with this and run the next code cell to plot the results
trial_and_error = True
if trial_and_error:
    sse_list = [3620.18, 3382, 3076,  2696,  2480,   2426,  2536  , 2809 , 3246] # example values
    slope_list= [ 1.0,    1.5,    2,     3,     4,      5,     6   ,   7 ,   8] # example values
else:
    # then do this and run the cell below and plot the results
    # Slopes 0.0, 0.1, ..., 9.9 generated in one step. Repeatedly adding 0.1
    # accumulates rounding error, which gives one extra step close to 10
    # even though the loop condition was b < 10. The comprehension also
    # leaves the guessed slope b untouched.
    slope_list = np.linspace(0, 10, 100, endpoint=False).tolist()
    sse_list = [SSE(x, y, slope, intercept=0) for slope in slope_list]
         

In [None]:
min_slope, max_slope = 0, 10
min_sse, max_sse = 1000, 6000

# improve the visual appearance of the plots
plt.style.use("ggplot")

fig, ax = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: full slope/SSE range. Right panel: zoom around the minimum.
# Each entry is (axes, x-limits, y-limits).
panels = (
    (ax[0], [min_slope, max_slope], [min_sse, max_sse]),
    (ax[1], [3.5, 6.5], [2300, 2800]),
)
for panel, slope_limits, sse_limits in panels:
    panel.set_ylim(sse_limits)
    panel.set_xlim(slope_limits)
    panel.set_xlabel("slope parameter")
    panel.set_ylabel("SSE")
    panel.plot(slope_list, sse_list, 'x', color='green')
    panel.set_title("Sum of Squared Errors")

plt.show()

### Task 5: What is the slope of the best-fitting line that minimizes the Sum of Squared Errors?
#### (work with centered data)
Use the graph as support to find the best fitting slope parameter.
And compare it with the scipy function _linregress()_ results.


In [None]:
# Least-squares fit of y on x; the result fields are copied into the
# notebook's variables (dump receives the standard error of the slope).
fit = linregress(x, y)
b, a = fit.slope, fit.intercept
r, pvalue, dump = fit.rvalue, fit.pvalue, fit.stderr
print(f"slope b: {b:.4f},intercept a: {a:.4f}, correlation coefficient r: {r:.4f} , p-value {pvalue:.4e}")

In [None]:
# scatter plot of the data used for the fit (centered in this task)
plt.scatter(x, y)
# get the fitted line y-values for each x value
yhat = a + b*x
# style '-x' gives line connecting the points and using x as a point marker
plt.plot(x, yhat, '-x', c='blue')
plt.xlabel("third exam score (centered)")
plt.ylabel("final exam score (centered)")
# compute the arrow tip from the fit so it lies exactly on the fitted line
arrow_x = 4
plt.annotate("fitted line", (arrow_x, a + b*arrow_x), (2.5, 30),
             fontsize=14, arrowprops={"arrowstyle": "simple", "color": "gray"})
# explicit show(), as in the other plot cells; it also keeps the notebook
# from echoing the Annotation object returned by plt.annotate
plt.show()

---
### Supplementary code:

More detailed text-based information on the linear regression results

In [None]:
# Fit once, unpack the five result fields, then print a text summary.
regression = linregress(x, y)
b, a, r, p_value, dump = regression
summary = (
    "The linear regression line that minimizes the SSE:",
    f"yfit={a:.2f}+{b:.2f}*x",
    f"with correlation r={r:.2f}",
    f"and p-value to H0: 'b is equal 0' is p={p_value:.5f}",
)
for line in summary:
    print(line)
