In [1]:
import pandas as pd
import numpy as np
import sklearn.linear_model as sklearn
import matplotlib.pyplot as plotLib
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PolynomialFeatures

#linear regression model
lm = sklearn.LinearRegression()

#Load data fra csv
AutoDataSet = pd.read_csv('C:/Users/Morten From/Desktop/Auto.csv').dropna()
#AutoDataSet1.info()

In [2]:
#Finder de to dataset
HorsePowerData = AutoDataSet['horsepower'].values.reshape(-1,1)
MPGData = AutoDataSet['mpg']

#Finder de enkelte data og opdeler dem i test og train
trainData = AutoDataSet.sample(196, random_state = 1) #Random udvalgt train
testData = AutoDataSet[~AutoDataSet.isin(trainData)].dropna(how = 'all') #Burde resultere i de ikke udvalgte til train data
#train
HorsePowerData_train = trainData['horsepower'].values.reshape(-1,1)
MPGData_train = trainData['mpg']
#test
HorsePowerData_test = testData['horsepower'].values.reshape(-1,1)
MPGData_test = testData['mpg']

In [3]:
#Fit model til data
linearRegressionModel = lm.fit(HorsePowerData, MPGData)
#laver leaveOneOut
looVal = LeaveOneOut()
n_splits = looVal.get_n_splits(HorsePowerData)
#Finder n_splits til vores k_Fold
crossVal = KFold(n_splits)
#Printer resultat af crossval, som antal folds, MSE og standard deviation
Results = cross_val_score(linearRegressionModel, HorsePowerData, MPGData, scoring="neg_mean_squared_error", cv=crossVal)
print("Folds: " + str(len(Results)) + ", MSE: " + str(np.mean(np.abs(Results))) + ", STD: " + str(np.std(Results)))

Folds: 392, MSE: 24.231513517929226, STD: 36.79731503640535


In [4]:
for i in range(1,6):
    polynomialFeat = PolynomialFeatures(degree=i)
    HorsePower = polynomialFeat.fit_transform(HorsePowerData)
    linearRegressionModel = lm.fit(HorsePower, MPGData)
    scores = cross_val_score(linearRegressionModel, HorsePower, MPGData, scoring="neg_mean_squared_error", cv=crossVal)
    print("Degree:"+str(i)+" polynomial, MSE: " + str(np.mean(np.abs(scores))) + ", STD: " + str(np.std(scores)))

Degree:1 polynomial, MSE: 24.231513517929226, STD: 36.79731503640535
Degree:2 polynomial, MSE: 19.248213124489396, STD: 34.998446151782474
Degree:3 polynomial, MSE: 19.334984064133813, STD: 35.76513567812919
Degree:4 polynomial, MSE: 19.424430309411886, STD: 35.68335275769751
Degree:5 polynomial, MSE: 19.033211842978396, STD: 35.31729288251292


In [5]:
#Kfold 5.3.3
crossVal = KFold(n_splits=10)

for i in range(1,11):
    polynomialFeat = PolynomialFeatures(degree=i)
    HorsePower = polynomialFeat.fit_transform(HorsePowerData)
    model = lm.fit(HorsePower, MPGData)
    scores = cross_val_score(model, HorsePower, MPGData, scoring="neg_mean_squared_error", cv=crossVal)
    
    print("Degree:"+str(i)+" polynomial, MSE: " + str(np.mean(np.abs(scores))) + ", STD: " + str(np.std(scores)))

Degree:1 polynomial, MSE: 27.439933652339864, STD: 14.510250711281135
Degree:2 polynomial, MSE: 21.23584005580211, STD: 11.797327528898292
Degree:3 polynomial, MSE: 21.336606183328694, STD: 11.844339714637215
Degree:4 polynomial, MSE: 21.353886994209773, STD: 11.986332342224673
Degree:5 polynomial, MSE: 20.905646119059934, STD: 12.18560440073758
Degree:6 polynomial, MSE: 20.82189095906726, STD: 12.126258882595026
Degree:7 polynomial, MSE: 20.953534894379217, STD: 12.060019626712842
Degree:8 polynomial, MSE: 21.077131510426256, STD: 12.04447106023584
Degree:9 polynomial, MSE: 21.03675183384266, STD: 11.948760351967676
Degree:10 polynomial, MSE: 20.981013741561554, STD: 11.797365253121383


In [6]:
#bootstrap 5.3.4
PortFolioData = pd.read_csv('C:/Users/Morten From/Desktop/Portfolio.csv')

#Define alpha, which takes x y and returns alpha
def alphaFunction(X,Y):
    return ((np.var(Y)-np.cov(X,Y)[0][1])/(np.var(X)+np.var(Y)-2*np.cov(X,Y)[0][1]))

#Estimate alpha for the full 100 samples in the portfolio
x = PortFolioData.X[0:100]
y = PortFolioData.Y[0:100]
print("Alpha for all 100 samples in portfolio:")
print(alphaFunction(x,y))

Alpha for all 100 samples in portfolio:
0.5766511516104118


In [7]:
#Random replacement of 
samples = PortFolioData.sample(frac=1, replace=True)
x = samples.X[0:100]
y = samples.Y[0:100]
print("Alpha for new samples in portfolio:")
print(alphaFunction(x,y))

Alpha for new samples in portfolio:
0.6885263919413074


In [8]:
totalAlpha = 0
alphaValues = []
AlphaValue = 0
for i in range(0,1000):
        samples = PortFolioData.sample(frac=1, replace=True)
        x = samples.X[0:100]
        y = samples.Y[0:100]
        #Alpha = alphaFunction(x,y)
        #print("Alpha for new samples in portfolio:")
        #print(Alpha)
        AlphaValue = alphaFunction(x,y)
        totalAlpha += AlphaValue
        alphaValues.append(AlphaValue)
AproxAlpha = totalAlpha / 1000
print(AproxAlpha)
print("STD: " + str(np.std(alphaValues)))

0.5815197047487938
STD: 0.09031602003121492


In [16]:
#Estimating the Accuracy of a Linear Regression Model
from sklearn.utils import resample
#From former assignments we have the linear regression model linearRegressionModel and the dataset AutoDataSet
LinearRegression = lm.fit(HorsePowerData, MPGData)
print(LinearRegression.intercept_,LinearRegression.coef_)

39.93586102117047 [-0.15784473]


In [17]:
HorsePower, MPG = resample(HorsePowerData, MPGData, n_samples=392)
LinearRegression = lm.fit(HorsePower,MPG)
print(LinearRegression.intercept_,LinearRegression.coef_)

39.71755179967372 [-0.15228939]


In [11]:
InterceptValues = []
CoefValues = []

for i in range(0,1000):
        HorsePower, MPG = resample(HorsePowerData, MPGData, n_samples=392)
        LinearRegression = lm.fit(HorsePower,MPG)
        InterceptValues.append(LinearRegression.intercept_)
        CoefValues.append(LinearRegression.coef_)
        
InterceptMean = np.mean(InterceptValues)
CoefMean = np.mean(CoefValues)
print("bootstrap intercept: "+ str(InterceptMean))
print("bootstrap coef:" + str(CoefMean))
print("STD intercept: " + str(np.std(InterceptValues)))
print("STD coef: " + str(np.std(CoefValues)))


bootstrap intercept: 39.94767233184664
bootstrap coef:-0.15816538383200823
STD intercept: 0.8312148882704347
STD coef: 0.007097013664807689
