In [1]:
#import libraries
import functions as func
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

In [2]:
#read data
df = pd.read_csv("houseprices.csv")
df.head()

Unnamed: 0,Id,LotArea,Street,LotShape,LandContour,Utilities,Neighborhood,BldgType,HouseStyle,OverallQual,...,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Fireplaces,GarageType,GarageArea,MoSold,YrSold,SalePrice
0,1,8450,Pave,Reg,Lvl,AllPub,CollgCr,1Fam,2Story,7,...,3,1,Gd,8,0,Attchd,548,2,2008,208500
1,2,9600,Pave,Reg,Lvl,AllPub,Veenker,1Fam,1Story,6,...,3,1,TA,6,1,Attchd,460,5,2007,181500
2,3,11250,Pave,IR1,Lvl,AllPub,CollgCr,1Fam,2Story,7,...,3,1,Gd,6,1,Attchd,608,9,2008,223500
3,4,9550,Pave,IR1,Lvl,AllPub,Crawfor,1Fam,2Story,7,...,3,1,Gd,7,1,Detchd,642,2,2006,140000
4,5,14260,Pave,IR1,Lvl,AllPub,NoRidge,1Fam,2Story,8,...,4,1,Gd,9,1,Attchd,836,12,2008,250000


In [3]:
df = func.tidyData(df)
df.head()

Unnamed: 0,Id,LotArea,Street,LotShape,LandContour,Utilities,Neighborhood,BldgType,HouseStyle,OverallQual,...,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Fireplaces,GarageType,GarageArea,MoSold,YrSold,SalePrice
0,1,8450,Pave,Reg,Lvl,AllPub,CollgCr,1Fam,2Story,7,...,3,1,Gd,8,0,Attchd,548,2,2008,1
1,2,9600,Pave,Reg,Lvl,AllPub,Veenker,1Fam,1Story,6,...,3,1,TA,6,1,Attchd,460,5,2007,0
2,3,11250,Pave,IR1,Lvl,AllPub,CollgCr,1Fam,2Story,7,...,3,1,Gd,6,1,Attchd,608,9,2008,1
3,4,9550,Pave,IR1,Lvl,AllPub,Crawfor,1Fam,2Story,7,...,3,1,Gd,7,1,Detchd,642,2,2006,0
4,5,14260,Pave,IR1,Lvl,AllPub,NoRidge,1Fam,2Story,8,...,4,1,Gd,9,1,Attchd,836,12,2008,1


In [4]:
scores = func.fitLogisticRegression(df, func.logScore)
scores

[29.772859181220817,
 43.71682900200041,
 20.182141594073055,
 26.346008714724523,
 19.773850598207186,
 19.922685736978348,
 50.6249935378715,
 30.356863419853358,
 34.808313008768124,
 20.79770926390778]

In [5]:
# encode categorical data as binary dummy variables
df = pd.get_dummies(df)

# Separating features and target
y = df["SalePrice"]
X = df.copy(deep=True)
X.drop(["SalePrice"], axis=1, inplace=True)

# Make cross validation generator
cv_generator = KFold(n_splits=10, shuffle=True, random_state=3)

# initialise list for scores
scores = []

# make model object
clf = LogisticRegression(random_state=3, class_weight="balanced")

# make and scoe model on each fold
for train_index, test_index in cv_generator.split(X):

    #get testing and training data in fold
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    #scale data
    X_train_scaled = StandardScaler().fit_transform(X_train)
    X_train = pd.DataFrame(X_train_scaled, columns=X.columns)

    X_test_scaled = StandardScaler().fit_transform(X_test)
    X_test = pd.DataFrame(X_test_scaled, columns=X.columns)

    # fit model
    clf.fit(X_train, y_train)

    # make prediction
    y_pred = clf.predict_proba(X_test)

    # score prediction
    score = func.brierScore(y_test.to_numpy() , y_pred)

    # add score to list
    scores.append(score)



In [6]:
scores

[0.06019837770146106,
 0.0791096923031067,
 0.044296784226596546,
 0.05936609673459879,
 0.04292630652963812,
 0.046076663493960404,
 0.08980650795217206,
 0.05938536580119842,
 0.07630252541547435,
 0.046423928467225074]