### Packages ###

In [1]:
#r "nuget: FSharp.Data"
#r "nuget: FSharp.Stats"
#r "nuget: Plotly.NET"

open FSharp.Stats
open FSharp.Stats.Algebra.LinearAlgebra
open FSharp.Data

### Types ###

In [4]:
type Vec = Vector<float>

type Mat = Matrix<float>

type Data = CsvProvider<"../Data/kc_house_data.csv", ResolutionFolder=__SOURCE_DIRECTORY__>

// single observation
type Obs = Data.Row

type HousePrice = float

type Theta = float

type Feature = float

type Features = Feature list

type Featurizer = Obs -> Features

type Model = Obs -> HousePrice

### Helper functions ###

In [11]:
let rng = Random(314159)

let shuffle (arr: 'a[]) =
    let arr = Array.copy arr
    let l = arr.Length

    for i in (l - 1) .. -1 .. 1 do
        let tmp = arr[i]
        let j = rng.Next(0, i + 1)
        arr[i] <- arr[j]
        arr[j] <- tmp

    arr

let loadData () =
    Data.Load(__SOURCE_DIRECTORY__ + "/../Data/kc_house_data.csv")

let splitData (rate: float) (data: Data) =
    let shuffled = data.Rows |> Seq.toArray |> shuffle
    let size = rate * float shuffled.Length |> int
    shuffled[..size], shuffled[size + 1 ..]

let evaluate (model: Model) (data: Obs seq) =
    data |> Seq.averageBy (fun obs -> abs (model obs - float obs.Price))
    
let estimateTheta (X: Mat) (Y: Vec) =
    let XT = X |> Matrix.transpose

    ((XT * X) |> Inverse) * XT * Y
    
let train (f: Featurizer) (data: Obs seq)  =
    let Yt,Xt =
        data
        |> Seq.toList
        |> List.map (fun obs -> obs.Price, f obs)
        |> List.unzip
    
    let thetas = estimateTheta (matrix Xt) (vector Yt)
    
    thetas, fun obs -> Vector.dot (f obs |> vector) thetas

### Test featurizers ###

In [12]:
let f1 (obs: Obs) = [ 1.0; obs.Sqft_living ]

let f2 (obs: Obs) =
    [ 1.0; obs.Sqft_living; obs.Lat |> float; obs.Long |> float ]

let f3 (obs: Obs) =
    [ 1.0
      obs.Sqft_living
      obs.Lat |> float
      obs.Long |> float
      float obs.Bathrooms
      float obs.Bedrooms
      float obs.Grade
      obs.Floors |> float
    ]

let f4 (obs: Obs) =
    [ 1.0
      obs.Sqft_living
      float obs.Bathrooms
      float obs.Bedrooms
      float obs.Grade
      float obs.Sqft_above
      float obs.Sqft_basement
      float obs.Sqft_living15
      float obs.Sqft_lot
      float obs.Sqft_lot15
      float obs.Yr_built
      float obs.Yr_renovated ]

### Let's load some data ###

In [13]:
let allData = loadData()

let training, validation = allData |> splitData 0.7

### Let's train some models ###

In [14]:
let thetas1, model1 = train f1 training
let thetas2, model2 = train f2 training
let thetas3, model3 = train f3 training
let thetas4, model4 = train f4 training

### Let's evaluate them ###

In [15]:
evaluate model1 validation |> printfn "Model1: %A \n"
evaluate model2 validation |> printfn "Model2: %A \n"
evaluate model3 validation |> printfn "Model3: %A \n"
evaluate model4 validation |> printfn "Model4: %A \n"


Model1: 172790.112 

Model2: 148864.4386 

Model3: 140574.1888 

Model4: 1868926.713 

