In [14]:
/**
 * Run Ordinary Least Square regression on a data set and plot the result.
 *
 * @author Haksun Li
 */

%use s2, lets-plot, krangl

// download the data from the web
val df = DataFrame.readCSV("https://s2-nmdev.s3.eu-west-3.amazonaws.com/resources/linear-reg-data.csv")
// display (some of) the data
println(df)

// plot the data points
val p = lets_plot(df.toMap()) + ggsize(500, 250)
p + geom_point() {x = "x"; y = "y"}

// construct a linear model problem
val problem = LMProblem(
    DenseVector(df["y"].asDoubles()), // Y, the dependent variables
    DenseMatrix(DenseVector(df["x"].asDoubles())), // X, the independent variables
    true) // true if to additionally add an intercept term to the model
// run OLS regression
val ols = OLSRegression(problem)

// the regression coefficients
val beta_hat = ols.beta().betaHat()
// the error terms
val residuals = ols.residuals()
println("beta_0 = ${beta_hat[1]}, beta_1 = ${beta_hat[2]}")
println("R^2 = ${residuals.R2()}")
println("expected value of y at x = 50 is ${ols.Ey(DenseVector(50.0))}")

// plot the regression line
lets_plot(mapOf(
    "x" to df["x"].asDoubles(),
    "residuals" to residuals.residuals().toArray())) + ggsize(500, 200) + geom_point() {x = "x"; y = "residuals"}

val lineStartX = -1.0
val lineEndX = 101.0
val lineEnds = mapOf(
    "x" to listOf(lineStartX, lineEndX),
    "y" to listOf(ols.Ey(DenseVector(lineStartX)), ols.Ey(DenseVector(lineEndX)))
)

p + geom_point(alpha = 0.4) {x = "x"; y = "y"} +
    geom_line(data = lineEnds, size = 2) {x = "x"; y = "y"}

A DataFrame: 300 x 2
      x        y
 1   77   79.775
 2   21   23.177
 3   22   25.609
 4   20   17.857
 5   36    41.85
 6   15    9.805
 7   62   58.875
 8   95   97.618
 9   20   18.395
10    5    8.747
and 290 more rows
beta_0 = 1.0143353551195182, beta_1 = -0.4618107736611625
R^2 = 0.9891203611402716
expected value at x = 50 is 50.254956982314745
