In [19]:
# This notebook is to test online linear regression. I want to get it working here
# before trying to integrate into the sentiment analysis.

import numpy as np
import random
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from scipy import linalg
import sympy

In [9]:
# Online linear regression with a single feature is demonstrated here:
# https://stackoverflow.com/questions/52070293/efficient-online-linear-regression-algorithm-in-python
# Following is a direct lifting of the code in the first answer

def lr(x_avg,y_avg,Sxy,Sx,n,new_x,new_y):
    """
    Update a running single-feature least-squares fit with a new batch of samples.

    x_avg: average of previous x, if no previous sample, set to 0
    y_avg: average of previous y, if no previous sample, set to 0
    Sxy: running sum of (x - mean_x)*(y - mean_y) over previous samples
         (un-normalised covariance), if no previous sample, set to 0
    Sx: running sum of (x - mean_x)**2 over previous samples
        (un-normalised variance), if no previous sample, set to 0
    n: number of previous samples (must be >= 0)
    new_x: new incoming x values (array-like; flattened to 1-D)
    new_y: new incoming y values (array-like; flattened to 1-D, same length as new_x)

    Returns the tuple (new_Sxy, new_Sx, new_n, alpha, beta, new_x_avg, new_y_avg),
    where alpha is the intercept and beta the slope of the fit over all samples seen
    so far. The first three and last two entries are the state to pass to the next call.
    While all x seen so far are identical (new_Sx == 0) the slope is undefined and
    alpha/beta come out as non-finite values; the returned state is still valid.

    Raises ValueError if n is negative (or NaN), if new_x and new_y differ in length,
    or if there are no samples at all (n == 0 and an empty batch).
    """
    if not n >= 0:
        raise ValueError("n must be a non-negative sample count, got %r" % (n,))

    # Accept lists, scalars and column vectors alike; arithmetic below needs arrays.
    new_x = np.asarray(new_x, dtype=float).reshape(-1)
    new_y = np.asarray(new_y, dtype=float).reshape(-1)
    if new_x.size != new_y.size:
        # Without this check a length-1 array would silently broadcast against the other.
        raise ValueError(
            "new_x and new_y must have the same length, got %d and %d"
            % (new_x.size, new_y.size)
        )

    new_n = n + new_x.size
    if new_n == 0:
        # Dividing by zero here would return NaN averages that poison every later update.
        raise ValueError("cannot update the regression without any samples")

    new_x_avg = (x_avg*n + np.sum(new_x))/new_n
    new_y_avg = (y_avg*n + np.sum(new_y))/new_n

    if n > 0:
        # Centring the new batch on this sqrt-weighted blend of the old and new means
        # makes Sx + sum((new_x - x_star)**2) equal the sum of squared deviations of
        # ALL samples about the new mean, so old samples never need to be revisited.
        x_star = (x_avg*np.sqrt(n) + new_x_avg*np.sqrt(new_n))/(np.sqrt(n)+np.sqrt(new_n))
        y_star = (y_avg*np.sqrt(n) + new_y_avg*np.sqrt(new_n))/(np.sqrt(n)+np.sqrt(new_n))
    else:
        # First batch: deviations are taken about the batch mean itself.
        x_star = new_x_avg
        y_star = new_y_avg

    new_Sx = Sx + np.sum((new_x-x_star)**2)
    new_Sxy = Sxy + np.sum((new_x-x_star) * (new_y-y_star))

    beta = new_Sxy/new_Sx
    alpha = new_y_avg - beta * new_x_avg
    return new_Sxy, new_Sx, new_n, alpha, beta, new_x_avg, new_y_avg

# Example of online linear regression applied to 101 batches of random data.
# Each batch holds 10 points drawn as y = 5*x + noise, with x and noise uniform on [0, 1).
random.seed(1234)
x_avg = y_avg = Sxy = Sx = n = 0

# Everything seen so far is also kept so the online result can be checked against a batch fit.
X_total = np.array([])
y_total = np.array([])

for _ in range(101):
    # Draw all x values first, then the noise terms, so the random stream is consumed
    # in a fixed order.
    X = np.array([random.random() for _ in range(10)])
    y = np.array([random.random() + 5*x_val for x_val in X])
    X_total = np.append(X_total, X)
    y_total = np.append(y_total, y)
    Sxy, Sx, n, alpha, beta, x_avg, y_avg = lr(x_avg, y_avg, Sxy, Sx, n, X, y)

# Results. alpha and beta are, respectively, the intercept and coefficient of the regression.
Sxy, Sx, n, alpha, beta, x_avg, y_avg

(416.15301008042337,
 82.8002593290743,
 1010,
 0.49901136647134914,
 5.025986795844447,
 0.506835402394537,
 3.046359406572799)

In [10]:
# Use scikit-learn's linear model to validate the above algorithm.
# The intercept and coefficient should match the alpha and beta values, respectively, found above.
regr = linear_model.LinearRegression()
regr.fit(X_total.reshape(-1,1), y_total)
# Fail loudly on a mismatch instead of relying on a visual comparison of the two outputs.
np.testing.assert_allclose([regr.intercept_, regr.coef_[0]], [alpha, beta])
[regr.intercept_,regr.coef_]

[0.4990113664713478, array([5.0259868])]

In [6]:
# Now we implement multidimensional regression.
# It is based on the normal-equation formula for the regression coefficients,
# b = (X^T X)^{-1} X^T y, given for example at:
# https://stattrek.com/multiple-regression/regression-coefficients.aspx

In [60]:
# Example for testing: five samples with two features each.
y_base = np.array([10,11,12,7,7])
X_base = np.array([[1,2],[0,1],[3,5],[2,1],[3,3]])
# Targets as a column vector; design matrix with a leading column of ones for the intercept.
y = y_base.reshape(-1, 1)
X = np.hstack((np.ones((len(X_base), 1), dtype=int), X_base))
# Running sums of X^T X and X^T y start at zero.
XX = np.zeros((X.shape[1], X.shape[1]))
Xy = np.zeros((X.shape[1], 1))

def _lr_multi_independent_columns(gram):
    """Return indices of the leftmost maximal set of linearly independent columns of `gram`."""
    n_cols = gram.shape[1]
    # Common case: nothing is collinear, so a single rank computation suffices.
    if np.linalg.matrix_rank(gram) == n_cols:
        return list(range(n_cols))
    keep = []
    for j in range(n_cols):
        # Keep column j only if it raises the rank of the columns kept so far.
        if np.linalg.matrix_rank(gram[:, keep + [j]]) > len(keep):
            keep.append(j)
    return keep


def lr_multi(XX,Xy,X,y, calc_results=False):
    """
    Add one batch to the running normal-equation sums and optionally solve them.

    XX: running sum of X^T X over previous batches (k x k); zeros if no previous batch
    Xy: running sum of X^T y over previous batches (k x 1); zeros if no previous batch
    X: new batch of samples, one row per sample (m x k). The function adds no intercept
       column itself, so include a column of ones in X if an intercept is wanted.
    y: new batch of targets, either an (m x 1) column or a 1-D array of length m
    calc_results: when True, also solve for the regression coefficients

    Returns (XX, Xy, results) with the updated sums. `results` is None unless
    calc_results is True; then it is a column vector of coefficients, one per retained
    column of X. If columns of X are linearly dependent, only the leftmost independent
    ones are retained, so `results` then has fewer than k rows.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if y.ndim == 1:
        # A 1-D y would make X^T y 1-D and silently broadcast Xy up to k x k.
        y = y.reshape(-1, 1)

    XX = np.add(XX, np.matmul(X.transpose(), X))
    Xy = np.add(Xy, np.matmul(X.transpose(), y))
    if not calc_results:
        return XX, Xy, None

    # Rank detection uses a numerical tolerance (SVD based), which an exact-zero
    # row reduction on floating-point sums does not.
    keep = _lr_multi_independent_columns(XX)
    # Solve the reduced normal equations directly rather than forming an explicit inverse.
    coefficients = np.linalg.solve(XX[np.ix_(keep, keep)], Xy[keep])
    return XX, Xy, coefficients

# Split into 2 batches: the first three samples, then the remaining two.
(X1, X2), (y1, y2) = np.split(X, [3]), np.split(y, [3])
# The first batch only accumulates the sums; the second one also solves for the coefficients.
XX, Xy, _ = lr_multi(XX, Xy, X1, y1)
XX, Xy, results = lr_multi(XX, Xy, X2, y2, calc_results=True)
# 'results' is a column array. The first element is the intercept, and subsequent
# elements are the coefficients of the various features.
results

array([[ 9.04545455],
       [-2.27272727],
       [ 1.85227273]])

In [12]:
# Validate with scikit-learn. Should match the intercept and coefficients found above.
regr = linear_model.LinearRegression()
regr.fit(X_base, y_base)
# Fail loudly on a mismatch instead of relying on a visual comparison of the two outputs.
# 'results' is the column of [intercept, coefficients...] computed by lr_multi above.
np.testing.assert_allclose(np.ravel(results), np.concatenate(([regr.intercept_], regr.coef_)))
[regr.intercept_, regr.coef_]

[9.045454545454543, array([-2.27272727,  1.85227273])]

In [69]:
# Another example
# The second feature is exactly twice the first, so we should get a singular matrix X^TX.

y_base = np.array([10,11,12,7,7])
X_base = np.array([[1,2],[2,4],[1,2],[3,6],[1,2]])
# Targets as a column vector; design matrix with a leading column of ones for the intercept.
y = y_base.reshape(-1, 1)
X = np.hstack((np.ones((len(X_base), 1), dtype=int), X_base))
XX = np.zeros((X.shape[1], X.shape[1]))
Xy = np.zeros((X.shape[1], 1))

# Two batches: the first two samples, then the remaining three.
(X1, X2), (y1, y2) = np.split(X, [2]), np.split(y, [2])

XX, Xy, _ = lr_multi(XX, Xy, X1, y1)
XX, Xy, results = lr_multi(XX, Xy, X2, y2, calc_results=True)
# These results are not the same as the results given by scikit-learn's regression.
# When the X^TX matrix is singular (linear dependency among features), there is no unique
# regression that minimizes the error. lr_multi keeps only linearly independent columns,
# so 'results' has one entry per retained column rather than one per column of X.
results

array([[11.],
       [-1.]])