In [None]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
from sklearn.linear_model import LinearRegression
from scipy import optimize

### Linear regression example: Movie success prediction
(M Mestyán, T Yasseri, J Kertész - PloS one, 2013)

* Variables:
  - [V] Number of views of the Wikipedia page
  - [U] Number of editors of the Wikipedia page
  - [E] Number of edits made on the Wikipedia page
  - [R] Collaborative rigor of Wikipedia editing
  - [T] <span style="color:red">Number of theaters that screen the movie</span>
* Time $t=0$ is the day of release
* Coefficient of determination
* Relative importance of parameters can be read
* Coefficients also proportional to parameter importance
<img src="movie.png" width="640">


#### Generate random points

In [None]:
# Draw 100 sample points with 2 features each, uniform on [0, 1).
x = np.random.random(size=(100, 2))

#### Function with noise

In [None]:
# Target: the plane 7*x0 - 3*x1, shifted by a random amount in [-0.5, 0.5).
# NOTE(review): np.random.random() without a size draws ONE scalar, so every
# sample receives the same shift (a random intercept) rather than independent
# per-sample noise. Confirm this is intended; pass size=len(x) for per-sample noise.
y = 7 * x[:, 0] - 3 * x[:, 1] + np.random.random() - 0.5

In [None]:
# Sanity check: display the dimensions of the sample array.
x.shape

In [None]:
# 3D scatter of the samples: the two feature columns on the x/y axes,
# the target value on the z axis.
fig = plt.figure()
ax = plt.axes(projection="3d")
ax.scatter(*x.T, y, marker="o")  # x.T unpacks into the two feature columns
plt.show()

Linear model

$$y_i=w_0+w_1x_i+w_2x_i^2+\cdots+w_mx_i^m+\varepsilon_i$$

$$
\begin{pmatrix} y_1\cr y_2\cr \vdots\cr y_n \end{pmatrix} =
\begin{pmatrix}
1&x_1&x_1^2&\dots &x_1^m\cr
1&x_2&x_2^2&\dots &x_2^m\cr
\vdots&\vdots&\vdots&\ddots &\vdots\cr
1&x_n&x_n^2&\dots &x_n^m\cr
\end{pmatrix}
\begin{pmatrix} w_0\cr w_1\cr \vdots\cr w_m \end{pmatrix}
+
\begin{pmatrix} \varepsilon_1\cr \varepsilon_2\cr \vdots\cr
\varepsilon_n \end{pmatrix}
$$
Solve for $\mathbf{w}$:
$$\mathbf{w}=(X^TX)^{-1}X^T \mathbf{y}$$

#### Solve for w

In [None]:
# Ordinary least squares via the normal equations: w = (X^T X)^{-1} X^T y.
# Solving the linear system (X^T X) w = X^T y yields the same w without
# forming the explicit inverse. No constant column is added here, so the
# fitted model has no intercept term.
w = np.linalg.solve(x.T @ x, x.T @ y)
print(w)

In [None]:
# Compare the data with the fitted model: the first scatter shows the observed
# targets, the second shows the predictions x0*w[0] + x1*w[1] at the same points.
fig = plt.figure()
ax = plt.axes(projection="3d")
ax.scatter(x[:,0], x[:,1], y, marker='o')
ax.scatter(x[:,0], x[:,1], x[:,0] * w[0] + x[:,1] * w[1], marker='o')
plt.show()

#### Internal linear regression

In [None]:
# Fit scikit-learn's linear regression to the same samples and targets.
model = LinearRegression().fit(X=x, y=y)

In [None]:
# Report the model's score on the training data, then the fitted coefficients.
print(model.score(x, y))
print("coeffs:", model.coef_)

#### Why is it better?
Reason: the built-in linear regression also fits a constant (intercept) term. Add a constant term to x so that our own solution can do the same.

Implement it!

### Titanic data
Important columns (starting from 0)
 * 1: survived (1: yes, 0: no)
 * 2: passenger class
 * 4: gender
 * 5: age

The 1 in the last column is for the constant part of the linear regression.

In [None]:
# Load the Titanic passengers into a feature matrix x and a target vector y.
# Feature columns: passenger class, is-male flag (1.0 / 0.0), age (-1 when
# unknown), and a constant 1 for the intercept of the linear regression.
features = []
survived = []
with open("titanic.csv", "r") as f:  # closed automatically, even on a parse error
    f.readline()  # skip the header row
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing newline at end of file
        n = line.split(";")
        try:
            # float() also accepts ages written with a decimal point (e.g. "28.5"),
            # which str.isdigit() rejected and silently turned into "unknown".
            age = float(n[5])
        except ValueError:
            age = -1  # sentinel for a missing / non-numeric age
        features.append([float(n[2]), float(n[4] == "male"), age, 1])
        survived.append(float(n[1]))
# Build the arrays from however many rows the file holds instead of assuming 891;
# reshape keeps x two-dimensional even if no data rows were read.
x = np.array(features, dtype=float).reshape(-1, 4)
y = np.array(survived, dtype=float)
print(x[:6])

#### Exercise:
 * We have used -1 for unknown age. Replace it with the average age (the average of the known ones!)
 * Normalize the columns so that the coefficients can be compared

In [None]:
# Preview the first three feature rows and their targets, separated by a rule.
print(x[:3], "\n---\n", y[:3])

In [None]:
# Mean of each of the first three feature columns (the constant column is skipped).
print(*(x[:, col].mean() for col in range(3)))

In [None]:
# Our method: ordinary least squares via the normal equations,
# w = (X^T X)^{-1} X^T y, computed by solving (X^T X) w = X^T y
# rather than forming the explicit inverse.
w = np.linalg.solve(x.T @ x, x.T @ y)
print(w)

In [None]:
# Fit scikit-learn's linear regression to the current x and y.
model = LinearRegression().fit(X=x, y=y)

In [None]:
# Score the fitted model on the same data it was trained on and print it.
r_sq = model.score(x, y)
print(r_sq)

In [None]:
# Show the fitted coefficient for each feature column.
print("coeffs:", model.coef_)

In [None]:
%timeit XXX our method

In [None]:
# Time scikit-learn's LinearRegression fit on x and y.
%timeit model = LinearRegression().fit(x, y)

### Logistic regression
Just out of curiosity: for binary outcomes it works better than linear regression.

Binary output $Y$. The probability that $Y=1$ is:
$$p(Y=1)=\frac{1}{1+\exp\left(-\sum_i \beta_i x_i\right)} $$
<img src="lin_log_reg.png" width="200">

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier to the current x and y.
model = LogisticRegression()
model.fit(x, y)

# NOTE: for a classifier, score() is the mean accuracy on the given data,
# not a coefficient of determination, despite the variable name.
r_sq = model.score(x, y)
print(r_sq)
print("coeffs:", model.coef_)