Skip to content
This repository has been archived by the owner on Dec 8, 2022. It is now read-only.

Latest commit

 

History

History
636 lines (493 loc) · 34.6 KB

2020-10-24-pca-preparations.md

File metadata and controls

636 lines (493 loc) · 34.6 KB
keywords title nb_path layout
fastai
Principal Component Analysis from scratch - preparations
_notebooks/2020-10-24-pca-preparations.ipynb
notebook
{% raw %}
{% endraw %}

Libraries and helper functions

{% raw %}
import math as m
import random
import pandas as pd
import numpy as np

import altair as alt

</div>
{% endraw %}
{% raw %}
from typing import List
# Type alias used by the helper functions below: a vector is a list of floats.
Vector = List[float]
</div>
{% endraw %}
{% raw %}
def add(vector1: Vector, vector2: Vector) -> Vector:
    """Return the element-wise sum of two vectors of equal length."""
    assert len(vector1) == len(vector2)
    total = []
    for left, right in zip(vector1, vector2):
        total.append(left + right)
    return total
</div>
{% endraw %}
{% raw %}
def subtract(vector1: Vector, vector2: Vector) -> Vector:
    """Return the element-wise difference ``vector1 - vector2``.

    Both vectors must have the same length.
    """
    assert len(vector1) == len(vector2)
    pairs = zip(vector1, vector2)
    return [minuend - subtrahend for minuend, subtrahend in pairs]
</div>
{% endraw %}
{% raw %}
def vector_sum(vectors: List[Vector]) -> Vector:
    """Return the element-wise sum of a non-empty list of equal-length vectors.

    Raises:
        AssertionError: if ``vectors`` is empty or the vectors differ in length.
    """
    assert vectors, "no vectors provided"
    vector_length = len(vectors[0])
    assert all(len(v) == vector_length for v in vectors), "vectors differ in length"

    # zip(*vectors) groups the i-th components of all vectors together; each
    # group is summed left to right starting from 0.
    return [sum(components) for components in zip(*vectors)]

</div>
{% endraw %}
{% raw %}
def scalar_multiply(c: float, vector: Vector) -> Vector:
    """Scale every component of ``vector`` by the factor ``c``."""
    scaled = []
    for component in vector:
        scaled.append(c * component)
    return scaled
</div>
{% endraw %}
{% raw %}
def vector_mean(vector: Vector) -> float:
    """Return the arithmetic mean of the components of ``vector``.

    Raises:
        ZeroDivisionError: if ``vector`` is empty.
    """
    # The previous body returned every component scaled by 1/n (a list), which
    # contradicts the declared ``float`` return type; the components have to be
    # summed to obtain the mean.
    return sum(vector) / len(vector)
</div>
{% endraw %}
{% raw %}
def dot(vector1: Vector, vector2: Vector) -> float:
    """Compute the dot product: the sum of the pairwise component products.

    Both vectors must have the same length.
    """
    assert len(vector1) == len(vector2)
    total = 0
    for a, b in zip(vector1, vector2):
        total += a * b
    return total
</div>
{% endraw %}
{% raw %}
def sum_of_squares(v: Vector) -> float:
    """Return the sum of the squared components of ``v`` (``v`` dotted with itself)."""
    return dot(v, v)
</div>
{% endraw %}
{% raw %}
def magnitude(v: Vector) -> float:
    """Return the Euclidean length of ``v``: the square root of its sum of squares."""
    return m.sqrt(sum_of_squares(v))
</div>
{% endraw %}
{% raw %}
def gradient_step(v: Vector, gradient: Vector, step_size: float) -> Vector:
    """Move ``v`` by ``step_size`` times ``gradient`` and return the new vector.

    The step is the gradient scaled by ``step_size``; it is added to ``v``
    component by component.
    """
    return add(v, scalar_multiply(step_size, gradient))
</div>
{% endraw %}

Steps

{% raw %}
# Random line parameters; in this cell they are only printed, the sampled
# points below do not depend on them.
intercept = random.randint(-30, 30)
coefficient = random.uniform(-1, 1)
n = 30  # number of sample points

# Draw n integer coordinates. The upper bound of np.random.randint is
# exclusive, hence the "+ 1": x lies in [-50, 10] and y in [-20, 50].
xs = np.random.randint(-50, 10 + 1, n)
ys = np.random.randint(-20, 50 + 1, n)
df = pd.DataFrame({'x': xs, 'y': ys})

print(intercept, coefficient)

# Scatter plot of the raw points with an x/y tooltip.
alt.Chart(df).mark_point().encode(
    alt.X('x:Q'),
    alt.Y('y:Q'),
    alt.Tooltip(['x', 'y']),
)

</div>
-10 0.9679420748641416
<script type="text/javascript"> (function(spec, embedOpt){ let outputDiv = document.currentScript.previousElementSibling; if (outputDiv.id !== "altair-viz-56aad99876ac4c958a43ace61d4f3628") { outputDiv = document.getElementById("altair-viz-56aad99876ac4c958a43ace61d4f3628"); } const paths = { "vega": "https://cdn.jsdelivr.net/npm//vega@5?noext", "vega-lib": "https://cdn.jsdelivr.net/npm//vega-lib?noext", "vega-lite": "https://cdn.jsdelivr.net/npm//vega-lite@4.8.1?noext", "vega-embed": "https://cdn.jsdelivr.net/npm//vega-embed@6?noext", };
// Append an async script element for `lib` (URL looked up in `paths`) to the
// document head. The returned promise resolves with the URL once the script
// has loaded and rejects with an error message if loading fails.
function loadScript(lib) {
  return new Promise(function(resolve, reject) {
    var s = document.createElement('script');
    s.src = paths[lib];
    s.async = true;
    s.onload = () => resolve(paths[lib]);
    s.onerror = () => reject(`Error loading script: ${paths[lib]}`);
    document.getElementsByTagName("head")[0].appendChild(s);
  });
}

// Render `err` in red inside the output div, then rethrow it so the failure
// also propagates to the caller / promise chain.
function showError(err) {
  outputDiv.innerHTML = `<div class="error" style="color:red;">${err}</div>`;
  throw err;
}

// Embed the chart described by `spec` into the output div using the given
// vegaEmbed function; embedding failures are reported through showError.
function displayChart(vegaEmbed) {
  vegaEmbed(outputDiv, spec, embedOpt)
    .catch(err => showError(`Javascript Error: ${err.message}<br>This usually means there's a typo in your chart specification. See the javascript console for the full traceback.`));
}

// Choose how to obtain vega-embed:
//  1. an AMD loader is present -> configure requirejs with `paths` and require it;
//  2. a global vegaEmbed function already exists -> use it directly;
//  3. otherwise load vega, vega-lite and vega-embed one after another, then display.
if(typeof define === "function" && define.amd) {
  requirejs.config({paths});
  require(["vega-embed"], displayChart, err => showError(`Error loading script: ${err.message}`));
} else if (typeof vegaEmbed === "function") {
  displayChart(vegaEmbed);
} else {
  loadScript("vega")
    .then(() => loadScript("vega-lite"))
    .then(() => loadScript("vega-embed"))
    .catch(showError)
    .then(() => displayChart(vegaEmbed));
}

})({"config": {"view": {"continuousWidth": 400, "continuousHeight": 300}}, "data": {"name": "data-253ffb03c5c3da6161e4f4bd338d5909"}, "mark": "point", "encoding": {"tooltip": [{"type": "quantitative", "field": "x"}, {"type": "quantitative", "field": "y"}], "x": {"type": "quantitative", "field": "x"}, "y": {"type": "quantitative", "field": "y"}}, "$schema": "https://vega.github.io/schema/vega-lite/v4.8.1.json", "datasets": {"data-253ffb03c5c3da6161e4f4bd338d5909": [{"x": -34, "y": 0}, {"x": -17, "y": 40}, {"x": -11, "y": 35}, {"x": -36, "y": 49}, {"x": -32, "y": 4}, {"x": -3, "y": 25}, {"x": -19, "y": 43}, {"x": 6, "y": -8}, {"x": -11, "y": 7}, {"x": -36, "y": 8}, {"x": -34, "y": 29}, {"x": -35, "y": 0}, {"x": -16, "y": 21}, {"x": -10, "y": 15}, {"x": -8, "y": 44}, {"x": -7, "y": 7}, {"x": -47, "y": 49}, {"x": 7, "y": 35}, {"x": -16, "y": 40}, {"x": -27, "y": 35}, {"x": -44, "y": -14}, {"x": -30, "y": 12}, {"x": -24, "y": 40}, {"x": -1, "y": 31}, {"x": -29, "y": -2}, {"x": -28, "y": 31}, {"x": -32, "y": -9}, {"x": -49, "y": -7}, {"x": -41, "y": -19}, {"x": -13, "y": 20}]}}, {"mode": "vega-lite"}); </script>

{% endraw %}

De-meaning

{% raw %}
def de_mean(data: List[Vector]) -> List[Vector]:
    """Center each vector in ``data`` by subtracting that vector's own mean.

    ``np.subtract`` is used instead of the ``-`` operator so that plain Python
    lists (as the ``Vector`` annotation promises) are accepted as well as NumPy
    arrays; list and array inputs come back as NumPy arrays.
    """
    return [np.subtract(vector, np.mean(vector)) for vector in data]
</div>
{% endraw %}
{% raw %}
# Center both coordinate arrays around zero.
xs_demean, ys_demean = de_mean([xs, ys])

# Scatter plot of the de-meaned points with an x/y tooltip.
df = pd.DataFrame({'x': xs_demean, 'y': ys_demean})
alt.Chart(df).mark_point().encode(
    alt.X('x:Q'),
    alt.Y('y:Q'),
    alt.Tooltip(['x', 'y']),
)

</div>
<script type="text/javascript"> (function(spec, embedOpt){ let outputDiv = document.currentScript.previousElementSibling; if (outputDiv.id !== "altair-viz-a4637d919bba492a97bea1d9c5bceef0") { outputDiv = document.getElementById("altair-viz-a4637d919bba492a97bea1d9c5bceef0"); } const paths = { "vega": "https://cdn.jsdelivr.net/npm//vega@5?noext", "vega-lib": "https://cdn.jsdelivr.net/npm//vega-lib?noext", "vega-lite": "https://cdn.jsdelivr.net/npm//vega-lite@4.8.1?noext", "vega-embed": "https://cdn.jsdelivr.net/npm//vega-embed@6?noext", };
// Append an async script element for `lib` (URL looked up in `paths`) to the
// document head. The returned promise resolves with the URL once the script
// has loaded and rejects with an error message if loading fails.
function loadScript(lib) {
  return new Promise(function(resolve, reject) {
    var s = document.createElement('script');
    s.src = paths[lib];
    s.async = true;
    s.onload = () => resolve(paths[lib]);
    s.onerror = () => reject(`Error loading script: ${paths[lib]}`);
    document.getElementsByTagName("head")[0].appendChild(s);
  });
}

// Render `err` in red inside the output div, then rethrow it so the failure
// also propagates to the caller / promise chain.
function showError(err) {
  outputDiv.innerHTML = `<div class="error" style="color:red;">${err}</div>`;
  throw err;
}

// Embed the chart described by `spec` into the output div using the given
// vegaEmbed function; embedding failures are reported through showError.
function displayChart(vegaEmbed) {
  vegaEmbed(outputDiv, spec, embedOpt)
    .catch(err => showError(`Javascript Error: ${err.message}<br>This usually means there's a typo in your chart specification. See the javascript console for the full traceback.`));
}

// Choose how to obtain vega-embed:
//  1. an AMD loader is present -> configure requirejs with `paths` and require it;
//  2. a global vegaEmbed function already exists -> use it directly;
//  3. otherwise load vega, vega-lite and vega-embed one after another, then display.
if(typeof define === "function" && define.amd) {
  requirejs.config({paths});
  require(["vega-embed"], displayChart, err => showError(`Error loading script: ${err.message}`));
} else if (typeof vegaEmbed === "function") {
  displayChart(vegaEmbed);
} else {
  loadScript("vega")
    .then(() => loadScript("vega-lite"))
    .then(() => loadScript("vega-embed"))
    .catch(showError)
    .then(() => displayChart(vegaEmbed));
}

})({"config": {"view": {"continuousWidth": 400, "continuousHeight": 300}}, "data": {"name": "data-4e839f0369987c0fa0c1e6c372f962c7"}, "mark": "point", "encoding": {"tooltip": [{"type": "quantitative", "field": "x"}, {"type": "quantitative", "field": "y"}], "x": {"type": "quantitative", "field": "x"}, "y": {"type": "quantitative", "field": "y"}}, "$schema": "https://vega.github.io/schema/vega-lite/v4.8.1.json", "datasets": {"data-4e839f0369987c0fa0c1e6c372f962c7": [{"x": -11.433333333333334, "y": -18.7}, {"x": 5.566666666666666, "y": 21.3}, {"x": 11.566666666666666, "y": 16.3}, {"x": -13.433333333333334, "y": 30.3}, {"x": -9.433333333333334, "y": -14.7}, {"x": 19.566666666666666, "y": 6.300000000000001}, {"x": 3.5666666666666664, "y": 24.3}, {"x": 28.566666666666666, "y": -26.7}, {"x": 11.566666666666666, "y": -11.7}, {"x": -13.433333333333334, "y": -10.7}, {"x": -11.433333333333334, "y": 10.3}, {"x": -12.433333333333334, "y": -18.7}, {"x": 6.566666666666666, "y": 2.3000000000000007}, {"x": 12.566666666666666, "y": -3.6999999999999993}, {"x": 14.566666666666666, "y": 25.3}, {"x": 15.566666666666666, "y": -11.7}, {"x": -24.433333333333334, "y": 30.3}, {"x": 29.566666666666666, "y": 16.3}, {"x": 6.566666666666666, "y": 21.3}, {"x": -4.433333333333334, "y": 16.3}, {"x": -21.433333333333334, "y": -32.7}, {"x": -7.433333333333334, "y": -6.699999999999999}, {"x": -1.4333333333333336, "y": 21.3}, {"x": 21.566666666666666, "y": 12.3}, {"x": -6.433333333333334, "y": -20.7}, {"x": -5.433333333333334, "y": 12.3}, {"x": -9.433333333333334, "y": -27.7}, {"x": -26.433333333333334, "y": -25.7}, {"x": -18.433333333333334, "y": -37.7}, {"x": 9.566666666666666, "y": 1.3000000000000007}]}}, {"mode": "vega-lite"}); </script>

{% endraw %}

Direction

{% raw %}
def direction(w: Vector) -> Vector:
    """Rescale ``w`` to unit length by dividing each component by its magnitude."""
    norm = magnitude(w)
    unit = []
    for component in w:
        unit.append(component / norm)
    return unit

# Example call: rescale the raw (not de-meaned) xs to unit length.
direction(xs)

</div>
[-0.22863117335525085,
 -0.11431558667762542,
 -0.07396890902669881,
 -0.24208006590555972,
 -0.21518228080494198,
 -0.02017333882546331,
 -0.1277644792279343,
 0.04034667765092662,
 -0.07396890902669881,
 -0.24208006590555972,
 -0.22863117335525085,
 -0.2353556196304053,
 -0.10759114040247099,
 -0.06724446275154437,
 -0.053795570201235494,
 -0.04707112392608106,
 -0.31604897493225853,
 0.04707112392608106,
 -0.10759114040247099,
 -0.1815600494291698,
 -0.29587563610679524,
 -0.2017333882546331,
 -0.1613867106037065,
 -0.006724446275154437,
 -0.19500894197947868,
 -0.18828449570432423,
 -0.21518228080494198,
 -0.32949786748256743,
 -0.27570229728133194,
 -0.08741780157700768]
{% endraw %}
{% raw %}
# Rescale each de-meaned coordinate array to unit length.
xs_dir = direction(xs_demean)
ys_dir = direction(ys_demean)

# Scatter plot of the rescaled points with an x/y tooltip.
df = pd.DataFrame({'x': xs_dir, 'y': ys_dir})
alt.Chart(df).mark_point().encode(
    alt.X('x:Q'),
    alt.Y('y:Q'),
    alt.Tooltip(['x', 'y']),
)

</div>
<script type="text/javascript"> (function(spec, embedOpt){ let outputDiv = document.currentScript.previousElementSibling; if (outputDiv.id !== "altair-viz-cc68fd51365d4640bc10331329a00f15") { outputDiv = document.getElementById("altair-viz-cc68fd51365d4640bc10331329a00f15"); } const paths = { "vega": "https://cdn.jsdelivr.net/npm//vega@5?noext", "vega-lib": "https://cdn.jsdelivr.net/npm//vega-lib?noext", "vega-lite": "https://cdn.jsdelivr.net/npm//vega-lite@4.8.1?noext", "vega-embed": "https://cdn.jsdelivr.net/npm//vega-embed@6?noext", };
// Append an async script element for `lib` (URL looked up in `paths`) to the
// document head. The returned promise resolves with the URL once the script
// has loaded and rejects with an error message if loading fails.
function loadScript(lib) {
  return new Promise(function(resolve, reject) {
    var s = document.createElement('script');
    s.src = paths[lib];
    s.async = true;
    s.onload = () => resolve(paths[lib]);
    s.onerror = () => reject(`Error loading script: ${paths[lib]}`);
    document.getElementsByTagName("head")[0].appendChild(s);
  });
}

// Render `err` in red inside the output div, then rethrow it so the failure
// also propagates to the caller / promise chain.
function showError(err) {
  outputDiv.innerHTML = `<div class="error" style="color:red;">${err}</div>`;
  throw err;
}

// Embed the chart described by `spec` into the output div using the given
// vegaEmbed function; embedding failures are reported through showError.
function displayChart(vegaEmbed) {
  vegaEmbed(outputDiv, spec, embedOpt)
    .catch(err => showError(`Javascript Error: ${err.message}<br>This usually means there's a typo in your chart specification. See the javascript console for the full traceback.`));
}

// Choose how to obtain vega-embed:
//  1. an AMD loader is present -> configure requirejs with `paths` and require it;
//  2. a global vegaEmbed function already exists -> use it directly;
//  3. otherwise load vega, vega-lite and vega-embed one after another, then display.
if(typeof define === "function" && define.amd) {
  requirejs.config({paths});
  require(["vega-embed"], displayChart, err => showError(`Error loading script: ${err.message}`));
} else if (typeof vegaEmbed === "function") {
  displayChart(vegaEmbed);
} else {
  loadScript("vega")
    .then(() => loadScript("vega-lite"))
    .then(() => loadScript("vega-embed"))
    .catch(showError)
    .then(() => displayChart(vegaEmbed));
}

})({"config": {"view": {"continuousWidth": 400, "continuousHeight": 300}}, "data": {"name": "data-4cbdaba46d25db68017e0689aa4af2c4"}, "mark": "point", "encoding": {"tooltip": [{"type": "quantitative", "field": "x"}, {"type": "quantitative", "field": "y"}], "x": {"type": "quantitative", "field": "x"}, "y": {"type": "quantitative", "field": "y"}}, "$schema": "https://vega.github.io/schema/vega-lite/v4.8.1.json", "datasets": {"data-4cbdaba46d25db68017e0689aa4af2c4": [{"x": -0.1382701487741325, "y": -0.1699838452782193}, {"x": 0.06732103453434438, "y": 0.19361796280353322}, {"x": 0.13988262864321857, "y": 0.14816773679331416}, {"x": -0.16245734681042387, "y": 0.2754283696219275}, {"x": -0.11408295073784108, "y": -0.13362366447004403}, {"x": 0.23663142078838414, "y": 0.05726728477287603}, {"x": 0.04313383649805298, "y": 0.22088809840966467}, {"x": 0.3454738119516954, "y": -0.24270420689456979}, {"x": 0.13988262864321857, "y": -0.1063535288639126}, {"x": -0.16245734681042387, "y": -0.0972634836618688}, {"x": -0.1382701487741325, "y": 0.09362746558105128}, {"x": -0.15036374779227818, "y": -0.1699838452782193}, {"x": 0.07941463355249008, "y": 0.020907103964700777}, {"x": 0.15197622766136426, "y": -0.0336331672475621}, {"x": 0.17616342569765567, "y": 0.22997814361170846}, {"x": 0.18825702471580136, "y": -0.1063535288639126}, {"x": -0.29548693601002657, "y": 0.2754283696219275}, {"x": 0.3575674109698411, "y": 0.14816773679331416}, {"x": 0.07941463355249008, "y": 0.19361796280353322}, {"x": -0.0536149556471126, "y": 0.14816773679331416}, {"x": -0.25920613895558947, "y": -0.2972444781068327}, {"x": -0.08989575270154969, "y": -0.06090330285369354}, {"x": -0.0173341585926755, "y": 0.19361796280353322}, {"x": 0.26081861882467555, "y": 0.1118075559851389}, {"x": -0.07780215368340399, "y": -0.1881639356823069}, {"x": -0.0657085546652583, "y": 0.1118075559851389}, {"x": -0.11408295073784108, "y": -0.2517942520966136}, {"x": -0.31967413404631795, "y": -0.233614161692526}, {"x": 
-0.22292534190115237, "y": -0.3426947041170518}, {"x": 0.11569543060692716, "y": 0.011817058762656964}]}}, {"mode": "vega-lite"}); </script>

{% endraw %}