In [None]:
import numpy as np

# Running mean

* [Incremental averaging](https://math.stackexchange.com/a/1836447/315246)
* [Incremental means and variances](https://datagenetics.com/blog/november22017/index.html)

$
\begin {align*}
\sum_{i=1}^n x_i &= n * \mu_{n} = x_n + (n-1) * \mu_{n-1} \\
\mu_{n}&= \frac {x_n}{n} + \frac {(n-1)}{n} * \mu_{n-1} = \frac {x_n}{n} + \mu_{n-1} - \frac {\mu_{n-1}}{n} \\
\mu_{n}&=\mu_{n-1} + \frac{1}{n}( x_{n} - \mu_{n-1} )
\end {align*}
$

# Running mean of batch data X of size N

Let $X_i$ be the $i$-th batch and $N_i$ its size. The total number of samples in ```X``` after $k$ batches is $
\begin {align*}
L_k = \sum \limits_{i=1}^{k} N_i
\end {align*}
$

Running mean of the entire X after adding a new batch $X_{k}$ (the update takes $\mu_{k-1}$ to $\mu_{k}$).

$
\begin {align*}
\mu _{k} 
&= \frac {L_{k-1} * \mu_{k-1} + \sum\limits X_{k}}{L_{k-1} + N_{k}} \\
&= \frac {(L_{k-1} + N_{k}) * \mu_{k-1} - N_{k} * \mu_{k-1} + \sum\limits X_{k}}{L_{k-1} + N_{k}} \\
&= \mu_{k-1} + \frac {\sum\limits X_{k} - N_{k} * \mu_{k-1}}{L_{k}}
\end {align*}
$

In [3]:
#!/usr/bin/python
"""
Requirements:
Parse the CSV file 'data.csv'. Print out the:
a) row (game) count
b) average of "Game Length"
Skip malformed game data
Example input
"Game Number", "Game Length"
1, 30
2, 29
3, 31
4, 16
5, 24
6, 29
7, 28
8, 117
from thousands to millions of rows
"""
import csv
import math

import numpy
import pandas as pd

def _to_Num(s):
    try:
        return int(s)
    except Exception as e:
        try:
            return float(s)
        except ValueError:
            return 0


def _is_ints(s):
    return [_to_Num(x) for x in s]


def stream(path):
    with open(path, 'rt+') as f:
        # Check if there is a header row
        has_header = csv.Sniffer().has_header(f.read(1024))

        # Reset the FP
        f.seek(0)
        lines = csv.reader(f, skipinitialspace=True)

        # Skip header if exists
        if has_header:
            next(lines)

        # yield rows
        for row in lines:
            if len(row) == 2:
                yield _is_ints(row)


# Incremental (running) mean: mu_n = mu_{n-1} + (x_n - mu_{n-1}) / n.
# Only the current mean and the row count are kept, so memory use stays
# constant however many rows the file has.
avg_length = 0
count = 0

for game_number, game_length in stream('data.csv'):
    # stream() already yields numeric fields, so the value is used as-is.
    # Converting it a second time with _to_Num would push floats through
    # int() and truncate them.
    count += 1
    avg_length += (game_length - avg_length) / count

print(f"Games processed: {count}")
print(f"Average game length: {avg_length}")
print("DONE")

Games processed: 8
Average game length: 38.0
DONE


In [6]:
# Load 'data.csv' into a pandas DataFrame; the bare `df` on the last line
# displays the frame as the cell output.
df = pd.read_csv('data.csv', quotechar='"')
df

Unnamed: 0,GameNumber,GameLength
0,1,30
1,2,29
2,3,31
3,4,16
4,5,24
5,6,29
6,7,28
7,8,117


In [7]:
# Mean of the GameLength column, computed on the column's underlying array.
df['GameLength'].values.mean()

38.0