In [None]:
import pandas as pd
import glob
from datetime import datetime
import matplotlib.pyplot as plt

# Directory that holds the CSV files to analyse (named `out_dir` — presumably
# because it is the output directory of the step that produced them; confirm).
out_dir = "../data"

# glob returns paths in arbitrary, filesystem-dependent order; sort them so the
# concatenated row order (and the fresh 0..n-1 index) is reproducible.
csv_files = sorted(glob.glob(out_dir + "/*.csv"))
if not csv_files:
    # Without this guard pd.concat([]) fails with the much less helpful
    # "No objects to concatenate".
    raise FileNotFoundError("No *.csv files found in " + repr(out_dir))

# Read every file and stack them into one frame.
# Columns at positions 1, 2 and 5 are parsed as dates, day-first (DD/MM/...).
result = pd.concat(
    [
        pd.read_csv(csv_path, encoding="utf-8", parse_dates=[1, 2, 5], dayfirst=True)
        for csv_path in csv_files
    ],
    ignore_index=True,
)

In [None]:
# Build one row per (author, month) with per-status sums/counts of `factor`
# plus summary columns (count, return, win, success_rate).

MIN_GAMES_THRES = 10  # too few games per period are not counted

# Group by time ranges: bucket every row into its calendar month.
per = result.date.dt.to_period('M')
stats_long = result.groupby(by=['author', per, 'status']).agg({'factor': ['sum', 'count']})

# Tune up dataframe: flatten ('factor', 'sum') -> 'factor_sum'.
# Iterating a MultiIndex yields tuples directly, so no .ravel() is needed.
stats_long.columns = ["_".join(col) for col in stats_long.columns]
stats_long = stats_long.reset_index()

# Make columns with detailed data on W-R-L: one column per (metric, status),
# e.g. 'factor_sum_W', 'factor_count_L'.
stats_wide = stats_long.pivot_table(
    index=['author', 'date'],
    columns='status',
    values=['factor_sum', 'factor_count'],
)
stats_wide.columns = ["_".join(col) for col in stats_wide.columns]
stats_wide = stats_wide.fillna(value=0)

# A status that never occurs in the loaded data produces no column at all,
# which would make the arithmetic below fail with KeyError; add it as zeros.
for needed_col in ('factor_count_L', 'factor_count_R', 'factor_count_W', 'factor_sum_W'):
    if needed_col not in stats_wide.columns:
        stats_wide[needed_col] = 0.0

# Add summary columns.
games_total = (
    stats_wide['factor_count_L']
    + stats_wide['factor_count_R']
    + stats_wide['factor_count_W']
)
# Periods with fewer than MIN_GAMES_THRES games get NaN, so their
# success_rate is NaN as well.
stats_wide['count'] = games_total.where(games_total >= MIN_GAMES_THRES)
# NOTE(review): presumably W/R/L mean won / returned / lost and each game is a
# unit stake, so 'return' is the total payout and 'win' the net profit — confirm.
stats_wide['return'] = stats_wide['factor_sum_W'] + stats_wide['factor_count_R']
stats_wide['win'] = (
    stats_wide['factor_sum_W']
    - stats_wide['factor_count_L']
    - stats_wide['factor_count_W']
)
stats_wide['success_rate'] = stats_wide['win'] / stats_wide['count'] * 100

# Later cells expect the result under the name `df`, with 'author' and 'date'
# as ordinary columns.
df = stats_wide.reset_index()
df

In [None]:
# Plot success rate of some best authors.
selected_authors = [
    'chaplygin',
    'falcao1984',
    'zhukov',
    'stavkaprognozsport',
    'teplofevralya',
]

# Keep only the selected authors and the columns needed for the chart
# (single .loc instead of chained indexing).
df_plot = df.loc[df.author.isin(selected_authors), ['author', 'date', 'success_rate']]
# Reshape to one row per period and one column per author.
df_plot = df_plot.pivot(index='date', columns='author', values='success_rate')

# Periods whose success_rate is NaN show up as gaps in the lines.
ax = df_plot.plot(title='Success rate of selected authors')
ax.set_ylabel('success rate, %');

In [None]:
# Interpolate NaNs so that the lines are drawn without gaps.
# NOTE: with default arguments interpolate() also carries the last valid value
# forward over trailing NaNs, so a line may continue flat after an author's
# last valid period.
ax_interp = df_plot.interpolate().plot(title='Success rate of selected authors (NaNs interpolated)')
ax_interp.set_ylabel('success rate, %');