# Find the streak for a team
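Take each team's most recent games before a match (the code below defaults to the last 10) and total how many were won, drawn and lost, together with the goals and shots for and against.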

In [4]:
import dataset_streak

import csv
import datetime
from functools import reduce


class Dataset:
    """Match results read from a CSV file, plus pre-match form features.

    Attributes:
        raw_results: One dict per CSV row, in file order, with ``'Date'``
            parsed into a ``datetime.datetime``.
        processed_results: One dict per match for which both teams had enough
            earlier matches. Each holds the full-time result (``'result'``),
            the B365 odds (``'odds-home'``, ``'odds-draw'``, ``'odds-away'``)
            and the ``home-*`` / ``away-*`` totals from ``get_statistics``.
    """

    def __init__(self, file_path):
        """Load ``file_path`` and build ``processed_results``.

        The CSV must provide the columns read here and in ``get_statistics``:
        Date (parsed with ``%d/%m/%y``), HomeTeam, AwayTeam, FTR, FTHG, FTAG,
        HS, AS, HST, AST, B365H, B365D and B365A. Parsing errors on malformed
        rows are not caught.
        """
        self.raw_results = []
        self.processed_results = []

        # The csv module asks for newline='' so that it can handle line
        # endings (including newlines inside quoted fields) itself.
        with open(file_path, newline='') as stream:
            for row in csv.DictReader(stream):
                row['Date'] = datetime.datetime.strptime(row['Date'], '%d/%m/%y')
                self.raw_results.append(row)

        for result in self.raw_results:
            home_statistics = self.get_statistics(result['HomeTeam'], result['Date'])

            if home_statistics is None:
                continue

            away_statistics = self.get_statistics(result['AwayTeam'], result['Date'])

            if away_statistics is None:
                continue

            processed_result = {
                'result': result['FTR'],
                'odds-home': float(result['B365H']),
                'odds-draw': float(result['B365D']),
                'odds-away': float(result['B365A']),
            }

            # Prefix each team's totals so both sets fit in one flat dict.
            for label, statistics in (('home', home_statistics), ('away', away_statistics)):
                for key, value in statistics.items():
                    processed_result[label + '-' + key] = value

            self.processed_results.append(processed_result)

    def filter(self, team, date):
        """Return the raw results involving ``team`` played strictly before ``date``.

        The returned list is a new list and keeps the file order of
        ``raw_results``.
        """
        return [
            result for result in self.raw_results
            if (result['HomeTeam'] == team or result['AwayTeam'] == team)
            and result['Date'] < date
        ]

    def get_statistics(self, team, date, matches=10):
        """Total ``team``'s figures over its last ``matches`` games before ``date``.

        Returns:
            ``None`` when the team has fewer than ``matches`` earlier games.
            Otherwise a dict of integer totals with the keys ``wins``,
            ``draws``, ``losses``, ``goals``, ``opposition-goals``, ``shots``,
            ``shots-on-target``, ``opposition-shots`` and
            ``opposition-shots-on-target``.
        """
        recent_results = self.filter(team, date)

        if len(recent_results) < matches:
            return None

        # "Last N" must mean the latest by date, not the last rows in the
        # file. The sort is stable, so an already chronological file gives
        # exactly the same selection as before.
        recent_results.sort(key=lambda result: result['Date'])

        def map_fn(result):
            # Raw per-match counts seen from ``team``'s side; the column
            # letter (H or A) depends on whether the team played at home.
            if result['HomeTeam'] == team:
                team_letter, opposition_letter = 'H', 'A'
            else:
                team_letter, opposition_letter = 'A', 'H'

            return {
                'wins': 1 if result['FTR'] == team_letter else 0,
                'draws': 1 if result['FTR'] == 'D' else 0,
                'losses': 1 if result['FTR'] == opposition_letter else 0,
                'goals': int(result['FT{}G'.format(team_letter)]),
                'opposition-goals': int(result['FT{}G'.format(opposition_letter)]),
                'shots': int(result['{}S'.format(team_letter)]),
                'shots-on-target': int(result['{}ST'.format(team_letter)]),
                'opposition-shots': int(result['{}S'.format(opposition_letter)]),
                'opposition-shots-on-target': int(result['{}ST'.format(opposition_letter)]),
            }

        def reduce_fn(x, y):
            # Key-wise sum of two per-match dicts.
            return {key: x[key] + y[key] for key in x}

        return reduce(reduce_fn, map(map_fn, recent_results[-matches:]))


In [7]:
# Build a Dataset from data/book.csv (presumably the Dataset class listed
# above, saved as the dataset_streak module).
data = dataset_streak.Dataset('data/book.csv')

<dataset_streak.Dataset at 0x2022eb029e8>

In [8]:
# Display the per-match feature dicts that Dataset.__init__ built.
data.processed_results

[{'result': 'H',
  'odds-home': 1.17,
  'odds-draw': 6.5,
  'odds-away': 21.0,
  'home-wins': 7,
  'home-draws': 2,
  'home-losses': 1,
  'home-goals': 22,
  'home-opposition-goals': 4,
  'home-shots': 178,
  'home-shots-on-target': 92,
  'home-opposition-shots': 92,
  'home-opposition-shots-on-target': 52,
  'away-wins': 3,
  'away-draws': 3,
  'away-losses': 4,
  'away-goals': 9,
  'away-opposition-goals': 11,
  'away-shots': 105,
  'away-shots-on-target': 52,
  'away-opposition-shots': 124,
  'away-opposition-shots-on-target': 81},
 {'result': 'D',
  'odds-home': 2.0,
  'odds-draw': 3.3,
  'odds-away': 4.0,
  'home-wins': 4,
  'home-draws': 1,
  'home-losses': 5,
  'home-goals': 10,
  'home-opposition-goals': 15,
  'home-shots': 110,
  'home-shots-on-target': 64,
  'home-opposition-shots': 151,
  'home-opposition-shots-on-target': 83,
  'away-wins': 4,
  'away-draws': 0,
  'away-losses': 6,
  'away-goals': 14,
  'away-opposition-goals': 18,
  'away-shots': 116,
  'away-shots-on-targ

In [17]:
# Concatenate data/00.csv .. data/17.csv into data/out.csv, keeping only the
# header row of the first file.
# NOTE(review): the output is opened in append mode, so re-running this cell
# adds a second copy of everything to an existing data/out.csv -- confirm
# that this is intended.
with open("data/out.csv", "a") as fout:
    # first file: copied verbatim, header included
    with open("data/00.csv") as first:
        for line in first:
            fout.write(line)
    # now the rest:
    for num in range(1, 18):
        # The inputs are only read, so plain read mode is enough.
        with open('data/' + str(num).zfill(2) + ".csv") as f:
            # Skip the header; the default stops an empty file from raising
            # StopIteration.
            next(f, None)
            for line in f:
                fout.write(line)

In [18]:
import os

# Merge every .csv file found directly in the current working directory into
# csv_out, writing the header once and dropping it from each input file.
csv_header = 'Timestamp,Client IP,Web Service,Status,Good,Bad'
csv_out = 'data/consolidated.csv'

csv_dir = os.getcwd()

# Take only the first os.walk() entry, i.e. csv_dir itself. Looping over the
# whole tree would leave `filenames` holding the listing of the last
# sub-directory visited, while the files are opened relative to csv_dir.
dirpath, dirnames, filenames = next(os.walk(csv_dir))

csv_list = [file for file in filenames if file.endswith('.csv')]

# The output stays open until every input has been copied; both handles are
# closed by their `with` blocks even if a read or write fails.
with open(csv_out, 'w') as csv_merge:
    csv_merge.write(csv_header)
    csv_merge.write('\n')

    for file in csv_list:
        with open(os.path.join(csv_dir, file)) as csv_in:
            for line in csv_in:
                if line.startswith(csv_header):
                    continue
                csv_merge.write(line)
                # A final line without a newline would otherwise fuse with
                # the first row of the next file.
                if not line.endswith('\n'):
                    csv_merge.write('\n')

print('Verify consolidated CSV file : ' + csv_out)

Verify consolidated CSV file : consolidated.csv
