In [59]:
import pandas as pd 
from tqdm import tqdm 
from pathlib import Path
from typing import Dict, List
import re 
from collections import defaultdict
import os
import re
import logging

from dotenv import load_dotenv
import openai
from tqdm import tqdm
import pandas as pd

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
output_handler = logging.StreamHandler()
output_handler.setLevel(logging.INFO)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

load_dotenv('../openai.env')
openai.api_key  = os.getenv('API_KEY')
logger = logging.getLogger()

### Utils

In [147]:
def min_to_second(string: str):
    """
    Перевеод формата времени матча в секунды
    
    Аргументы:
        string: str - формат времени матча 
    Возвращает:
        время в секундах (int)
    """
    
    if isinstance(string, str):
        pattern = r"(\d+)\.(\d+):(\d+)"

        match = re.match(pattern, string)
        if match:
            minutes = int(match.group(1))*60
            seconds = int(match.group(3))
            return minutes+seconds
    else:
        return 0
    
    
def get_file_list(path_docs: str) -> List[str]:
    """
    Получитьл список всех файлов в папке
    
    Аргументы:
        path_docs - путь до папки с исходниками для рассчёта статистики  
    Возвращает:
        список путей исходников
    """
    
    path = Path(path_docs)
    parent =  path.parent
    name = path.name
    return [parent/name/path for path in os.listdir(path)]

class StatsHolder:
    """
    Класс для хранение и обработки сигнальных показателей
    """
    
    target_columns: list= ['MIN', 'FGM','FGA','FG3M','FTM','FTA','FT_PCT', 
                           'OREB','DREB','REB','AST','STL','BLK','TO','PF','PTS', 'PLAYER_ID', 'GAME_ID']
    calculus_columns: list = ['MIN', 'FGM','FGA','FG3M','FTM','FTA',
                              'FT_PCT','OREB','DREB','REB','AST','STL','BLK','TO','PF','PTS']
    labels_columns: list = ['PLAYER_ID', 'GAME_ID']
    
    def __init__(self, players_stats: Dict[int, List[pd.Series]]):
        """
        Аргументы:
            players_stats - статистика по игрокам. Ключи - индексы игроков, 
                            значения - сисок статистики по игроку за все указанные игры
        """
        
        self.players_stats: Dict[int, List[pd.Series]] = players_stats

    
    @classmethod
    def from_csv(cls, path_docs: str) -> 'StatsHolder':
        """ 
        Создание объекта StatsHolder из списка файлов со статистикой формата .csv
        """
       
        file_list = get_file_list(path_docs)[:100]
        players_stats = defaultdict(list)
        
        for path in tqdm(file_list, total=len(file_list)):
            df = pd.read_csv(path).fillna(0)
            
            if 'PLAYER_ID' not in df.columns:
                continue
            
            df['MIN'] = df['MIN'].apply(min_to_second)
            df[cls.calculus_columns] =df[cls.calculus_columns].fillna(0).astype(int)
            for index, row in df.iterrows():
                players_stats[row.PLAYER_ID].append(row[cls.target_columns])
                
        return cls(players_stats=players_stats)
        
        
    def add_record(self, record: pd.DataFrame) -> None:
        """
        Добавить запись к общецй статистике игроков
        
        Аргументы:
            record: pd.DataFrame - запись конкретной игры
        """
        
        for _, row in record[self.target_columns].iterrows():
            player_id = row.PLAYER_ID
            game_id = row.GAME_ID

            id_game_list = [stat.GAME_ID for stat in  self.players_stats[player_id]]
             
            if game_id in id_game_list:
                logger.info(f'This GAME_ID:{game_id} is already in the dataset') 
                return None
            
            self.players_stats[player_id].append(row)
            
        logger.info(f'added game_id: {game_id}') 
                       
        
    def set_strategy(self):
        """
        Установить стратегию срабатывания сигнальных показателей
        """
        
        pass
    
    def _calculate_rating(self) -> Dict[str, pd.DataFrame]:
        """
        Рассчёт рейтинга для явсе игроков по всем показателям
        
        Возвращает:
            словарь, где:
                ключь - это наименование индикатора (MIN, PTS и т.п.)
                значение - это DataFrame со следующими колонками:
                    PLAYER_ID - идентификатор игрока
                    VALUE - суммарное значение показателя игрока за весь период рассчёта
                    RATING - какое место занимает игрок в рейтинге по этому показателю \
                        относительно других игроков (0 - самое высокое место)            
        """
        
        # агрегация (суммирование) по игрокам 
        df_all = pd.concat([pd.DataFrame(records) for _, records in self.players_stats.items()], axis=0)
        df_all = df_all.groupby('PLAYER_ID').sum()
        #  получение словаря с рейтингами игроков для каждого сигнального показателя
        ratings = {col: df_all[col].sort_values(ascending=False).to_frame().reset_index()\
                    .rename(columns={col:'VALUE'}).assign(RATING=range(len(df_all[col]))) for col in df_all.columns} 
                                                                 
        return ratings

In [150]:
stats = StatsHolder.from_csv(path_docs='resource/boxscoretraditionalv2/')

100%|██████████| 100/100 [00:02<00:00, 46.91it/s]


In [164]:
record = pd.read_csv(r'resource\boxscoretraditionalv2\boxscoretraditionalv2_0_0012100003.csv') 
record['MIN'] = record['MIN'].apply(min_to_second)
record[stats.calculus_columns] = record[stats.calculus_columns].fillna(0).astype(int)
record['GAME_ID'] = 21211221
record.loc[0,'PLAYER_ID'] = 1627750
record.loc[0,'MIN'] = 10000000000


  record.loc[0,'MIN'] = 10000000000


In [165]:
stats.add_record(record)

2023-10-24 14:23:51,990 - INFO - added game_id: 21211221


In [166]:
stats._calculate_rating()['MIN']

Unnamed: 0,PLAYER_ID,VALUE,RATING
0,1627750,20000000000,0
1,1630596,13991,1
2,1630567,13747,2
3,1628401,13664,3
4,1627734,13341,4
...,...,...,...
601,1630607,0,601
602,200752,0,602
603,1628400,0,603
604,203967,0,604


In [146]:
df_all = pd.concat([pd.DataFrame(records) for _, records in stats.players_stats.items()], axis=0)
df_all = df_all.groupby('PLAYER_ID').sum()
ratings = {col: df_all[col].sort_values(ascending=False).to_frame().reset_index().rename(columns={col:'VALUE'}).assign(RATING=range(len(df_all[col]))) for col in df_all.columns} 
ratings['MIN']


Unnamed: 0,PLAYER_ID,VALUE,RATING
0,1627750,11000000000,0
1,202681,3000000000,1
2,1628369,14139,2
3,1630596,13991,3
4,1630567,13747,4
...,...,...,...
601,1628380,0,601
602,202691,0,602
603,1630618,0,603
604,200752,0,604


In [127]:
[df_all[col].sort_values(ascending=False).to_frame().reset_index().rename(columns={col:'VALUE'}) for col in df_all.columns][0]

Unnamed: 0,index,VALUE
0,0,1000000000
1,15,2823
2,16,2759
3,3,2757
4,4,2755
...,...,...
3480,25,0
3481,33,0
3482,32,0
3483,13,0


In [111]:
ratings = {col: df_all[col].sort_values(ascending=False).to_frame().assign(player_id=df_all[col].index,values=df_all[col],rating=range(len(df_all)))[['player_id','values', 'rating']].reset_index(drop=True) for col in df_all.columns} 


ValueError: cannot reindex on an axis with duplicate labels

In [110]:
ratings

Unnamed: 0,index,VALUE,RATING
0,0,1000000000,0
1,15,2823,1
2,16,2759,2
3,3,2757,3
4,4,2755,4
...,...,...,...
3480,25,0,3480
3481,33,0,3481
3482,32,0,3482
3483,13,0,3483


In [100]:
col = 'MIN'
[df_all[col].sort_values(ascending=False).to_frame().reset_index().rename(columns={col:'VALUE'}).assign(RATING=range(len(df_all[col]))) for col in df_all.columns]


Unnamed: 0,PLAYER_ID,VALUE,RATING
0,1630835,1000000185,0
1,1630596,13991,1
2,1630567,13747,2
3,1628401,13664,3
4,1627734,13341,4
...,...,...,...
601,1627750,0,601
602,1630179,0,602
603,1628380,0,603
604,1630562,0,604


In [81]:
df_all[col]

PLAYER_ID
2544             8796
2546             8858
2617              805
2730             5784
2738             7375
              ...    
1630791             0
1630792             0
1630793             0
1630801             0
1630835    1000000185
Name: MIN, Length: 606, dtype: int64

In [93]:
[df_all[col].sort_values(ascending=False).to_frame().reset_index().rename(columns={col:'VALUE'}) for col in df_all.columns][0]

Unnamed: 0,PLAYER_ID,VALUE
0,1630835,1000000185
1,1630596,13991
2,1630567,13747
3,1628401,13664
4,1627734,13341
...,...,...
601,1627750,0
602,1630179,0
603,1628380,0
604,1630562,0


In [51]:
x.assign(r = x.index)

Unnamed: 0_level_0,MIN,player_id,r
PLAYER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1630835,1000000185,2544,1630835
1630596,13991,2546,1630596
1630567,13747,2617,1630567
1628401,13664,2730,1628401
1627734,13341,2738,1627734
...,...,...,...
1627750,0,1630791,1627750
1630179,0,1630792,1630179
1628380,0,1630793,1628380
1630562,0,1630801,1630562


In [39]:
df_all = df_all.groupby('PLAYER_ID').sum()
df_all

Unnamed: 0_level_0,MIN,FGM,FGA,FG3M,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,GAME_ID
PLAYER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2544,8796,44,89,13,14,18,3,1,27,28,22,7,3,20,14,115,116800222
2546,8858,28,69,16,10,11,4,3,20,23,4,1,2,7,16,82,116800222
2617,805,0,4,0,1,2,0,0,4,4,3,0,2,1,1,1,104700230
2730,5784,17,32,0,16,23,0,16,27,43,5,6,6,9,23,50,116800222
2738,7375,9,21,3,2,2,1,2,21,23,20,5,2,6,9,23,104700189
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1630791,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12100043
1630792,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12100043
1630793,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,24200109
1630801,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12100055


Unnamed: 0,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,NICKNAME,START_POSITION,COMMENT,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS
0,212112212421312966669,1610612753,ORL,Orlando,1630835,Terrence Ross,Terrence,F,,1000000000,2,9,0.222,0,2.0,0.0,1,2,0,1,3,4,1,2,1,0,3,5,-8.0
1,212112212421312966669,1610612753,ORL,Orlando,1630532,Franz Wagner,Franz,F,,1181,1,5,0.2,1,4.0,0.25,0,0,0,0,2,2,2,1,1,2,3,3,5.0
2,212112212421312966669,1610612753,ORL,Orlando,1628976,Wendell Carter Jr.,Wendell,C,,1276,3,4,0.75,1,2.0,0.5,3,4,0,2,6,8,3,0,0,3,3,10,-6.0
3,212112212421312966669,1610612753,ORL,Orlando,203914,Gary Harris,Gary,G,,1231,4,9,0.444,2,6.0,0.333,0,0,0,0,0,0,0,0,1,0,0,10,-3.0
4,212112212421312966669,1610612753,ORL,Orlando,1630591,Jalen Suggs,Jalen,G,,1264,3,11,0.273,2,5.0,0.4,1,2,0,0,4,4,3,0,2,2,4,9,-5.0
5,212112212421312966669,1610612753,ORL,Orlando,1630181,R.J. Hampton,R.J.,,,1338,3,7,0.429,0,0.0,0.0,1,2,0,1,5,6,3,1,0,0,1,7,8.0
6,212112212421312966669,1610612753,ORL,Orlando,1629021,Moritz Wagner,Moritz,,,1345,5,9,0.556,4,7.0,0.571,2,4,0,0,2,2,0,1,0,2,3,16,0.0
7,212112212421312966669,1610612753,ORL,Orlando,1630175,Cole Anthony,Cole,,,1305,6,12,0.5,4,7.0,0.571,0,0,0,1,5,6,6,0,0,3,1,16,10.0
8,212112212421312966669,1610612753,ORL,Orlando,1628964,Mo Bamba,Mo,,,1293,6,10,0.6,1,3.0,0.333,0,0,0,3,7,10,1,0,4,1,2,13,11.0
9,212112212421312966669,1610612753,ORL,Orlando,202734,E'Twaun Moore,E'Twaun,,,1158,0,4,0.0,0,1.0,0.0,0,0,0,2,3,5,3,2,0,1,1,0,13.0


In [116]:
stats.add_record(record)

2023-10-24 14:12:29,569 - INFO - This GAME_ID:212112212421312966669 is already in the dataset


In [117]:
# Новая статистика 
data = stats._calculate_rating()
data['MIN']

Unnamed: 0,player_id,values,rating
0,2544,1000000000,0
1,2546,13991,1
2,2617,13747,2
3,2730,13664,3
4,2738,13341,4
...,...,...,...
601,1630791,0,601
602,1630792,0,602
603,1630793,0,603
604,1630801,0,604


In [114]:
# Старая статистика

data = stats._calculate_rating()
data['MIN']

Unnamed: 0,player_id,values,rating
0,2544,1000000000,0
1,2546,13991,1
2,2617,13747,2
3,2730,13664,3
4,2738,13341,4
...,...,...,...
601,1630791,0,601
602,1630792,0,602
603,1630793,0,603
604,1630801,0,604
