In [1]:
import os

import pandas as pd
import numpy as np
import time
import sqlite3
from pyod.models.iforest import IForest, check_array
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# Notebook-wide pandas display settings: never truncate columns,
# and cap rendered rows at 100.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# Get Data

In [7]:
class AnomalyСalculation:
    '''
    Find anomalous player box-score lines among past games.

    NOTE: the "С" in the class name is the Cyrillic letter U+0421, not the
    Latin "C". The name is kept unchanged so existing callers keep working;
    an ASCII-only alias, ``AnomalyCalculation``, is bound right after the
    class body.
    '''

    def __init__(self):
        # Columns the anomaly detectors are fitted on.
        self.calculus_col = ['min_sec', 'FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA',
                             'REB', 'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS', 'PLUS_MINUS']

    def get_data(self, limit=2000, db_path: str=r'../../data/basnya.db'):
        """
        Load box-score rows from the local SQLite database.

        Args:
            limit: maximum number of rows to read. The query has no ORDER BY,
                so these are the first rows in whatever order SQLite returns
                them, not necessarily the most recent games. ``None`` reads
                the whole table.
            db_path: path to the local SQLite file.

        Returns:
            DataFrame with the table's columns (minus 'index' when present)
            plus three derived columns: ``_min`` and ``_sec`` (the two parts
            of ``MIN`` split on ':', 0 where missing) and ``min_sec``
            (playing time in fractional minutes). Remaining NaN values are
            replaced with 0.
        """
        # SQLite treats a negative LIMIT as "no upper bound".
        row_limit = -1 if limit is None else int(limit)

        conn = sqlite3.connect(db_path)
        try:
            # Bind the limit as a parameter instead of formatting it into the SQL text.
            df = pd.read_sql_query(
                "SELECT * FROM boxscoretraditionalv2_0 LIMIT ?",
                conn,
                params=(row_limit,),
            )
        finally:
            # Always release the database handle, even if the query fails.
            conn.close()

        df = df.drop(columns='index', errors='ignore')

        # reindex() guarantees exactly two columns even when no value in this
        # batch contains ':' (split would otherwise return a single column and
        # the two-column assignment below would raise). The cast to object lets
        # the integer fill value coexist with the string parts.
        time_parts = (
            df['MIN'].str.split(':', n=1, expand=True)
            .reindex(columns=[0, 1])
            .astype(object)
            .fillna(0)
        )
        df[['_min', '_sec']] = time_parts
        df['min_sec'] = df['_min'].astype(float) + df['_sec'].astype(int) / 60
        return df.fillna(0)

    def get_anomalous_records(self, limit=1000, contamination=0.1,
                              random_state=None) -> pd.DataFrame:
        """
        Return the rows that every detector flags as an outlier.

        The columns listed in ``self.calculus_col`` are standardised and fed
        to three PyOD detectors (IForest, OCSVM, LOF). A row is reported only
        if all three label it 1 (outlier).

        Args:
            limit: number of rows to load via ``get_data``.
            contamination: expected outlier fraction passed to each detector.
            random_state: seed forwarded to IForest; pass an int to make
                repeated runs reproducible. ``None`` keeps the previous
                unseeded behaviour.

        Returns:
            Copy of the loaded rows that were flagged by all detectors.
        """
        df = self.get_data(limit=limit)
        features = StandardScaler().fit_transform(df[self.calculus_col])
        # Validation only; the return value is not needed.
        # Presumably rejects NaN/inf input before fitting -- confirm against
        # the check_array re-exported by pyod.
        check_array(features)

        detectors = [IForest(contamination=contamination, random_state=random_state),
                     OCSVM(contamination=contamination),
                     LOF(contamination=contamination)]

        print('Anomaly searching ... ')
        outlier_masks = []
        for clf in tqdm(detectors):
            clf.fit(features)
            # labels_: binary labels (0: inliers, 1: outliers)
            outlier_masks.append(clf.labels_ == 1)

        # Keep only the rows that are anomalous for all models.
        unanimous = np.logical_and.reduce(outlier_masks)
        return df.loc[unanimous].copy()


# ASCII-only alias: the original class name contains a Cyrillic "С" that is
# visually indistinguishable from the Latin "C" and hard to type.
AnomalyCalculation = AnomalyСalculation
        
                
        
    
    



In [5]:
# Instantiate the calculator in this cell: in file order it comes before the
# cell that first binds `anomaly`, so a fresh-kernel "Run All" would otherwise
# raise NameError here.
anomaly = AnomalyСalculation()
df = anomaly.get_data()
df

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,NICKNAME,START_POSITION,COMMENT,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,GAME_ID_STR,_min,_sec,min_sec
0,12100001,1610612751,BKN,Brooklyn,1627761,DeAndre' Bembry,DeAndre',F,0,24.000000:05,2.0,3.0,0.667,0.0,1.0,0.0,4.0,8.0,0.5,2.0,3.0,5.0,2.0,0.0,0.0,0.0,1.0,8.0,8.0,0012100001,24.000000,05,24.083333
1,12100001,1610612751,BKN,Brooklyn,200794,Paul Millsap,Paul,F,0,18.000000:27,4.0,10.0,0.400,2.0,4.0,0.5,0.0,0.0,0.0,4.0,6.0,10.0,3.0,1.0,2.0,2.0,2.0,10.0,13.0,0012100001,18.000000,27,18.450000
2,12100001,1610612751,BKN,Brooklyn,200746,LaMarcus Aldridge,LaMarcus,C,0,14.000000:43,2.0,6.0,0.333,0.0,2.0,0.0,2.0,2.0,1.0,0.0,4.0,4.0,3.0,0.0,1.0,0.0,3.0,6.0,7.0,0012100001,14.000000,43,14.716667
3,12100001,1610612751,BKN,Brooklyn,1628971,Bruce Brown,Bruce,G,0,25.000000:02,5.0,9.0,0.556,2.0,2.0,1.0,0.0,0.0,0.0,0.0,3.0,3.0,2.0,1.0,0.0,0.0,5.0,12.0,3.0,0012100001,25.000000,02,25.033333
4,12100001,1610612751,BKN,Brooklyn,1628975,Jevon Carter,Jevon,G,0,25.000000:37,3.0,9.0,0.333,2.0,4.0,0.5,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,0.0,3.0,4.0,8.0,5.0,0012100001,25.000000,37,25.616667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89478,2042000301,1612709909,DEL,Delaware,1630296,Braxton Key,Braxton,0,0,16.000000:24,1.0,5.0,0.200,0.0,2.0,0.0,1.0,2.0,0.5,4.0,4.0,8.0,0.0,0.0,0.0,3.0,2.0,4.0,4.0,2042000301,16.000000,24,16.400000
89479,2042000301,1612709909,DEL,Delaware,1629648,Jordan Bone,Jordan,0,DNP - Coach's Decision,0,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2042000301,0,0,0.000000
89480,2042000301,1612709909,DEL,Delaware,1628876,Jared Brownridge,Jared,0,DNP - Coach's Decision,0,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2042000301,0,0,0.000000
89481,2042000301,1612709909,DEL,Delaware,1630239,Lamine Diane,Lamine,0,DNP - Coach's Decision,0,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2042000301,0,0,0.000000


In [8]:
# Build the calculator and collect the rows returned by get_anomalous_records().
anomaly = AnomalyСalculation()
records = anomaly.get_anomalous_records()

DatabaseError: Execution failed on sql 'SELECT * FROM boxscoretraditionalv2_0 LIMIT {}': unrecognized token: "{"

In [4]:
# Display the result of get_anomalous_records() computed in the previous cell.
records

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,NICKNAME,START_POSITION,COMMENT,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,GAME_ID_STR,_min,_sec,min_sec
40,12100002,1610612755,PHI,Philadelphia,203083,Andre Drummond,Andre,C,0,24.000000:22,6.0,9.0,0.667,0.0,0.0,0.000,7.0,8.0,0.875,5.0,9.0,14.0,3.0,0.0,4.0,5.0,1.0,19.0,-4.0,0012100002,24.000000,22,24.366667
277,12100008,1610612744,GSW,Golden State,1629673,Jordan Poole,Jordan,G,0,22.000000:19,10.0,17.0,0.588,7.0,13.0,0.538,3.0,3.0,1.000,0.0,5.0,5.0,5.0,1.0,2.0,0.0,1.0,30.0,21.0,0012100008,22.000000,19,22.316667
430,12100012,1610612739,CLE,Cleveland,1629012,Collin Sexton,Collin,G,0,23.000000:41,3.0,9.0,0.333,2.0,3.0,0.667,6.0,8.0,0.750,0.0,1.0,1.0,1.0,2.0,0.0,5.0,0.0,14.0,-21.0,0012100012,23.000000,41,23.683333
598,12100016,1610612737,ATL,Atlanta,1629629,Cam Reddish,Cam,0,0,27.000000:34,7.0,17.0,0.412,4.0,8.0,0.500,2.0,2.0,1.000,0.0,3.0,3.0,0.0,5.0,0.0,1.0,4.0,20.0,-15.0,0012100016,27.000000,34,27.566667
809,12100022,1610612763,MEM,Memphis,203500,Steven Adams,Steven,C,0,23.000000:44,5.0,7.0,0.714,0.0,0.0,0.000,5.0,6.0,0.833,8.0,8.0,16.0,3.0,0.0,1.0,6.0,3.0,15.0,23.0,0012100022,23.000000,44,23.733333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89390,2042000131,1612709890,AUS,Austin,1629683,Quinndary Weatherspoon,Quinndary,G,0,31.000000:50,11.0,15.0,0.733,2.0,6.0,0.333,1.0,1.0,1.000,0.0,4.0,4.0,2.0,3.0,0.0,8.0,3.0,25.0,-4.0,2042000131,31.000000,50,31.833333
89414,2042000201,1612709909,DEL,Delaware,1630194,Paul Reed,Paul,C,0,36.000000:30,10.0,18.0,0.556,3.0,5.0,0.600,2.0,2.0,1.000,5.0,5.0,10.0,5.0,2.0,3.0,2.0,6.0,26.0,23.0,2042000201,36.000000,30,36.500000
89415,2042000201,1612709909,DEL,Delaware,1630198,Isaiah Joe,Isaiah,G,0,37.000000:52,7.0,17.0,0.412,5.0,14.0,0.357,3.0,3.0,1.000,1.0,4.0,5.0,3.0,3.0,0.0,2.0,0.0,24.0,22.0,2042000201,37.000000,52,37.866667
89431,2042000201,1612709920,RAP,Raptors,1627780,Gary Payton II,Gary,0,0,32.000000:18,7.0,10.0,0.700,1.0,1.0,1.000,1.0,1.0,1.000,2.0,3.0,5.0,3.0,5.0,0.0,4.0,4.0,17.0,-28.0,2042000201,32.000000,18,32.300000


In [3]:
# Read the GAME_ID / GAME_DATE_EST pairs from the GAMES table of the local DB.
# NOTE(review): `conn` is never closed in any visible cell -- confirm no later
# cell reuses it, then close it once `d` has been loaded.
conn = sqlite3.connect(r'../../data/basnya.db')
d = pd.read_sql_query("SELECT GAME_ID, GAME_DATE_EST FROM GAMES", conn)
       

ValueError: orient 'values' not understood

# IForest

In [21]:


# Build a GAME_ID -> DATE mapping so that the match date can be attached.

# game_id_to_date = season.set_index('GAME_ID').GAME_DATE.to_dict()
# df_rep['GAME_DATE'] = df_rep.GAME_ID.map(game_id_to_date)
# df_rep
# NOTE(review): `df_rep` is not bound at notebook scope in any visible cell
# (it is only a local inside get_anomalous_records) -- confirm where it comes
# from; on a fresh kernel this line may raise NameError.
df_rep

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,NICKNAME,START_POSITION,COMMENT,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,GAME_ID_STR,_min,_sec,min_sec
40,12100002,1610612755,PHI,Philadelphia,203083,Andre Drummond,Andre,C,0,24.000000:22,6.0,9.0,0.667,0.0,0.0,0.000,7.0,8.0,0.875,5.0,9.0,14.0,3.0,0.0,4.0,5.0,1.0,19.0,-4.0,0012100002,24.000000,22,24.366667
277,12100008,1610612744,GSW,Golden State,1629673,Jordan Poole,Jordan,G,0,22.000000:19,10.0,17.0,0.588,7.0,13.0,0.538,3.0,3.0,1.000,0.0,5.0,5.0,5.0,1.0,2.0,0.0,1.0,30.0,21.0,0012100008,22.000000,19,22.316667
430,12100012,1610612739,CLE,Cleveland,1629012,Collin Sexton,Collin,G,0,23.000000:41,3.0,9.0,0.333,2.0,3.0,0.667,6.0,8.0,0.750,0.0,1.0,1.0,1.0,2.0,0.0,5.0,0.0,14.0,-21.0,0012100012,23.000000,41,23.683333
449,12100012,1610612741,CHI,Chicago,203897,Zach LaVine,Zach,G,0,24.000000:13,9.0,14.0,0.643,4.0,6.0,0.667,3.0,3.0,1.000,0.0,3.0,3.0,3.0,3.0,0.0,1.0,1.0,25.0,45.0,0012100012,24.000000,13,24.216667
598,12100016,1610612737,ATL,Atlanta,1629629,Cam Reddish,Cam,0,0,27.000000:34,7.0,17.0,0.412,4.0,8.0,0.500,2.0,2.0,1.000,0.0,3.0,3.0,0.0,5.0,0.0,1.0,4.0,20.0,-15.0,0012100016,27.000000,34,27.566667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89390,2042000131,1612709890,AUS,Austin,1629683,Quinndary Weatherspoon,Quinndary,G,0,31.000000:50,11.0,15.0,0.733,2.0,6.0,0.333,1.0,1.0,1.000,0.0,4.0,4.0,2.0,3.0,0.0,8.0,3.0,25.0,-4.0,2042000131,31.000000,50,31.833333
89414,2042000201,1612709909,DEL,Delaware,1630194,Paul Reed,Paul,C,0,36.000000:30,10.0,18.0,0.556,3.0,5.0,0.600,2.0,2.0,1.000,5.0,5.0,10.0,5.0,2.0,3.0,2.0,6.0,26.0,23.0,2042000201,36.000000,30,36.500000
89415,2042000201,1612709909,DEL,Delaware,1630198,Isaiah Joe,Isaiah,G,0,37.000000:52,7.0,17.0,0.412,5.0,14.0,0.357,3.0,3.0,1.000,1.0,4.0,5.0,3.0,3.0,0.0,2.0,0.0,24.0,22.0,2042000201,37.000000,52,37.866667
89431,2042000201,1612709920,RAP,Raptors,1627780,Gary Payton II,Gary,0,0,32.000000:18,7.0,10.0,0.700,1.0,1.0,1.000,1.0,1.0,1.000,2.0,3.0,5.0,3.0,5.0,0.0,4.0,4.0,17.0,-28.0,2042000201,32.000000,18,32.300000
