In [115]:
import os
from typing import List

import pandas as pd
import numpy as np
import sqlite3
from pyod.models.iforest import IForest, check_array
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.ensemble import IsolationForest
import shap

# Notebook display settings: do not truncate columns when rendering a
# DataFrame (None = no column limit), and show at most 100 rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Utils


In [117]:
# Seed passed as random_state to train_test_split and IsolationForest below,
# so that repeated runs give the same split and the same model.
SEED = 42

def cast_to_0_1(preds):
    """
    Convert -1/1 outlier labels into 0/1 anomaly flags.

    The input uses -1 for outliers and 1 for inliers; the result uses
    1 for outliers and 0 for inliers.

    Args:
        preds: array-like of labels (numpy array, list, pandas Series) or a
            single label.

    Returns:
        Integer flags of the same shape as ``preds``: 1 where the label is -1,
        0 everywhere else.
    """
    # np.equal (instead of ``preds == -1``) also handles plain lists and
    # scalars, for which ``==`` yields a Python bool that has no ``.astype``.
    return np.equal(preds, -1).astype(int)

def explain_outlier(shap_value, columns, top_k=5):
    """
    Pick the features with the lowest SHAP values for one example.

    Args:
        shap_value: object exposing the per-feature SHAP values of a single
            example through its ``.values`` attribute.
        columns: feature names, indexed the same way as ``shap_value.values``.
        top_k: how many features to keep.

    Returns:
        dict mapping feature name -> SHAP value for the ``top_k`` smallest
        values, in ascending order of value.
    """
    contributions = shap_value.values
    # argsort is ascending, so the leading positions hold the smallest values.
    ascending_positions = np.argsort(contributions)

    explanation = {}
    for position in ascending_positions[:top_k]:
        explanation[columns[position]] = contributions[position]
    return explanation

# Get  Data

In [131]:
class AnomalyСalculation:
    '''
    Anomaly calculation over past games.

    Loads player box-score rows from a local SQLite database, fits an
    IsolationForest on the numeric stat columns and explains the detected
    anomalies with SHAP values.

    NOTE(review): the 'С' in the class name appears to be a Cyrillic letter,
    not a Latin 'C'; it is kept unchanged so existing references to the class
    keep resolving - confirm before renaming.
    '''

    def __init__(self):
        # Numeric columns fed to the scaler / IsolationForest.
        self.calculus_col = ['min_sec', 'FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA',
                             'REB', 'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS', 'PLUS_MINUS']
        # Identifying columns kept for additional context (not used by the
        # methods of this class as written).
        self.labels_col = ['GAME_ID', 'TEAM_ID', 'TEAM_ABBREVIATION',
                           'TEAM_CITY', 'PLAYER_ID', 'PLAYER_NAME', 'NICKNAME']

    def get_game_ids(self):
        """Placeholder: not implemented yet, returns None."""
        pass

    def get_data(self, limit=10000000000, db_path: str=r'../../data/basnya.db'):
        """
        Load player box-score rows from the local SQLite database.

        Args:
            limit: maximum number of rows read from ``boxscoretraditionalv2_0``.
                The query has no ORDER BY, so which rows are returned when the
                limit is hit is up to SQLite.
            db_path: path to the local SQLite database file.

        Returns:
            DataFrame with the table's columns (minus ``index``) plus
            ``_min`` / ``_sec`` (the two parts of ``MIN`` split on ':'),
            ``min_sec`` (``_min + _sec / 60``) and ``GAME_DATE`` (looked up from
            ``GAMES.GAME_DATE_EST`` by ``GAME_ID``; 1900-01-01 when the lookup
            finds nothing). Remaining missing values are replaced with 0.
        """
        conn = sqlite3.connect(db_path)
        try:
            games = pd.read_sql_query("SELECT * FROM GAMES", conn)
            # Bind the limit as a query parameter instead of formatting it
            # into the SQL text; int() also accepts numpy integers.
            df = pd.read_sql_query(
                "SELECT * FROM boxscoretraditionalv2_0 LIMIT ?",
                conn,
                params=(int(limit),),
            )
        finally:
            # Always release the database handle, even if a query fails.
            conn.close()

        game2date = games.set_index('GAME_ID')['GAME_DATE_EST'].to_dict()
        df = df.drop('index', axis=1)

        # MIN looks like "<minutes>:<seconds>"; rows without a value get 0 for
        # both parts.
        df[['_min', '_sec']] = df['MIN'].str.split(':', expand=True).fillna(0)
        df['min_sec'] = df._min.astype(float) + df._sec.astype(int) / 60
        df['GAME_DATE'] = (
            pd.to_datetime(df['GAME_ID'].map(game2date))
            .fillna(pd.to_datetime('1900-01-01'))
        )
        return df.fillna(0)

    def get_anomalous_records(self, date: str, contamination: float = 0.01,
                              max_records: int = 10) -> List[dict]:
        """
        Detect anomalous box-score rows and explain them with SHAP values.

        Args:
            date: cutoff date (example: 2021-11-10). The IsolationForest is
                fitted on rows with ``GAME_DATE <= date`` and anomalies are
                searched among rows with ``GAME_DATE > date``. When ``None``,
                a random 80/20 train/test split is used instead.
            contamination: expected share of outliers, passed to
                IsolationForest.
            max_records: maximum number of anomalies to explain (the first
                ones in test-set order); ``None`` explains all of them.

        Returns:
            One dict per explained anomaly, as produced by ``explain_outlier``
            (feature name -> SHAP value). Empty list when the test split is
            empty or no anomaly is found.

        Raises:
            ValueError: if there are no rows to train on.
        """
        df = self.get_data()

        if date is None:
            df_train, df_test = train_test_split(df, test_size=0.2, random_state=SEED)
        else:
            cutoff = pd.to_datetime(date)
            df_train = df.loc[df['GAME_DATE'] <= cutoff]
            df_test = df.loc[df['GAME_DATE'] > cutoff]

        if df_train.empty:
            raise ValueError(f"No rows to train on for date={date!r}")
        if df_test.empty:
            # Nothing to score, hence no anomalies.
            return []

        scaler = StandardScaler()
        X_train = scaler.fit_transform(df_train[self.calculus_col])
        X_test = scaler.transform(df_test[self.calculus_col])

        # Fit the IsolationForest on the training rows and flag test rows.
        clf = IsolationForest(contamination=contamination, random_state=SEED)
        clf.fit(X_train)
        is_anomaly = cast_to_0_1(clf.predict(X_test)) == 1

        # Explain only the rows that are reported, not the whole test set.
        X_anomalous = X_test[is_anomaly][:max_records]
        if len(X_anomalous) == 0:
            return []

        explainer = shap.TreeExplainer(clf, feature_names=self.calculus_col)
        shap_values = explainer(X_anomalous)

        return [
            explain_outlier(example, columns=self.calculus_col)
            for example in shap_values
        ]
        
        

        

In [139]:

# Show the first 10 test rows flagged as anomalies.
# NOTE(review): df_test and _preds are not assigned at notebook scope in the
# visible cells (they only exist as locals inside get_anomalous_records), so
# this cell presumably relied on leftover kernel state and would raise
# NameError after "Restart Kernel -> Run All" - confirm.
df_test[_preds==1][:10]

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,NICKNAME,START_POSITION,COMMENT,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,GAME_ID_STR,_min,_sec,min_sec,GAME_DATE
2135,12100057,1610612743,DEN,Denver,1630210,Markus Howard,Markus,0,0,30.000000:55,11.0,22.0,0.5,9.0,17.0,0.529,0.0,0.0,0.0,1.0,4.0,5.0,1.0,1.0,0.0,1.0,0.0,31.0,19.0,12100057,30.0,55,30.916667,2021-10-14
2476,12100066,1610612744,GSW,Golden State,201939,Stephen Curry,Stephen,G,0,30.000000:11,13.0,23.0,0.565,7.0,14.0,0.5,8.0,10.0,0.8,0.0,9.0,9.0,2.0,2.0,0.0,4.0,2.0,41.0,22.0,12100066,30.0,11,30.183333,2021-10-15
2601,22100005,1610612738,BOS,Boston,1629057,Robert Williams III,Robert,C,0,44.000000:48,5.0,5.0,1.0,0.0,0.0,0.0,6.0,8.0,0.75,3.0,7.0,10.0,3.0,3.0,5.0,0.0,1.0,16.0,11.0,22100005,44.0,48,44.8,2021-10-20
2602,22100005,1610612738,BOS,Boston,1627759,Jaylen Brown,Jaylen,G,0,45.000000:57,16.0,30.0,0.533,8.0,14.0,0.571,6.0,8.0,0.75,0.0,9.0,9.0,6.0,3.0,1.0,4.0,5.0,46.0,16.0,22100005,45.0,57,45.95,2021-10-20
2615,22100005,1610612752,NYK,New York,203944,Julius Randle,Julius,F,0,45.000000:59,12.0,27.0,0.444,3.0,8.0,0.375,8.0,8.0,1.0,0.0,8.0,8.0,9.0,0.0,3.0,7.0,4.0,35.0,10.0,22100005,45.0,59,45.983333,2021-10-20
2617,22100005,1610612752,NYK,New York,203095,Evan Fournier,Evan,G,0,44.000000:26,13.0,25.0,0.52,6.0,13.0,0.462,0.0,0.0,0.0,0.0,6.0,6.0,3.0,4.0,1.0,1.0,4.0,32.0,11.0,22100005,44.0,26,44.433333,2021-10-20
2827,22100013,1610612758,SAC,Sacramento,203084,Harrison Barnes,Harrison,F,0,36.000000:54,10.0,19.0,0.526,8.0,11.0,0.727,8.0,9.0,0.889,2.0,7.0,9.0,2.0,2.0,0.0,0.0,0.0,36.0,6.0,22100013,36.0,54,36.9,2021-10-20
2845,22100013,1610612757,POR,Portland,203468,CJ McCollum,CJ,G,0,35.000000:12,14.0,24.0,0.583,6.0,11.0,0.545,0.0,0.0,0.0,2.0,4.0,6.0,5.0,3.0,1.0,5.0,4.0,34.0,0.0,22100013,35.0,12,35.2,2021-10-20
2928,22100016,1610612744,GSW,Golden State,201939,Stephen Curry,Stephen,G,0,37.000000:33,16.0,25.0,0.64,8.0,13.0,0.615,5.0,5.0,1.0,0.0,10.0,10.0,1.0,1.0,1.0,6.0,2.0,45.0,-2.0,22100016,37.0,33,37.55,2021-10-21
2994,22100019,1610612754,IND,Indiana,1626167,Myles Turner,Myles,F,0,43.000000:36,15.0,22.0,0.682,5.0,9.0,0.556,5.0,8.0,0.625,5.0,5.0,10.0,0.0,1.0,3.0,1.0,3.0,40.0,3.0,22100019,43.0,36,43.6,2021-10-22


In [132]:
# ------------------- TESTING -------------------------------
# Smoke run: fit on games dated up to 2021-10-10 and explain the anomalies
# found among the games after that date.
anomaly = AnomalyСalculation()
anomaly_records = anomaly.get_anomalous_records('2021-10-10')
# Bare last expression so the notebook renders the list of explanations.
anomaly_records 

[{'FG3M': -1.2603159403805317,
  'FG3A': -1.2392756779069771,
  'FGA': -0.8698809703459265,
  'FGM': -0.7540991175091406,
  'PTS': -0.7273360235039464},
 {'FG3M': -1.0183068931302433,
  'FG3A': -0.8316612084503043,
  'PTS': -0.7513910825956549,
  'FGM': -0.6424492650283425,
  'FGA': -0.560458106046341},
 {'BLK': -1.6297115780368925,
  'FTA': -0.9235368574167693,
  'STL': -0.8826940032221455,
  'FTM': -0.739388425227038,
  'min_sec': -0.56178943550655},
 {'FG3M': -0.8463469092531447,
  'FG3A': -0.7677589689297922,
  'FGA': -0.7186186304830892,
  'PTS': -0.6618821296345767,
  'STL': -0.6589226759616778},
 {'TO': -0.9097560848373123,
  'BLK': -0.7475113106248705,
  'AST': -0.7026448131620812,
  'FGA': -0.652456765614274,
  'FTM': -0.5597121684257704},
 {'STL': -1.2848720879238154,
  'FG3A': -0.984719049954354,
  'FG3M': -0.9264182076567896,
  'FGM': -0.744959241662056,
  'FGA': -0.6909814197671675},
 {'FG3M': -1.1001076146766708,
  'PTS': -0.7795353425348809,
  'FTA': -0.6764689479814165,