In [1]:
import os

import numpy as np
import pandas as pd
import pyspark.sql.functions as F
import regex as re

from IPython.display import display
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from typing import *

In [2]:
# Start the Spark session for this notebook (getOrCreate reuses a running one).
spark = (
    SparkSession.builder
    .appName('group2nba')
    .getOrCreate()
)

In [15]:
# Root folder holding both the raw CSVs and the cleaned output folders.
path_main = '/project/ds5559/group2nba'

# Which split this run processes: 'train', 'valid' or 'test'.
data_type = 'test'

# Raw file name (or glob pattern) to load for each split.
RAW_DATA_FILES = {
    'train': 'NBA_PBP_201[0-8]-1[0-9]',
    'valid': 'NBA_PBP_2019-20',
    'test': 'NBA_PBP_2020-21',
}

# Derived from data_type so the raw input and the split label cannot disagree.
path_data = RAW_DATA_FILES[data_type.lower()]

# Generic placeholder used in type annotations.
T = TypeVar('T')

In [4]:
class DataField():
    '''Bundles what is needed to load and clean one column.

    DataType: a type class that is called with no arguments when the
        read schema is built.
    CleanFunction: a UDF that is called with the column name to produce
        the cleaned column.
    '''

    __slots__ = ['DataType', 'CleanFunction']

    def __init__(self, DataType: T, CleanFunction: F.udf):
        self.DataType, self.CleanFunction = DataType, CleanFunction

In [5]:
@F.udf(StringType())
def clean_date(val: str) -> str:
    '''Pass-through cleaner for date values.

    The value is returned exactly as received; nothing is parsed or
    validated.
    '''
    return val
    
    
@F.udf(StringType())
def clean_gametype(val: str) -> str:
    '''Cleans: GameType.

    Lower-cases the value and keeps it only when it is exactly
    'regular' or 'playoff'.

    Returns:
        The lower-cased game type, or None when the value is missing,
        not a string, or not one of the two accepted labels.
    '''
    if not isinstance(val, str):
        return None

    val = val.lower()
    # Membership in a tuple, not in the string 'regular,playoff': a substring
    # test would also accept '', 'play', 'reg', ',' and similar fragments.
    return val if val in ('regular', 'playoff') else None
    
    
@F.udf(IntegerType())
def clean_int(val: int) -> int:
    '''Pass-through cleaner for integer values.

    The value is returned exactly as received; nothing is validated.
    '''
    return val
    
    
@F.udf(IntegerType())
def clean_reboundtype(val: str) -> int:
    '''Cleans: ReboundType, converts to bit.

    Returns:
        1 when the value is 'offensive' (case-insensitive), 0 for any
        other string, and None when the value is missing or not a string.
    '''
    if not isinstance(val, str):
        return None

    # The method name was previously misspelled, which raised on every call
    # and turned the whole column into None.
    return int(val.lower() == 'offensive')
    

@F.udf(StringType())
def clean_shottype(val: str) -> str:
    '''Cleans: ShotType.

    Hands the value and a shot-type pattern to handle_pattern and returns
    whatever that helper returns; any failure yields None.
    '''
    # Pattern text: '2-pt' or '3-pt' followed by one or more lower-case words.
    shot_pattern = r'[23][-]pt (?:[a-z]+\s?)+'

    # NOTE(review): handle_pattern is not defined in the visible part of this
    # notebook. If it is missing at run time, the bare except below hides the
    # NameError and every value becomes None. Confirm where it is defined.
    try:
        cleaned = handle_pattern(shot_pattern, val)
    except:
        cleaned = None
    return cleaned
    

@F.udf(IntegerType())
def clean_shotoutcome(val: str) -> int:
    '''Cleans: shot attempts, converts to bit.

    Gives 1 when the lower-cased value equals 'make' and 0 otherwise;
    any error while doing so (e.g. a missing value) gives None.
    '''
    try:
        outcome = int(val.lower() == 'make')
    except:
        outcome = None
    return outcome
    

@F.udf(StringType())
def clean_str(val: str) -> str:
    '''Pass-through cleaner for generic strings.

    The value is returned exactly as received; nothing is validated.
    '''
    return val
    
    
@F.udf(StringType())
def clean_team(val: str) -> str:
    '''Cleans: team fields.

    Keeps the value only when it is exactly three upper-case letters
    (A-Z).

    Returns:
        The unchanged value when it is a three-letter code, otherwise
        None (including for missing or non-string values).
    '''
    if not isinstance(val, str):
        return None

    # fullmatch rather than match: match only anchors at the start, so any
    # longer string that merely begins with three capitals used to pass.
    return val if re.fullmatch(r'[A-Z]{3}', val) else None

In [6]:
# Output folder name for each data split.
DATA_PATHES = dict(
    train='clean_train_data',
    valid='clean_valid_data',
    test='clean_test_data',
)

In [7]:
# Column name -> DataField(type used when reading the raw CSV, cleaning UDF).
# Insertion order matters: the read schema is built by iterating this dict.
# ShotOutcome, FreeThrowOutcome and ReboundType are read as strings; their
# cleaning UDFs are declared with an integer return type.
FIELDS = {
    # Game information
    'Url': DataField(StringType, clean_str),
    'GameType': DataField(StringType, clean_gametype),
    'Location': DataField(StringType, clean_str),
    'Date': DataField(StringType, clean_date),
    'Time': DataField(StringType, clean_str),
    'WinningTeam': DataField(StringType, clean_team),

    # Clock, teams, play text and score
    'Quarter': DataField(IntegerType, clean_int),
    'SecLeft': DataField(IntegerType, clean_int),
    'AwayTeam': DataField(StringType, clean_team),
    'AwayPlay': DataField(StringType, clean_str),
    'AwayScore': DataField(IntegerType, clean_int),
    'HomeTeam': DataField(StringType, clean_team),
    'HomePlay': DataField(StringType, clean_str),
    'HomeScore': DataField(IntegerType, clean_int),

    # Shots
    'Shooter': DataField(StringType, clean_str),
    'ShotType': DataField(StringType, clean_shottype),
    'ShotOutcome': DataField(StringType, clean_shotoutcome),
    'ShotDist': DataField(IntegerType, clean_int),
    'Assister': DataField(StringType, clean_str),
    'Blocker': DataField(StringType, clean_str),

    # Fouls
    'FoulType': DataField(StringType, clean_str),
    'Fouler': DataField(StringType, clean_str),
    'Fouled': DataField(StringType, clean_str),

    # Rebounds
    'Rebounder': DataField(StringType, clean_str),
    'ReboundType': DataField(StringType, clean_reboundtype),

    # Violations and timeouts
    'ViolationPlayer': DataField(StringType, clean_str),
    'ViolationType': DataField(StringType, clean_str),
    'TimeoutTeam': DataField(StringType, clean_team),

    # Free throws
    'FreeThrowShooter': DataField(StringType, clean_str),
    'FreeThrowOutcome': DataField(StringType, clean_shotoutcome),
    'FreeThrowNum': DataField(StringType, clean_str),

    # Substitutions
    'EnterGame': DataField(StringType, clean_str),
    'LeaveGame': DataField(StringType, clean_str),

    # Turnovers
    'TurnoverPlayer': DataField(StringType, clean_str),
    'TurnoverType': DataField(StringType, clean_str),
    'TurnoverCause': DataField(StringType, clean_str),
    'TurnoverCauser': DataField(StringType, clean_str),

    # Jump balls
    'JumpballAwayPlayer': DataField(StringType, clean_str),
    'JumpballHomePlayer': DataField(StringType, clean_str),
    'JumpballPoss': DataField(StringType, clean_str),
}

In [16]:
# Build the read schema from FIELDS; each DataType class is instantiated here,
# and the dict's iteration order becomes the column order.
schema = StructType([StructField(k, v.DataType()) for k, v in FIELDS.items()])

# Load the raw CSV(s) for the selected split using the explicit schema.
df = spark.read \
    .format('csv') \
    .option('header', True) \
    .schema(schema) \
    .load(f'{path_main}/raw_data/{path_data}.csv')

display(df.count())
# printSchema() prints the schema itself and returns None, so wrapping it in
# display() only added a stray "None" to the cell output.
df.printSchema()
display(df.head(2))

97673

root
 |-- Url: string (nullable = true)
 |-- GameType: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- WinningTeam: string (nullable = true)
 |-- Quarter: integer (nullable = true)
 |-- SecLeft: integer (nullable = true)
 |-- AwayTeam: string (nullable = true)
 |-- AwayPlay: string (nullable = true)
 |-- AwayScore: integer (nullable = true)
 |-- HomeTeam: string (nullable = true)
 |-- HomePlay: string (nullable = true)
 |-- HomeScore: integer (nullable = true)
 |-- Shooter: string (nullable = true)
 |-- ShotType: string (nullable = true)
 |-- ShotOutcome: string (nullable = true)
 |-- ShotDist: integer (nullable = true)
 |-- Assister: string (nullable = true)
 |-- Blocker: string (nullable = true)
 |-- FoulType: string (nullable = true)
 |-- Fouler: string (nullable = true)
 |-- Fouled: string (nullable = true)
 |-- Rebounder: string (nullable = true)
 |-- ReboundType: string (nullable = true)

None

[Row(Url='/boxscores/202012220BRK.html', GameType='regular', Location='Barclays Center Brooklyn New York', Date='December 22 2020', Time='7:00 PM', WinningTeam='BRK', Quarter=1, SecLeft=720, AwayTeam='GSW', AwayPlay='Jump ball: J. Wiseman vs. D. Jordan (J. Harris gains possession)', AwayScore=0, HomeTeam='BRK', HomePlay=None, HomeScore=0, Shooter=None, ShotType=None, ShotOutcome=None, ShotDist=None, Assister=None, Blocker=None, FoulType=None, Fouler=None, Fouled=None, Rebounder=None, ReboundType=None, ViolationPlayer=None, ViolationType=None, TimeoutTeam=None, FreeThrowShooter=None, FreeThrowOutcome=None, FreeThrowNum=None, EnterGame=None, LeaveGame=None, TurnoverPlayer=None, TurnoverType=None, TurnoverCause=None, TurnoverCauser=None, JumpballAwayPlayer='J. Wiseman - wisemja01', JumpballHomePlayer='D. Jordan - jordade01', JumpballPoss='J. Harris - harrijo01'),
 Row(Url='/boxscores/202012220BRK.html', GameType='regular', Location='Barclays Center Brooklyn New York', Date='December 22 20

In [17]:
# Apply each column's cleaning UDF in one projection instead of one
# withColumn() call per field: repeated withColumn() calls each add a
# projection and can produce very large query plans.
# Column order is kept, and any column without a FIELDS entry passes through.
df = df.select([
    FIELDS[name].CleanFunction(name).alias(name) if name in FIELDS else F.col(name)
    for name in df.columns
])

display(df.count())
# printSchema() prints the schema itself and returns None, so wrapping it in
# display() only added a stray "None" to the cell output.
df.printSchema()
display(df.head(2))

97673

root
 |-- Url: string (nullable = true)
 |-- GameType: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- WinningTeam: string (nullable = true)
 |-- Quarter: integer (nullable = true)
 |-- SecLeft: integer (nullable = true)
 |-- AwayTeam: string (nullable = true)
 |-- AwayPlay: string (nullable = true)
 |-- AwayScore: integer (nullable = true)
 |-- HomeTeam: string (nullable = true)
 |-- HomePlay: string (nullable = true)
 |-- HomeScore: integer (nullable = true)
 |-- Shooter: string (nullable = true)
 |-- ShotType: string (nullable = true)
 |-- ShotOutcome: integer (nullable = true)
 |-- ShotDist: integer (nullable = true)
 |-- Assister: string (nullable = true)
 |-- Blocker: string (nullable = true)
 |-- FoulType: string (nullable = true)
 |-- Fouler: string (nullable = true)
 |-- Fouled: string (nullable = true)
 |-- Rebounder: string (nullable = true)
 |-- ReboundType: integer (nullable = tru

None

[Row(Url='/boxscores/202012220BRK.html', GameType='regular', Location='Barclays Center Brooklyn New York', Date='December 22 2020', Time='7:00 PM', WinningTeam='BRK', Quarter=1, SecLeft=720, AwayTeam='GSW', AwayPlay='Jump ball: J. Wiseman vs. D. Jordan (J. Harris gains possession)', AwayScore=0, HomeTeam='BRK', HomePlay=None, HomeScore=0, Shooter=None, ShotType=None, ShotOutcome=None, ShotDist=None, Assister=None, Blocker=None, FoulType=None, Fouler=None, Fouled=None, Rebounder=None, ReboundType=None, ViolationPlayer=None, ViolationType=None, TimeoutTeam=None, FreeThrowShooter=None, FreeThrowOutcome=None, FreeThrowNum=None, EnterGame=None, LeaveGame=None, TurnoverPlayer=None, TurnoverType=None, TurnoverCause=None, TurnoverCauser=None, JumpballAwayPlayer='J. Wiseman - wisemja01', JumpballHomePlayer='D. Jordan - jordade01', JumpballPoss='J. Harris - harrijo01'),
 Row(Url='/boxscores/202012220BRK.html', GameType='regular', Location='Barclays Center Brooklyn New York', Date='December 22 20

In [18]:
# Output folder for the selected split (lookup is case-insensitive on data_type).
path_write = f'{path_main}/{DATA_PATHES[data_type.lower()]}'

# On the first run for a split the folder does not exist yet; create it so the
# listdir() call below does not raise FileNotFoundError.
os.makedirs(path_write, exist_ok=True)

# The number of entries already in the folder becomes the next output index.
df.write.csv(f'{path_write}/clean_{len(os.listdir(path_write))}.csv')