In [1]:
import os, gc, sys, time, datetime, math, random,  psutil
import numpy as np,  matplotlib.pyplot as plt, pandas as pd
from   pathlib   import Path        
from   tqdm.auto import tqdm
import pyarrow, pyarrow.parquet as pq 


PATH = Path("/root/autodl-tmp/kaggle/icecube-neutrinos-in-deep-ice/")  # path to dataset
# files_test = [item for item in (PATH  / "test").glob('*')] 
files_test = ["/root/autodl-tmp/kaggle/icecube-neutrinos-in-deep-ice/train/batch_660.parquet"]
print(f"{len(files_test):3d} test files")

#===============================================================================
def info(text, pref="", end="\n"):
    """ Information about the progress of calculations (time and memory) """
    gc.collect()
    ram, t = psutil.virtual_memory().used / 1024**3,  time.time()    
    print(f"{pref}{(t-info.beg)/60:5.1f}m[{t-info.last:+5.1f}s] {ram:6.3f}Gb > {text}",end=end)
    info.last = time.time(); 
info.beg = info.last = time.time()

#-------------------------------------------------------------------------------

def get_sensors():
    """ Get sensor positions """    
    df = pd.read_csv(PATH / "sensor_geometry.csv")      
    df['line_id'] = df.sensor_id // 60 + 1                 # string id
    df['core']    = (df.line_id > 78).astype(np.float32)   # sensor from DeepCore
    df.x = ( df.x * 1e-3 ).astype(np.float32)              # distances in kilometers
    df.y = ( df.y * 1e-3 ).astype(np.float32)
    df.z = ( df.z * 1e-3 ).astype(np.float32)
    return df[['sensor_id', 'line_id', 'core', 'x', 'y', 'z']]

#-------------------------------------------------------------------------------

def prepare_batch(df, drop_aux = True, doms_agg = False, verbose=True):
    """ 
    Preparing a loaded batch, shifting and normalizing times 
        * drop_aux - whether it is necessary to throw out auxiliary pulses (auxiliary==True)
        * doms_agg - whether it is necessary to aggregate pulses on sensors
        * verbose  - debug output
    """    
    df['event_id'] = df.index.astype(np.int64)
    df = df.reset_index(drop=True)  # sensor_id, t, charge, aux, event_id    
    df.rename(columns={"time": "t", "auxiliary": "aux", 'charge': 'q'}, inplace=True)
    df.q = df.q.astype(np.float32)

    if drop_aux:
        df = df[ ~df.aux ]
    
    if doms_agg:
        df = df.groupby(['event_id', 'sensor_id']).agg(
            aux = ( 'aux', "mean"),
            q   = ( 'q',   "sum"),
            t   = ( 't',   "min"),)
        df = df.reset_index()    
        
    if verbose: info(f"prepare_batch: loaded  {df.shape}")
        
    times = df.groupby('event_id').agg( t_min = ('t', 'min') )
    df = df.merge(times, left_on='event_id', right_index=True, how='left')
    df.t = (( df.t - df.t_min ) * 0.299792458e-3 ).astype(np.float32)             
    
    if verbose: info("prepare_batch: shift_times")    

    return df[['event_id', 'sensor_id', 'aux', 'q', 't' ]]

#-------------------------------------------------------------------------------

def agg_features(df):
    """ Calculate some features aggregated for a given event """
    df = df.groupby('event_id').agg(
        lines = ( 'line_id',   'nunique'),
        doms  = ( 'sensor_id', 'nunique'),
        core  = ( 'core',      'mean'),
        pulses= ( 't',         'count'),
        t     = ( 't',         'mean'),
        q     = ( 'q',         'mean'),
    )
    return df
#-------------------------------------------------------------------------------

def angles2vector(df):
    """ Add unit vector components from (azimuth,zenith) to the DataFrame df """
    df['nx'] = np.sin(df.zenith) * np.cos(df.azimuth)
    df['ny'] = np.sin(df.zenith) * np.sin(df.azimuth)
    df['nz'] = np.cos(df.zenith) 
    return df

#-------------------------------------------------------------------------------

def vector2angles(n, eps=1e-8):
    """  Get spherical angles of vector n: (B,3) """                
    n = n / (np.linalg.norm(n, axis=1, keepdims=True) + eps)    
                                
    azimuth = np.arctan2( n[:,1],  n[:,0])    
    azimuth[azimuth < 0] += 2*np.pi
                                
    zenith = np.arccos( n[:,2].clip(-1,1) )                                
    
    return azimuth, zenith


  1 test files


In [2]:
BEST_FIT_VALUE = 5.4265625

def calculate_velocity(event, lm=BEST_FIT_VALUE, eps=1e-8):
    """ Weighted Line-fit method """
    df = event[event["aux"] == 0]
    w = np.exp(-lm * df.t)
    sum_w = np.sum(w)
    sum_xw = np.sum(df.x * w) / sum_w
    sum_yw = np.sum(df.y * w) / sum_w
    sum_zw = np.sum(df.z * w) / sum_w
    sum_tw = np.sum(df.t * w) / sum_w
    sum_xtw = np.sum(df.x * df.t * w) / sum_w
    sum_ytw = np.sum(df.y * df.t * w) / sum_w
    sum_ztw = np.sum(df.z * df.t * w) / sum_w
    sum_ttw = np.sum(df.t * df.t * w) / sum_w

    Dtw = sum_ttw - sum_tw * sum_tw
    ux = (sum_xtw - sum_xw * sum_tw) / (Dtw + eps)
    uy = (sum_ytw - sum_yw * sum_tw) / (Dtw + eps)
    uz = (sum_ztw - sum_zw * sum_tw) / (Dtw + eps)

    u = np.array([-ux, -uy, -uz], dtype=np.float32)
    print(u)
    u_scalar = np.linalg.norm(u)
    un = u / u_scalar

    azimuth = np.arctan2(u[1], u[0])
    if azimuth < 0:
        azimuth += 2 * np.pi
    # azimuth = azimuth / np.pi - 1
    # zenith = np.arccos(un[2]) / (0.5 * np.pi) - 1
    azimuth = azimuth
    zenith = np.arccos(un[2])
    return np.array([[azimuth, zenith, np.log10(u_scalar)]], dtype=np.float32)

In [3]:
# 1. load sensors
sensors_df = get_sensors()    
display(sensors_df.head(2))

# 2. load pulses from batches
pulses_df = None
for filename in files_test:
    df = pd.read_parquet(filename)    
    df = prepare_batch(df, drop_aux = True, doms_agg = False)
    df = df.merge(sensors_df, left_on='sensor_id', right_on='sensor_id', how="left")
    pulses_df = df if pulses_df is None else pulses_df.append(df)
display(pulses_df.head(2))
display(pulses_df.shape)

def line_fit(df, lm, eps = 1e-8):
    """ Weighted Line-fit method """
    df['w']   = np.exp(-lm * df.t)      
    
    df['xw']  = df.x*df.w;       df['yw']  = df.y*df.w;       df['zw']  = df.z*df.w;  
    df['xtw'] = df.x*df.t*df.w;  df['ytw'] = df.y*df.t*df.w;  df['ztw'] = df.z*df.t*df.w; 
    df['ttw'] = df.t*df.t*df.w;  df['tw']  = df.t*df.w; 
    agg = df.groupby(["event_id"]).agg(
        xw  = ('xw', 'sum'),  yw  = ('yw', 'sum'),  zw  = ('zw', 'sum'),  tw = ('tw', 'sum'),
        xtw = ('xtw','sum'),  ytw = ('ytw','sum'),  ztw = ('ztw','sum'),  ttw = ('ttw','sum'),
        w   = ('w',  'sum')
    )      
    agg.xw  /= agg.w;  agg.yw  /= agg.w; agg.zw  /= agg.w;  agg.tw  /= agg.w
    agg.xtw /= agg.w;  agg.ytw /= agg.w; agg.ztw /= agg.w;  agg.ttw /= agg.w
    
    agg['Dtw'] = agg.ttw - agg.tw*agg.tw
    agg['ux'] = ( agg.xtw - agg.xw*agg.tw ) / ( agg.Dtw + eps);  agg['qx'] = agg.xw - agg.ux * agg.tw
    agg['uy'] = ( agg.ytw - agg.yw*agg.tw ) / ( agg.Dtw + eps);  agg['qy'] = agg.yw - agg.uy * agg.tw
    agg['uz'] = ( agg.ztw - agg.zw*agg.tw ) / ( agg.Dtw + eps);  agg['qz'] = agg.zw - agg.uz * agg.tw
    return agg[['ux', 'uy', 'uz', 'qx', 'qy', 'qz' ]]

pred_df = line_fit(pulses_df, lm=5.4265625)
u = pred_df[['ux','uy','uz']].to_numpy()

df_to_submit = pd.DataFrame()
df_to_submit['event_id'] = pulses_df['event_id'].unique()
df_to_submit['azimuth'], df_to_submit['zenith'] = vector2angles(-u)
df_to_submit = df_to_submit.set_index('event_id')
df_to_submit.to_csv('submission.csv')


Unnamed: 0,sensor_id,line_id,core,x,y,z
0,0,1,0.0,-0.25614,-0.52108,0.49603
1,1,1,0.0,-0.25614,-0.52108,0.47901


  0.2m[+14.9s] 58.968Gb > prepare_batch: loaded  (17247276, 5)
  0.4m[ +6.7s] 67.349Gb > prepare_batch: shift_times


Unnamed: 0,event_id,sensor_id,aux,q,t,line_id,core,x,y,z
0,2144984130,3363,False,1.275,0.0,57,0.0,0.25731,0.21166,0.45336
1,2144984130,3364,False,1.625,0.005696,57,0.0,0.25731,0.21166,0.43634


(17247276, 10)

In [26]:
print(df_to_submit[:5])

           azimuth    zenith
event_id                    
24        1.684347  2.594189
41        1.577276  0.817996
59        4.712389  3.141247
67        3.824628  0.645509
72        6.157014  0.350222


In [36]:
id = 24
df = pulses_df[pulses_df["event_id"] == id]
calculate_velocity(df)

[-0.03906769  0.34257433 -0.56566554]


array([[ 1.6843473 ,  2.5941892 , -0.17883652]], dtype=float32)

In [35]:
pred_df = line_fit(df, lm=BEST_FIT_VALUE)
u = pred_df[['ux','uy','uz']].to_numpy()
print(vector2angles(-u))
print(u)


(array([4.712389], dtype=float32), array([3.1412473], dtype=float32))
[[0.000000e+00 3.402496e-07 1.193793e-01]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['w']   = np.exp(-lm * df.t)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['xw']  = df.x*df.w;       df['yw']  = df.y*df.w;       df['zw']  = df.z*df.w;
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['xtw'] = df.x*df.t*df.w;  df['ytw'] = df.y*df.t*df.w;  df['ztw'] = df.z*df.t*df.w;
A value