In [1]:
##########################
#### standard library ####
##########################
import os
import sys
import time
import warnings
import random
from typing import List, Tuple, Dict, Any, Union, Optional, Callable
import shutil
# warnings.filterwarnings("ignore")

###################
#### 3rd party ####
###################
import torch
import torchaudio
import torchvision
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.optim import AdamW
from torch.utils.data.sampler import WeightedRandomSampler
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.cuda.amp import GradScaler, autocast #amp = automatic mixed precision
import lightning as L
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt

######################
#### my own files ####
######################
sys.path.append(os.path.dirname(os.getcwd()))
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))
from utils.utils import print_name, print_shape
from rocket import Rocket, RocketFeatures
from ridge_loocv import fit_ridge_LOOCV

np.set_printoptions(precision=3, threshold=5) # Print options

# Config

In [2]:
class CFG:
    data_dir = "/home/nikita/Code/zephyrox/Data/Ford/"
    logs_dir = "/home/nikita/Code/zephyrox/Data/Ford/logs/"
    
    # Device and random seed
    device = 'cuda' # if torch.cuda.is_available() else 'cpu'
    seed = 42
    
    # Number of epochs, number of folds
    batch_size = 32
    epochs = 20
    n_folds = 5

    # Learning rate, optimizer, and cosine scheduler
    lr = 1e-1
    lr_min = 1e-3
    weight_decay = 1e-6
    gradient_clip_val = 1000.0
    optimizer = torch.optim.AdamW # AdamW, Adam

## 🌱 Seed Everything

In [3]:
def set_seed(seed: int):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
set_seed(CFG.seed)

## Ford Data

In [7]:
# def read_Ford():
#     df = 

df = pd.read_csv(CFG.data_dir + "fordTrain.csv")
data = df.to_numpy()

# df["path"] = CFG.data_dir + "/train_audio/" + df["filename"]
# df["rating"] = np.clip(df["rating"] / df["rating"].max(), 0.1, 1.0)

# skf = StratifiedKFold(n_splits=CFG.n_folds, random_state=CFG.seed, shuffle=True)
# df['fold'] = -1
# for ifold, (train_idx, val_idx) in enumerate(skf.split(X=df, y=df["primary_label"].values)):
#     df.loc[val_idx, 'fold'] = ifold

# sub = pd.read_csv(f"{CFG.data_dir}/sample_submission.csv")
# target_columns = sub.columns.tolist()[1:]
# num_classes = len(target_columns)
# bird2id = {b: i for i, b in enumerate(target_columns)}

In [13]:
df

Unnamed: 0,TrialID,ObsNum,IsAlert,P1,P2,P3,P4,P5,P6,P7,...,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11
0,0,0,0,34.7406,9.84593,1400,42.8571,0.290601,572,104.8950,...,0.175,752,5.99375,0,2005,0,13.4,0,4,14.8004
1,0,1,0,34.4215,13.41120,1400,42.8571,0.290601,572,104.8950,...,0.455,752,5.99375,0,2007,0,13.4,0,4,14.7729
2,0,2,0,34.3447,15.18520,1400,42.8571,0.290601,576,104.1670,...,0.280,752,5.99375,0,2011,0,13.4,0,4,14.7736
3,0,3,0,34.3421,8.84696,1400,42.8571,0.290601,576,104.1670,...,0.070,752,5.99375,0,2015,0,13.4,0,4,14.7667
4,0,4,0,34.3322,14.69940,1400,42.8571,0.290601,576,104.1670,...,0.175,752,5.99375,0,2017,0,13.4,0,4,14.7757
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
604324,510,1194,1,32.0051,10.13240,800,75.0000,0.081731,680,88.2353,...,0.280,240,3.01875,0,1801,0,30.4,0,4,15.8113
604325,510,1195,1,32.0393,12.45040,800,75.0000,0.081731,680,88.2353,...,0.280,240,3.01875,0,1801,0,30.4,0,4,15.8018
604326,510,1196,1,32.0762,10.06180,800,75.0000,0.081731,680,88.2353,...,0.175,240,3.01875,0,1800,0,31.3,0,4,15.8120
604327,510,1197,1,32.1154,17.84500,800,75.0000,0.081731,680,88.2353,...,0.175,240,3.01875,0,1800,0,31.3,0,4,15.8270


In [33]:
trialIDs = df['TrialID'].unique()
for trialID in trialIDs:
    trial = df[df['TrialID'] == trialID]
    data = trial.iloc[:, 2:].to_numpy()
    label, data = data[:, 0], data[:, 1:]
    print(np.unique(label))

[0. 1.]
[0. 1.]
[0. 1.]
[1.]
[1.]
[1.]
[1.]
[0. 1.]
[0.]
[0.]
[0.]
[0. 1.]
[0.]
[0.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0.]
[0.]
[0.]
[0.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0.]
[0.]
[0. 1.]
[0.]
[0.]
[0.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0. 1.]
[0.]
[0.]
[0.]
[0.]
[0. 1.]
[0. 1.]
[0. 1.]
[0.]
[0.]
[0.]
[0. 1.]
[0.]
[0.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0

In [None]:
#facedetection .... website is up again.

In [23]:
trial_counts = df['TrialID'].value_counts()
print(trial_counts)
print(trial_counts[trial_counts>1200])

TrialID
0      1211
329    1211
326    1211
325    1211
324    1211
       ... 
398    1191
379    1191
373    1187
494    1175
358    1166
Name: count, Length: 500, dtype: int64
TrialID
0      1211
329    1211
326    1211
325    1211
324    1211
       ... 
157    1201
202    1201
55     1201
443    1201
271    1201
Name: count, Length: 426, dtype: int64


In [8]:
data.shape

(604329, 33)

In [10]:
from sklearn.preprocessing import StandardScaler

def load_ford_data(file_path, norm=True, verbose=1):
    if verbose > 0:
        print("[Data_Loader] Loading data from {}".format(file_path))
    df = pd.read_csv(file_path)
    all_series = df.series.unique()
    data = []

    for series in all_series:
        if verbose > 0:
            print("[Data_Loader] Processing series {}".format(series))
        this_series = df.loc[df.series == series].reset_index(drop=True)
        series_labels = np.array(this_series.label)
        series_data = np.array(this_series.iloc[:, 3:])
        if norm:
            scaler = StandardScaler()
            series_data = scaler.fit_transform(series_data)
        data.append(pd.DataFrame({"data": [series_data],
                                  "label": [series_labels]}, index=[0]))
    data = pd.concat(data)
    data.reset_index(drop=True, inplace=True)

    return data

In [18]:
data = load_ford_data(CFG.data_dir + "fordTrain.csv")

[Data_Loader] Loading data from /home/nikita/Code/zephyrox/Data/Ford/fordTrain.csv


AttributeError: 'DataFrame' object has no attribute 'series'