# Data analysis 

- Duration of samples and differences
- Plot the mean square energy over time
- Zero-Crossing Rate (ZCR): the rate at which the signal changes sign (positive to negative or negative to positive)
- Silence Ratio: the fraction of the audio's total duration that is silence (pauses)

In [5]:
import sys
import os
from itables import show
from Backend.app import app

# Directory of the backend app, resolved relative to the notebook's working directory.
root = os.path.abspath('../Backend/app')

# Insert only when missing, so re-running this cell does not stack duplicate
# entries at the front of sys.path.
# NOTE(review): `from Backend.app import app` runs before this path is added,
# so this tweak does not affect that import — confirm it is still needed.
if root not in sys.path:
    sys.path.insert(0, root)

# Render the table returned by the backend interactively, then run the backend's main routine.
t = app.get_i_table()
show(t)
app.main()


0
Loading ITables v2.7.0 from the init_notebook_mode cell...  (need help?)


Data Directory Contents:
------------------------
ASVspoof2019_LA_asv_protocols
ASVspoof2019_LA_asv_scores
ASVspoof2019_LA_cm_protocols
ASVspoof2019_LA_dev
ASVspoof2019_LA_eval
ASVspoof2019_LA_train
README.LA.txt
------------------------
Data Files:
------------------------
LA_D_1000265.flac
LA_D_1000752.flac
LA_D_1001095.flac
LA_D_1002130.flac
LA_D_1002200.flac
LA_D_1002318.flac
LA_D_1002626.flac
LA_D_1002910.flac
LA_D_1003356.flac
LA_D_1003673.flac
LA_D_1003797.flac
LA_D_1004357.flac
LA_D_1004406.flac
LA_D_1004678.flac
LA_D_1004774.flac
LA_D_1005471.flac
LA_D_1005592.flac
LA_D_1005971.flac
LA_D_1006568.flac
LA_D_1006586.flac
LA_D_1006756.flac
LA_D_1006800.flac
LA_D_1007033.flac
LA_D_1007830.flac
LA_D_1007975.flac
LA_D_1008673.flac
LA_D_1008730.flac
LA_D_1008834.flac
LA_D_1009810.flac
LA_D_1010123.flac
LA_D_1010205.flac
LA_D_1010295.flac
LA_D_1011190.flac
LA_D_1011897.flac
LA_D_1012014.flac
LA_D_1012227.flac
LA_D_1012999.flac
LA_D_1013553.flac
LA_D_1013596.flac
LA_D_1013783.flac
LA_D_

In [2]:
# Data analysis - sample duration

import numpy as np 
import pandas as pd
import soundfile as sf
import os
import librosa
from pathlib import Path
from itables import init_notebook_mode, show


# Dataset locations.
# NOTE(review): this is an absolute, machine-specific path rooted at the drive
# root; consider moving it to a configurable DATA_DIR.
project_path = os.path.abspath(os.sep) + 'Users/Luis/Desktop/LA/LA/'
train_data_path = project_path + 'ASVspoof2019_LA_train/flac'
train_protocol_path = project_path + 'ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.train.trn.txt'

print(train_protocol_path)


# The protocol file is space-separated with no header row; one row per utterance.
# NOTE(review): per the ASVspoof2019 README the third field is unused for LA and
# the fourth holds the spoofing-system ID — confirm the column names chosen here.
train_df = pd.read_csv(train_protocol_path, sep = ' ', header = None, names = ['speaker_id', 'file_name', 'system_id', 'attack_type','key'])
print(f'Found {len(train_df)} files in protocol.')

results = []
# The protocol's key column uses the labels 'bonafide' and 'spoof'. Filtering
# on 'spoofed' matches nothing and silently yields an empty spoof list.
train_bon_samples = train_df.loc[train_df['key'] == 'bonafide', 'file_name'].tolist()
train_spoofed_samples = train_df.loc[train_df['key'] == 'spoof', 'file_name'].tolist()
train_samples = train_bon_samples + train_spoofed_samples

# Index the protocol by file name once instead of scanning the whole frame
# twice per file inside the loop. Keeping the first row per name matches the
# previous `.values[0]` lookup.
protocol_by_file = train_df.drop_duplicates('file_name').set_index('file_name')

# Only the first ten names are analysed. Bona fide names come first in
# train_samples, so spoof files are reached only if fewer than ten bona fide
# entries exist.
for filename in train_samples[:10]:
    try:
        file_path = os.path.join(train_data_path, filename + '.flac')
        y, sr = sf.read(file_path, dtype = 'float32')

        # Collapse multi-channel audio to mono by averaging the channels.
        if y.ndim > 1:
            y = y.mean(axis=1)

        # Bring everything to 16 kHz so durations and statistics are comparable.
        if sr != 16000:
            y = librosa.resample(y, orig_sr = sr, target_sr = 16000)
            sr = 16000

        protocol_row = protocol_by_file.loc[filename]
        key = protocol_row['key']
        attack = protocol_row['attack_type']

        results.append({
            'filename': filename,
            'label': key,
            'attack': attack,
            'duration': len(y)/sr,  # seconds = samples / sample rate
            'mean': y.mean(),
            'std': y.std(),
            'max': y.max(),
            'min': y.min()
        })

    except Exception as e:
        # Best effort: report the failing file and continue with the rest.
        print(f"{filename}: {e}")

if results: 
    results_df = pd.DataFrame(results)
    print('\nSummary')
    print(results_df)

C:\Users/Luis/Desktop/LA/LA/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.train.trn.txt
Found 25380 files in protocol.

Summary
       filename     label attack  duration          mean       std       max  \
0  LA_T_1138215  bonafide      -  3.458063  2.995002e-07  0.096481  0.974030   
1  LA_T_1271820  bonafide      -  4.395187 -7.316615e-07  0.096049  0.903534   
2  LA_T_1272637  bonafide      -  2.899500 -5.499374e-07  0.075203  0.817200   
3  LA_T_1276960  bonafide      -  2.812562  4.007886e-07  0.104142  0.999969   
4  LA_T_1341447  bonafide      -  3.533187 -1.041347e-06  0.112211  0.999969   
5  LA_T_1363611  bonafide      -  2.257250 -7.326044e-07  0.109288  0.999969   
6  LA_T_1596451  bonafide      -  2.125563 -5.716045e-07  0.079535  0.575104   
7  LA_T_1608170  bonafide      -  2.003437  4.274651e-07  0.072892  0.899963   
8  LA_T_1684951  bonafide      -  3.791688 -8.833281e-07  0.102231  0.970306   
9  LA_T_1699801  bonafide      -  3.240875  5.614566e-07  0.084541  0.

In [3]:
import soundfile as sf
# Smoke test: confirm that soundfile can decode a training file.
# The path is built explicitly from the first protocol entry instead of reusing
# whatever value `file_path` was left holding by a loop in an earlier cell.
file_path = os.path.join(train_data_path, train_samples[0] + '.flac')
data, sr = sf.read(file_path)
print("Loaded with soundfile!")

Loaded with soundfile!


In [4]:
# Read every .flac file in the training folder into memory as raw bytes,
# collecting (bytes, file name) pairs.
list_of_samples = []
# os.listdir returns entries in arbitrary order; sorting makes the order of
# list_of_samples reproducible across runs and machines.
for file in sorted(os.listdir(train_data_path)):

    if file.endswith('.flac') and os.path.isfile(os.path.join(train_data_path,file)):
        file_path = os.path.join(train_data_path, file)

        # Binary mode: the contents are kept as undecoded FLAC bytes.
        with open(file_path, 'rb') as f:
            data = f.read()
        list_of_samples.append((data, file))
    else:
        print(f'Skipping {file}, is not a .flac file.')


# Relative folder names for the dev and eval splits and for saved output.
# NOTE(review): these are relative paths, unlike the absolute train paths used
# earlier — presumably resolved against the working directory; confirm.
dev_data_folder = 'Desktop/LA/LA/ASVspoof2019_LA_dev'
eval_data_folder = 'Desktop/LA/LA/ASVspoof2019_LA_eval'
save_folder = 'Griffith/Vishing_project/Notebook'
# NOTE(review): rebinding list_of_samples to an empty list discards the
# (bytes, name) pairs collected by the loop above — confirm this is intended.
list_of_samples = []