# 興味
- どんなデータの形で入れるのが良いのか検討する
  - 全trainデータ
  - 全trainデータ、プレート・ねじ別

In [None]:
# ライブラリのimport
from dataclasses import dataclass
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import plotly.express as px

import librosa
import librosa.display
import IPython

from sklearn.preprocessing import minmax_scale, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from sklearn.metrics import roc_auc_score
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest

import umap


In [None]:
@dataclass(frozen=True)
class DataPath:
    """Immutable bundle of the dataset locations used by this notebook.

    Every path other than ``input_dir`` is derived from ``input_dir`` unless
    it is passed explicitly. The derivation happens per instance, so
    ``DataPath(input_dir='/elsewhere')`` relocates all four dependent paths
    instead of leaving them pointing at the default directory.

    Args:
        input_dir: Root directory of the competition data.
        train_wav_dir: Directory of training wav files
            (default ``{input_dir}/train/train``).
        test_wav_dir: Directory of test wav files
            (default ``{input_dir}/test/test``).
        train_csv: Training metadata CSV (default ``{input_dir}/train.csv``).
        test_csv: Test metadata CSV (default ``{input_dir}/test.csv``).
    """

    input_dir: str = '../input/hah-data-science-challenge'
    # ``None`` means "derive from input_dir in __post_init__".
    train_wav_dir: str = None
    test_wav_dir: str = None
    train_csv: str = None
    test_csv: str = None

    def __post_init__(self):
        """Fill in every path that was not given explicitly."""
        derived = {
            'train_wav_dir': f'{self.input_dir}/train/train',
            'test_wav_dir': f'{self.input_dir}/test/test',
            'train_csv': f'{self.input_dir}/train.csv',
            'test_csv': f'{self.input_dir}/test.csv',
        }
        for name, value in derived.items():
            if getattr(self, name) is None:
                # The dataclass is frozen, so normal attribute assignment is
                # blocked; object.__setattr__ is the supported escape hatch
                # inside __post_init__.
                object.__setattr__(self, name, value)


data_path = DataPath()

In [None]:
# Read the train / test metadata tables.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (data_path.train_csv, data_path.test_csv)
)

# Add a 'file_path' column: the wav directory joined with each row's
# file name taken from the 'ファイル' column.
df_train['file_path'] = f'{data_path.train_wav_dir}/' + df_train['ファイル']
df_test['file_path'] = f'{data_path.test_wav_dir}/' + df_test['ファイル']

In [None]:
# Plain Python list of training wav paths (the helper below requires a list).
file_names = df_train['file_path'].to_list()

In [None]:
from typing import List

# Helper class for computing Fourier-transform spectra of audio files.
class FourieTransformHelper:
    """Load audio files and compute their one-sided FFT magnitude spectra.

    Args:
        file_names: Audio file names (or full paths when ``input_dir`` is
            not given). Must be a ``list``.
        input_dir: Optional directory prepended to every entry of
            ``file_names``.
        n_samples: Number of leading samples analysed per file. Longer
            clips are truncated and shorter clips are zero-padded, so every
            spectrum has the same length and the results can be stacked
            into a 2-D array.

    Attributes:
        spectra: One magnitude spectrum per file, filled by ``get_spectra``.
        frequencies: The matching frequency axis per file.

    Raises:
        TypeError: If ``file_names`` is not a ``list``.
    """

    def __init__(self,
                 file_names: List[str],
                 input_dir: str = None,
                 n_samples: int = 40000):
        if not isinstance(file_names, list):
            raise TypeError(f'file_names type is {type(file_names)} not list!')
        self.file_names = file_names
        self.input_dir = input_dir
        self.n_samples = n_samples
        self.spectra, self.frequencies = [], []

    def _get_spectrum(self, signal, sample_rate):
        """Return ``(spectrum, frequency)`` for the first half of the FFT.

        ``spectrum`` holds the magnitudes of the first ``len(signal) // 2``
        FFT bins and ``frequency`` the frequency of each of those bins.
        """
        # Reference: https://medium.com/analytics-vidhya/simplifying-audio-data-fft-stft-mfcc-for-machine-learning-and-deep-learning-443a2f962e0e
        n_points = len(signal)
        half = n_points // 2
        spectrum = np.abs(np.fft.fft(signal))[:half]
        # FFT bin k corresponds to k * sample_rate / n_points, so the kept
        # half spans 0 up to (but excluding) sample_rate / 2. Spreading the
        # bins over 0..sample_rate would roughly double every frequency.
        frequency = np.arange(half) * (sample_rate / n_points)
        return spectrum, frequency

    def _get_filename(self, file_name):
        """Return ``file_name`` prefixed with ``input_dir`` when one is set."""
        if self.input_dir:
            return f'{self.input_dir}/{file_name}'
        return file_name

    def get_spectra(self):
        """Compute the spectrum of every file into ``spectra``/``frequencies``.

        The result lists are rebuilt on every call, so calling this method
        again (e.g. re-running a notebook cell) does not duplicate entries.
        """
        spectra, frequencies = [], []
        for file_name in tqdm(self.file_names):
            file_path = self._get_filename(file_name)
            audio, sample_rate = librosa.load(file_path)
            # Cut to a fixed length (the default is simply a round number).
            audio = audio[:self.n_samples]
            if len(audio) < self.n_samples:
                # Zero-pad short clips so all spectra share one length.
                audio = np.pad(audio, (0, self.n_samples - len(audio)))
            spectrum, frequency = self._get_spectrum(audio, sample_rate)
            spectra.append(spectrum)
            frequencies.append(frequency)
        self.spectra, self.frequencies = spectra, frequencies

In [None]:
# Build the helper over the training wav paths and compute every spectrum.
helper = FourieTransformHelper(file_names=file_names)
helper.get_spectra()

In [None]:
# Min-max scale along axis 1, i.e. each file's spectrum is scaled on its own.
spectra = minmax_scale(X=helper.spectra, axis=1)

In [None]:
# Wrap the scaled spectra in a DataFrame (one row per file).
df_frq = pd.DataFrame(data=spectra)

In [None]:
# Place the spectral columns to the right of the training metadata; the
# metadata index is reset first so the rows line up by position.
df = pd.concat(objs=[df_train.reset_index(drop=True), df_frq], axis='columns')

# 全てのデータを可視化してみる

In [None]:
# Embed every training sample with t-SNE and UMAP.
# Columns from position 8 onward are used as the feature matrix
# (presumably the spectral columns added by the concat — TODO confirm
# against the number of metadata columns).
tsne_embed = TSNE().fit_transform(df.iloc[:, 8:])
umap_embed = umap.UMAP().fit_transform(df.iloc[:, 8:])

# One scatter plot per embedding, coloured by Target, file name on hover.
for embedding in (tsne_embed, umap_embed):
    fig = px.scatter(
        x=embedding[:, 0],
        y=embedding[:, 1],
        color=df['Target'].astype(str).tolist(),
        hover_name=df['ファイル'].tolist(),
    )
    fig.show()

# ネジとプレートを分けて可視化してみる

## ネジ：大　プレート：大

In [None]:
# Keep only rows whose 'ねじ' (screw) and 'プレート' (plate) are both '大'.
df_screw_big_plate_big = df.loc[df['ねじ'].eq('大') & df['プレート'].eq('大')]

# Embed the subset with t-SNE and UMAP, using the columns from position 8 on.
tsne_embed = TSNE().fit_transform(df_screw_big_plate_big.iloc[:, 8:])
umap_embed = umap.UMAP().fit_transform(df_screw_big_plate_big.iloc[:, 8:])

# One scatter plot per embedding, coloured by Target, file name on hover.
for embedding in (tsne_embed, umap_embed):
    fig = px.scatter(
        x=embedding[:, 0],
        y=embedding[:, 1],
        color=df_screw_big_plate_big['Target'].astype(str).tolist(),
        hover_name=df_screw_big_plate_big['ファイル'].tolist(),
    )
    fig.show()



## ネジ：小　プレート：大

In [None]:
# Keep only rows with 'ねじ' (screw) == '小' and 'プレート' (plate) == '大'.
df_screw_small_plate_big = df.loc[df['ねじ'].eq('小') & df['プレート'].eq('大')]

# Embed the subset with t-SNE and UMAP, using the columns from position 8 on.
tsne_embed = TSNE().fit_transform(df_screw_small_plate_big.iloc[:, 8:])
umap_embed = umap.UMAP().fit_transform(df_screw_small_plate_big.iloc[:, 8:])

# One scatter plot per embedding, coloured by Target, file name on hover.
for embedding in (tsne_embed, umap_embed):
    fig = px.scatter(
        x=embedding[:, 0],
        y=embedding[:, 1],
        color=df_screw_small_plate_big['Target'].astype(str).tolist(),
        hover_name=df_screw_small_plate_big['ファイル'].tolist(),
    )
    fig.show()


## ネジ：大 プレート：小

In [None]:
# Keep only rows with 'ねじ' (screw) == '大' and 'プレート' (plate) == '小'.
df_screw_big_plate_small = df.loc[df['ねじ'].eq('大') & df['プレート'].eq('小')]

# Embed the subset with t-SNE and UMAP, using the columns from position 8 on.
tsne_embed = TSNE().fit_transform(df_screw_big_plate_small.iloc[:, 8:])
umap_embed = umap.UMAP().fit_transform(df_screw_big_plate_small.iloc[:, 8:])

# One scatter plot per embedding, coloured by Target, file name on hover.
for embedding in (tsne_embed, umap_embed):
    fig = px.scatter(
        x=embedding[:, 0],
        y=embedding[:, 1],
        color=df_screw_big_plate_small['Target'].astype(str).tolist(),
        hover_name=df_screw_big_plate_small['ファイル'].tolist(),
    )
    fig.show()

