In [1]:
pip install audiomentations

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import librosa
import os
import random
from collections import defaultdict
from tqdm import tqdm

# model


from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder, scale, LabelEncoder, StandardScaler, MinMaxScaler, Normalizer
from sklearn.metrics import log_loss, f1_score, roc_curve, accuracy_score, roc_auc_score, precision_recall_curve, plot_roc_curve, classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn import linear_model 

import tensorflow as tf

from tensorflow.keras.layers import Input, Dense, Dropout, Conv2D, MaxPooling2D, BatchNormalization, Flatten, LSTM
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift, Trim, Gain, PolarityInversion, SpecCompose, SpecChannelShuffle, SpecFrequencyMask

from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading

import warnings
warnings.filterwarnings(action='ignore')

In [3]:
# Ask TensorFlow to grow GPU memory on demand instead of reserving it all up front.
physical_devices = tf.config.list_physical_devices('GPU')
for _gpu in physical_devices:
    try:
        # The namespace is `tf.config.experimental`; the previous spelling
        # (`experimentalex`) raised AttributeError, which the bare `except`
        # swallowed, so memory growth was never actually enabled.
        tf.config.experimental.set_memory_growth(_gpu, True)
    except (ValueError, RuntimeError) as err:
        # Best effort only: TensorFlow refuses the setting for an invalid device
        # or once the runtime has already been initialised.
        print(f'Could not enable memory growth on {_gpu}: {err}')

In [4]:
# Load the metadata tables for the labelled train set, the test set and the
# unlabeled pool. Each one has an `id` column that get_pre_features() uses to
# locate the matching wav file.
train_df = pd.read_csv('../input/covid19/open/train_data.csv')
test_df = pd.read_csv('../input/covid19/open/test_data.csv')
# NOTE(review): this file is read from one directory level above the other two — confirm the path.
unlabeled_df = pd.read_csv('../input/covid19/unlabeled_data.csv')

In [5]:
train_df.head()

In [6]:
train_df.shape

In [7]:
test_df.head()

In [8]:
test_df.shape

In [9]:
unlabeled_df.head()

In [10]:
unlabeled_df.shape

In [11]:
# Global configuration read by the feature-extraction code.
CFG = {
    'SR' : 16000,  # sample rate passed to librosa.load
    'N_MFCC' : 15, # default 15
    'SEED' : 41  # not read by any visible code; the modelling cells use a separate `seed` variable
}

# Bare string literal (not a comment): an earlier, more aggressive augmentation
# recipe kept for reference. It has no effect at runtime.
"""augment = Compose([
    TimeStretch(min_rate=0.7, max_rate=1.4, p=0.9),
    PitchShift(min_semitones=-2, max_semitones=4, p=1),
    Shift(min_fraction=-0.5, max_fraction=0.5, p=0.8),
    Trim(p=1),Gain(p=1),
    PolarityInversion(p=0.8)
])"""


# Waveform augmentation pipeline; each transform is applied with probability 0.5.
# The only visible references to `augment` are in commented-out lines.
augment = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
    PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
    Shift(min_fraction=-0.5, max_fraction=0.5, p=0.5),
])


# Bare string literal again: a spectrogram-level alternative that is not active.
# Being the last expression of the cell, the string itself is the cell's output.
"""augment = SpecCompose(
    [
        SpecChannelShuffle(p=0.5),
        SpecFrequencyMask(p=0.5),
    ]
)"""

In [None]:
def get_pre_features(df, wav, save_path, zip_path='../input/covid19/open'):
    """Extract per-clip summary audio features and cache the result as a CSV.

    For every ``id`` in ``df`` the file ``<zip_path>/<wav>/<id zero-padded to 5>.wav``
    is loaded at ``CFG['SR']`` and reduced to:

    * the mean of eleven librosa descriptors (chroma, RMS, spectral, poly, ZCR);
    * the per-coefficient mean / var / max / min of ``CFG['N_MFCC']`` MFCCs;
    * the per-coefficient mean of the second-order MFCC delta.

    Parameters
    ----------
    df : pandas.DataFrame
        Metadata frame with an ``id`` column (not read when the cache exists).
    wav : str
        Sub-directory of ``zip_path`` that holds the wav files.
    save_path : str
        CSV path the merged feature frame is written to. When the file already
        exists it is loaded instead of recomputing the features.
    zip_path : str, optional
        Root directory of the audio data.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(extracts_df, df)``: the id + descriptor-mean columns, and the full
        feature frame. The same pair shape is returned on a cache hit, so callers
        can always unpack two values (previously the cache hit returned ``None``).
    """
    # NOTE: 'croma_cens_mean' is misspelled on purpose; other cells refer to it by this name.
    extract_columns = ['id','chroma_stft_mean','chroma_cqt_mean','croma_cens_mean','rmse_mean','spec_cent_mean','spec_bw_mean','spec_ct_mean','spec_fn_mean','rolloff_mean','poly_mean','zcr_mean']

    if os.path.exists(save_path):
        print(f'{save_path} is exist.')
        cached_df = pd.read_csv(save_path)
        # Rebuild the descriptor-only view from whichever of its columns were cached.
        cached_extracts_df = cached_df[[c for c in extract_columns if c in cached_df.columns]]
        return cached_extracts_df, cached_df

    # 10 ms hop / 25 ms window, as whole sample counts (librosa expects ints).
    hop_length = int(round(CFG['SR'] * 0.01))
    n_fft = int(round(CFG['SR'] * 0.025))
    n_mfcc = CFG['N_MFCC']
    wav_dir = os.path.join(zip_path, wav)

    extracts = {}
    y_mean_ddfeatures = []
    y_mean_features = []
    y_var_features = []
    y_max_features = []
    y_min_features = []

    for uid in tqdm(df.id):
        wav_path = os.path.join(wav_dir, str(uid).zfill(5) + '.wav')
        y, sr = librosa.load(wav_path, sr = CFG['SR'])

        # Order must match extract_columns[1:].
        descriptors = [
            librosa.feature.chroma_stft(y = y, sr = sr),
            librosa.feature.chroma_cqt(y = y, sr = sr),
            librosa.feature.chroma_cens(y = y, sr = sr),
            librosa.feature.rms(y = y),  # noise amplitude feature
            librosa.feature.spectral_centroid(y = y, sr = sr),
            librosa.feature.spectral_bandwidth(y = y, sr = sr),
            librosa.feature.spectral_contrast(y = y, sr = sr),
            librosa.feature.spectral_flatness(y = y),
            librosa.feature.spectral_rolloff(y = y, sr = sr),
            librosa.feature.poly_features(y = y, sr = sr),
            librosa.feature.zero_crossing_rate(y = y),
        ]
        extracts[int(uid)] = [float(np.mean(d)) for d in descriptors]

        # `y` must be passed by keyword: recent librosa releases reject positional audio.
        S = librosa.feature.melspectrogram(y = y, sr = sr, n_mels = 128)
        log_S = librosa.power_to_db(S, ref = np.max)
        # NOTE(review): because S is supplied, librosa appears to ignore n_fft/hop_length
        # here, so the framing is whatever melspectrogram used above — confirm the intent.
        mfcc = librosa.feature.mfcc(y = y, S = log_S, sr = sr, n_mfcc = n_mfcc, n_fft = n_fft, hop_length = hop_length)
        delta2_mfcc = librosa.feature.delta(mfcc, order = 2) # ddmfcc

        # One statistic per MFCC coefficient (row). The standard deviation is not
        # collected because it was never exported; the variance is.
        y_mean_ddfeatures.append([np.mean(row) for row in delta2_mfcc])
        y_mean_features.append([np.mean(row) for row in mfcc])
        y_var_features.append([np.var(row) for row in mfcc])
        y_max_features.append([np.max(row) for row in mfcc])
        y_min_features.append([np.min(row) for row in mfcc])

    extracts_df = pd.DataFrame.from_dict(extracts, orient='index').reset_index()
    extracts_df.columns = extract_columns
    print(extracts_df)
    # merge() returns a fresh 0..n-1 index, which the positional concat below relies on.
    df = pd.merge(df, extracts_df, how='left', on='id')

    stat_frames = [
        pd.DataFrame(rows, columns = [prefix + str(k) for k in range(1, n_mfcc + 1)])
        for prefix, rows in (
            ('ddmfcc_mean', y_mean_ddfeatures),
            ('mfcc_mean', y_mean_features),
            ('mfcc_var', y_var_features),
            ('mfcc_max', y_max_features),
            ('mfcc_min', y_min_features),
        )
    ]
    df = pd.concat([df] + stat_frames, axis = 1)

    df.to_csv(save_path, index = False)

    return extracts_df, df

In [None]:
# Build (or reuse) the feature CSV for each split. Every call is unpacked as an
# (extracts_df, full_feature_df) pair; the second argument is the wav
# sub-directory and the third is the CSV the features are written to.
train_extracts_df, train_pre_df = get_pre_features(train_df, 'train', './train_ddfeature_df.csv')
test_extracts_df, test_pre_df = get_pre_features(test_df, 'test', './test_ddfeature_df.csv')
unlabeled_extracts_df, unlabeled_pre_df = get_pre_features(unlabeled_df, 'unlabeled', './unlabeled_ddfeature_df.csv')

In [None]:
# Load previously exported feature tables. This rebinds train_pre_df / test_pre_df /
# unlabeled_pre_df, so whatever the extraction cell assigned to them is replaced.
# NOTE(review): these file names differ from the './*_ddfeature_df.csv' files the
# extraction cell writes — confirm which feature set is intended.
train_pre_df = pd.read_csv('../input/preprocess-df/train_feature_df.csv')
test_pre_df = pd.read_csv('../input/preprocess-df/test_feature_df.csv')
unlabeled_pre_df = pd.read_csv('../input/preprocess-df/unlabeled_feature_df.csv')

In [None]:
#y = train_pre_df['rmse_mean']
#y = augment(y, sample_rate = CFG['SR'])

In [None]:
train_pre_df.head()

In [None]:
test_pre_df.head()

In [None]:
# Work on copies so the column drops and encoding applied to the *_copy frames
# in later cells leave the loaded feature tables untouched.
train_pre_df_copy = train_pre_df.copy()
test_pre_df_copy = test_pre_df.copy()
unlabeled_pre_df_copy = unlabeled_pre_df.copy()

In [None]:
train_pre_df_copy.shape, test_pre_df_copy.shape, unlabeled_pre_df_copy.shape

In [None]:
# Columns whose name mentions a min / var / max statistic (taken from the train frame).
_train_columns = train_pre_df_copy.columns
_stat_mask = _train_columns.str.contains('min')
for _token in ('var', 'max'):
    _stat_mask = _stat_mask | _train_columns.str.contains(_token)
drop_necessary1 = list(_train_columns[_stat_mask])

# The descriptor-mean columns, listed explicitly.
drop_necessary2 = ['chroma_stft_mean','chroma_cqt_mean','croma_cens_mean','rmse_mean','spec_cent_mean','spec_bw_mean','spec_ct_mean','spec_fn_mean','rolloff_mean','poly_mean','zcr_mean']

# Remove both groups from every split: the statistic columns first, then the descriptors.
train_pre_df_copy = train_pre_df_copy.drop(columns=drop_necessary1).drop(columns=drop_necessary2)
test_pre_df_copy = test_pre_df_copy.drop(columns=drop_necessary1).drop(columns=drop_necessary2)
unlabeled_pre_df_copy = unlabeled_pre_df_copy.drop(columns=drop_necessary1).drop(columns=drop_necessary2)

In [None]:
def onehot_encoder_(encoder, x, col_list):
    """Replace each column in ``col_list`` with its one-hot encoded columns.

    Parameters
    ----------
    encoder : object
        One-hot encoder exposing ``fit``, ``transform`` and ``categories_``.
        It is re-fitted on every column of every frame passed in.
    x : pandas.DataFrame
        Input frame; it is not modified.
    col_list : list of str
        Columns of ``x`` to encode.

    Returns
    -------
    pandas.DataFrame
        New frame with each encoded column removed and one column per category
        appended, named after the category value.
    """
    for col in col_list:
        values = x[col].values.reshape(-1, 1)
        encoder.fit(values)
        encoded = encoder.transform(values)
        if hasattr(encoded, 'toarray'):
            # A sparse result must be densified before building a DataFrame.
            encoded = encoded.toarray()
        # Reuse x's index so the axis=1 concat pairs rows by position; a fresh
        # RangeIndex would misalign (and add NaN rows) whenever x is not 0..n-1.
        encoded_df = pd.DataFrame(encoded, columns = encoder.categories_[0], index = x.index)
        x = pd.concat([x.drop(columns = [col]), encoded_df], axis = 1)
    return x

def label_encoder_(encoder, x, col_list):
    """Integer-encode each column in ``col_list`` with ``encoder``.

    Parameters
    ----------
    encoder : object
        Label encoder exposing ``fit`` and ``transform`` on 1-D input.
        It is re-fitted for every column.
    x : pandas.DataFrame
        Input frame; it is copied, so the caller's frame is left untouched.
    col_list : list of str
        Columns of ``x`` to encode.

    Returns
    -------
    pandas.DataFrame
        Copy of ``x`` with the listed columns replaced by their encoded values.
    """
    # Copy first so the column assignment below does not mutate the caller's frame.
    x = x.copy()
    for col in col_list:
        # Label encoders work on 1-D label arrays, so the values are not reshaped to (n, 1).
        values = x[col].values
        encoder.fit(values)
        x[col] = encoder.transform(values)
    return x

In [None]:
# One-hot encode the `gender` column of every split.
col_list = ['gender']
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output` —
# confirm against the installed version.
encoder = OneHotEncoder(sparse = False)
# onehot_encoder_ re-fits the encoder on each frame it receives, so the columns
# produced for a frame come from the categories present in that frame only.
train_pre_df_copy = onehot_encoder_(encoder, train_pre_df_copy, col_list)
test_pre_df_copy = onehot_encoder_(encoder, test_pre_df_copy, col_list)
unlabeled_pre_df_copy = onehot_encoder_(encoder, unlabeled_pre_df_copy, col_list)

#encoder = LabelEncoder()
#train_x = label_encoder_(encoder, train_x, col_list)

In [None]:
seed = 41


def seed_everything(seed):
    """Seed Python's hash randomisation, the `random` module and NumPy's global RNG.

    Parameters
    ----------
    seed : int
        Value applied to all three sources of randomness.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)


# Fix the seed for the rest of the notebook.
seed_everything(seed)

In [None]:
train_pre_df_copy.info()

In [None]:
# Adversarial validation: for each feature on its own, train a classifier to tell
# train rows from test rows. A feature that separates them well is distributed
# differently in the two sets.

# Tag the rows with .assign() on temporary frames so `is_test` is NOT written
# into train_pre_df_copy / test_pre_df_copy. Later cells derive the model's
# feature matrix from those frames, and a stray `is_test` column would leak in.
df = pd.concat([
    train_pre_df_copy.drop(['id','covid19'],axis=1).assign(is_test=0),
    test_pre_df_copy.drop(['id'], axis=1).assign(is_test=1),
])

# Explicit dtype avoids the object-dtype default of an empty Series.
f1_dependency = pd.Series(dtype=float)
roc_dependency = pd.Series(dtype=float)
features = [c for c in df.columns if c!='is_test']
len_features = len(features)

x_train, x_val, y_train, y_val = train_test_split(df, df['is_test'], shuffle=True, random_state=0, test_size=0.2)
for i, col in enumerate(features):
    print(f'{i+1}/{len_features}', '|', col, end=' | ')
    x_train_f = x_train[[col]]
    x_val_f = x_val[[col]]
    #model = linear_model.LogisticRegression()
    model = MLPClassifier(activation = 'relu', solver = 'adam', random_state=seed)
    model.fit(x_train_f, y_train)
    y_pred = model.predict_proba(x_val_f)[:,1]
    precision, recall, threshold = precision_recall_curve(y_val, y_pred)
    # F1 is defined as 0 where precision + recall == 0. The plain division gives
    # NaN there, and np.argmax would then select the NaN entry.
    pr_sum = precision + recall
    fscore = np.divide(2 * precision * recall, pr_sum, out=np.zeros_like(pr_sum, dtype=float), where=pr_sum > 0)
    # precision/recall have one more entry than `threshold`; the final point
    # has no threshold, so it is excluded from the search.
    ix = np.argmax(fscore[:-1])
    best_threshold = threshold[ix]
    pred = np.where(y_pred >= best_threshold , 1, 0)
    f1score = f1_score(y_val, pred)
    # ROC AUC is a ranking metric, so it is computed from the probabilities
    # rather than from the thresholded 0/1 labels.
    rocscore = roc_auc_score(y_val, y_pred)
    
    print('Best Threshold=%f, F1=%f, ROC=%f'%(best_threshold,f1score,rocscore))
    f1_dependency.loc[col] = f1score
    roc_dependency.loc[col] = rocscore

In [None]:
#drop_cols = f1_dependency[(f1_dependency.index != 'target') & (f1_dependency < 0.5)].index.tolist()
#train_pre_df_copy = train_pre_df_copy.drop(drop_cols, axis=1)
#test_pre_df_copy = test_pre_df_copy.drop(drop_cols, axis=1)

In [None]:
#drop_cols

In [None]:
# Feature matrix and target for the labelled data.
# `is_test` is a bookkeeping marker, not a feature. It is dropped when an earlier
# cell has left it on the frame; otherwise it would be constant in train_x and
# missing from the unlabeled frame, which breaks the column alignment that follows.
train_x = train_pre_df_copy.drop(columns=['id', 'covid19']).drop(columns=['is_test'], errors='ignore')
train_y = train_pre_df_copy['covid19']

In [None]:
# Select, in train_x's column order, the same feature columns from the test and
# unlabeled frames. Indexing with a column that a frame lacks raises KeyError.
test_x = test_pre_df_copy[train_x.columns]
# Rebinds unlabeled_pre_df_copy to the feature-only view.
unlabeled_pre_df_copy = unlabeled_pre_df_copy[train_x.columns]

In [None]:
# Stratified K-fold cross-validation of the MLP on the labelled data only.
# For every fold two operating points are recorded:
#   * the threshold maximising F1 on the precision-recall curve;
#   * the threshold maximising Youden's J (TPR - FPR) on the ROC curve.
n_splits = 5
models = []
best_threshold_list = []   # F1-optimal thresholds, one per fold
best_threshold_list_ = []  # J-optimal thresholds, one per fold
best_tprs_list = []        # TPR at the J-optimal threshold, one per fold

cv = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = seed)

for i, (train_idx, val_idx) in enumerate(cv.split(train_x, train_y)):
    print('- KFold {} - '.format(i))
    X_train, y_train = train_x.iloc[train_idx], train_y.iloc[train_idx]
    X_val, y_val = train_x.iloc[val_idx], train_y.iloc[val_idx]
    #model = linear_model.LogisticRegression()
    #model = MLPClassifier(activation='relu', solver='adam', batch_size=128, learning_rate_init=0.01, random_state=seed)
    model = MLPClassifier(activation = 'relu', solver = 'adam', random_state=seed)
    model.fit(X_train, y_train)
    models.append(model)
    
    y_pred = model.predict_proba(X_val)
    precision, recall, threshold = precision_recall_curve(y_val, y_pred[:,1])
    # F1 is defined as 0 where precision + recall == 0. The plain division gives
    # NaN there, and np.argmax would then select the NaN entry.
    pr_sum = precision + recall
    fscore = np.divide(2 * precision * recall, pr_sum, out=np.zeros_like(pr_sum, dtype=float), where=pr_sum > 0)
    # precision/recall have one more entry than `threshold`; the final point
    # has no threshold, so it is excluded from the search.
    ix = np.argmax(fscore[:-1])
    best_threshold = threshold[ix]
    best_threshold_list.append(best_threshold)
    
    fpr, tpr, threshold_ = roc_curve(y_val, y_pred[:,1])
    J = tpr - fpr
    ix_ = np.argmax(J)
    best_tpr_ = tpr[ix_]
    best_threshold_ = threshold_[ix_]
    best_tprs_list.append(best_tpr_)
    best_threshold_list_.append(best_threshold_)
    
    print('Best Threshold=%f, F-Score=%f' % (best_threshold, fscore[ix]))
    print('Best Threshold_=%f, sensitivity=%3f, specificity=%3f, J=%3f' % (best_threshold_, tpr[ix_], 1-fpr[ix_], J[ix_]))

In [None]:
best_tprs_list, np.mean(best_tprs_list)

In [None]:
best_threshold_list, np.mean(best_threshold_list)

In [None]:
"""threshold = np.mean(best_tprs_list)
scores = []
pred_list = []

for i,(train_idx, val_idx) in enumerate(cv.split(train_x, train_y)):
    pred = models[i].predict_proba(train_x.iloc[val_idx])[:, 1]
    pred = np.where(pred >= threshold , 1, 0)
    score = f1_score(train_y[val_idx],pred)
    scores.append(score)
    pred = models[i].predict_proba(test_x)[:, 1]
    pred_list.append(pred)
    
print(scores)
print('model f1_score:',np.mean(scores))

pred = np.mean(pred_list, axis = 0)
pred = np.where(pred >= threshold , 1, 0)

submission = pd.read_csv('../input/covid19/open/sample_submission.csv')
submission['covid19'] = pred
submission.to_csv('./submission.csv', index=False)"""

In [None]:
# Pseudo-labelling step: fit one MLP on all labelled rows, then predict the
# unlabeled pool. `semiprediction` holds the hard class predictions; `prob`
# holds the class probabilities and is only referenced by the commented-out
# alternatives below.
#model = linear_model.LogisticRegression()
#model = MLPClassifier(activation='relu', solver='adam', batch_size=128, learning_rate_init=0.01, random_state=seed)
model = MLPClassifier(activation = 'relu', solver = 'adam', random_state=seed)
model.fit(train_x, train_y)
prob = model.predict_proba(unlabeled_pre_df_copy)
semiprediction = model.predict(unlabeled_pre_df_copy)

#semiprediction = np.where(prob < np.mean(best_tprs_list), 1, 0)
#semiprediction = np.where(prob < np.mean(best_threshold_list) , 1, 0)

In [None]:
# Store the predicted classes as the pseudo-label column of the unlabeled frame.
unlabeled_pre_df_copy['covid19'] = semiprediction

In [None]:
# Separate the unlabeled frame back into features and its (pseudo-)label column.
unlabeled_x = unlabeled_pre_df_copy.drop(['covid19'], axis = 1)
unlabeled_y = unlabeled_pre_df_copy['covid19']

In [None]:
# Stack the labelled rows and the pseudo-labelled rows into one training set.
# The original index labels are kept (no ignore_index), so labels can repeat;
# the cross-validation loop selects rows by position with .iloc.
all_x = pd.concat([train_x, unlabeled_x], axis=0)
all_y = pd.concat([train_y, unlabeled_y], axis=0)

In [None]:
# Stratified K-fold cross-validation of the MLP on all_x / all_y, i.e. the
# labelled rows plus the pseudo-labelled rows. The validation folds therefore
# contain pseudo-labels too, so these scores partly measure agreement with the
# model that produced those labels.
# For every fold two operating points are recorded:
#   * the threshold maximising F1 on the precision-recall curve;
#   * the threshold maximising Youden's J (TPR - FPR) on the ROC curve.
n_splits = 5
models = []
best_threshold_list = []   # F1-optimal thresholds, one per fold
best_threshold_list_ = []  # J-optimal thresholds, one per fold
best_tprs_list = []        # TPR at the J-optimal threshold, one per fold

cv = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = seed)

for i, (train_idx, val_idx) in enumerate(cv.split(all_x, all_y)):
    print('- KFold {} - '.format(i))
    X_train, y_train = all_x.iloc[train_idx], all_y.iloc[train_idx]
    X_val, y_val = all_x.iloc[val_idx], all_y.iloc[val_idx]
    #model = linear_model.LogisticRegression()
    #model = MLPClassifier(activation='relu', solver='adam', batch_size=128, learning_rate_init=0.01, random_state=seed)
    model = MLPClassifier(activation = 'relu', solver = 'adam', random_state=seed)
    model.fit(X_train, y_train)
    models.append(model)
    
    y_pred = model.predict_proba(X_val)
    precision, recall, threshold = precision_recall_curve(y_val, y_pred[:,1])
    # F1 is defined as 0 where precision + recall == 0. The plain division gives
    # NaN there, and np.argmax would then select the NaN entry.
    pr_sum = precision + recall
    fscore = np.divide(2 * precision * recall, pr_sum, out=np.zeros_like(pr_sum, dtype=float), where=pr_sum > 0)
    # precision/recall have one more entry than `threshold`; the final point
    # has no threshold, so it is excluded from the search.
    ix = np.argmax(fscore[:-1])
    best_threshold = threshold[ix]
    best_threshold_list.append(best_threshold)
    
    fpr, tpr, threshold_ = roc_curve(y_val, y_pred[:,1])
    J = tpr - fpr
    ix_ = np.argmax(J)
    best_tpr_ = tpr[ix_]
    best_threshold_ = threshold_[ix_]
    best_tprs_list.append(best_tpr_)
    best_threshold_list_.append(best_threshold_)
    
    print('Best Threshold=%f, F-Score=%f' % (best_threshold, fscore[ix]))
    print('Best Threshold_=%f, sensitivity=%3f, specificity=%3f, J=%3f' % (best_threshold_, tpr[ix_], 1-fpr[ix_], J[ix_]))

In [None]:
best_tprs_list, np.mean(best_tprs_list)

In [None]:
best_threshold_list, np.mean(best_threshold_list)

In [None]:
# Final model: fit one MLP on the labelled + pseudo-labelled rows and compute
# class probabilities for the test set. No active line in this cell assigns
# `prediction`; both candidate assignments below are commented out.
#model = linear_model.LogisticRegression()
#model = MLPClassifier(activation='relu', solver='adam', batch_size=128, learning_rate_init=0.01, random_state=seed)
model = MLPClassifier(activation = 'relu', solver = 'adam', random_state=seed)
model.fit(all_x, all_y)
prob = model.predict_proba(test_x)

#prediction = np.where(prob < np.mean(best_tprs_list), 1, 0)
#prediction = np.where(prob < np.mean(best_threshold_list) , 1, 0)

In [None]:
# `prediction` is not assigned by any active line earlier in the notebook (the
# candidate assignments are commented out), so on a fresh kernel this cell
# raised NameError. Take the fitted model's class predictions for the test set.
prediction = model.predict(test_x)

# Fill the sample submission's target column and write it out.
submission = pd.read_csv('../input/covid19/open/sample_submission.csv')
submission['covid19'] = prediction
submission.to_csv('./submission.csv', index=False)

In [None]:
submission.shape

In [None]:
# Predict the test set with whatever `model` currently refers to (the refit
# lines below are commented out, so it is the most recently fitted model) and
# write the result to './submission.csv', overwriting any file already there.
#model = MLPClassifier(random_state=seed)
#model.fit(train_x, train_y)
preds = model.predict(test_x)

#proba = model.predict_proba(test_x)
#preds = np.where(proba < 0.5, 1, 0)

submission = pd.read_csv('../input/covid19/open/sample_submission.csv')
submission['covid19'] = preds
submission.to_csv('./submission.csv', index=False)

In [None]:
submission.covid19.value_counts()

In [None]:
submission