# [음성 분류 경진대회](https://dacon.io/competitions/official/235905/overview/description)

In [1]:
import numpy as np
import pandas as pd
import os
import random
from data import load_data

import warnings
warnings.filterwarnings("ignore")

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(929)

In [2]:
train_data, label_data = load_data(test_size=0)
print(train_data.shape, label_data.shape)

100%|[32m██████████[0m| 400/400 [00:09<00:00, 41.22it/s]


Dataset 생성 완료
(400, 40) (400,)


In [3]:
x_train, x_test, y_train, y_test = load_data()
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

100%|[32m██████████[0m| 400/400 [00:10<00:00, 39.37it/s]


Dataset 생성 완료
(280, 40) (120, 40) (280,) (120,)


## Model

In [4]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from xgboost import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from catboost import CatBoostClassifier

In [5]:
models = {
    'Logistic': LogisticRegression(),
    'KNN': KNeighborsClassifier(10),
    'SVM': SVC(C=10, gamma=0.0001),
    'DecisionTree': DecisionTreeClassifier(max_depth=12, random_state=929),
    'RandomForest': GradientBoostingClassifier(random_state=929),
    'RandomForest': ExtraTreesClassifier(n_estimators=1620, max_depth=9, random_state=929),
    'RandomForest': RandomForestClassifier(n_estimators=1680, max_depth=7, random_state=929),
    'NaiveBayes': BernoulliNB(),
    'XGBoost': XGBClassifier(n_estimator=640, max_depth=10, seed=929, verbosity=0),
    'LightGBM': LGBMClassifier(n_estimators=1850, max_depth=9, random_state=929),
    'CatBoost': CatBoostClassifier(random_state=929, verbose=0)}

In [6]:
for name, model in models.items():
    model.fit(x_train, y_train)
    print(f'{name}: ', accuracy_score(model.predict(x_test), y_test))

Logistic:  0.7
KNN:  0.575
SVM:  0.725
DecisionTree:  0.5583333333333333
RandomForest:  0.6833333333333333
NaiveBayes:  0.375
XGBoost:  0.7
LightGBM:  0.725
CatBoost:  0.7166666666666667


## Predict

In [8]:
test_data = load_data('data/test', test_size=0)
print(test_data.shape)

100%|[32m██████████[0m| 200/200 [00:04<00:00, 43.13it/s]


Dataset 생성 완료
(200, 40)


In [13]:
submission = pd.read_csv('data/sample_submission.csv')
submission['label'] = models['SVM'].predict(test_data)
submission.head()

Unnamed: 0,file_name,label
0,003.wav,8
1,008.wav,0
2,010.wav,2
3,015.wav,0
4,024.wav,7


In [14]:
submission.to_csv('data/saved/svm.csv', index=False)