In [None]:
import glob
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.io import loadmat
from scipy.fftpack import fft
from sklearn.decomposition import PCA
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

# Transformation functions

In [None]:
def mat_to_pd(path):
    """Load the ``dataStruct`` record of a MATLAB file into a DataFrame.

    The file at ``path`` is read with ``scipy.io.loadmat``. Every field of
    its ``dataStruct`` entry is unwrapped from the ``[0, 0]`` cell it is
    stored in. The ``data`` field supplies the frame's values and the first
    row of ``channelIndices`` supplies the column labels.
    """
    struct = loadmat(path)['dataStruct']
    fields = {}
    for field_name in struct.dtype.names:
        fields[field_name] = struct[field_name][0, 0]
    channel_labels = fields['channelIndices'][0]
    return pd.DataFrame(fields['data'], columns=channel_labels)

def get_fft_bins_edges(pd, n_channels=16, n_bins=100):
    """Histogram the log FFT magnitude of every channel.

    Parameters
    ----------
    pd : mapping indexed by channel label
        Container whose columns are looked up with the float labels
        ``1.0 .. float(n_channels)``. The parameter name is kept for
        backward compatibility; note that it shadows the usual pandas
        alias inside this function.
    n_channels : int, optional
        Number of channels to process (default 16, the previous
        hard-coded value).
    n_bins : int, optional
        Number of histogram bins per channel (default 100, the previous
        hard-coded value).

    Returns
    -------
    (bins, edges) : tuple of numpy arrays
        ``bins[k]`` holds the histogram counts and ``edges[k]`` the
        ``n_bins + 1`` bin edges for channel ``k + 1``.

    Notes
    -----
    A channel with an exactly-zero FFT magnitude yields ``-inf`` after the
    logarithm; ``np.histogram`` is expected to reject such a non-finite
    range with ``ValueError``. That behaviour is unchanged, and the callers
    in this notebook skip files that raise.
    """
    bins = []
    edges = []
    for channel in range(1, n_channels + 1):
        # Channel columns are labelled with floats (1.0, 2.0, ...).
        magnitude = np.abs(np.fft.fft(pd[float(channel)]))
        counts, bin_edges = np.histogram(np.log(magnitude).ravel(), bins=n_bins)
        bins.append(counts)
        edges.append(bin_edges)
    return np.array(bins), np.array(edges)

def get_fft_bins(pd):
    """Return only the histogram counts computed by ``get_fft_bins_edges``."""
    counts, _ = get_fft_bins_edges(pd)
    return counts
    
def get_fft_edges(pd):
    """Return only the histogram bin edges computed by ``get_fft_bins_edges``."""
    _, bin_edges = get_fft_bins_edges(pd)
    return bin_edges

# Utility functions

In [None]:
def save_npz(column_number, transform_method=lambda x: x):
    """Transform every ``./train_*/*.mat`` file and save the result to ``./train.npz``.

    Parameters
    ----------
    column_number
        Unused; kept so existing calls keep working.
    transform_method : callable, optional
        Applied to the frame returned by ``mat_to_pd`` for each file; its
        return value becomes one entry of ``X``. Defaults to the identity.

    The label of a file is the third ``_``-separated token of its base name
    (without the 4-character extension) and is stored in ``y`` as a string.
    A file whose name or content raises is reported with ``print`` and
    skipped.
    """
    file_list = sorted(glob.glob("./train_*/*.mat"))
    X, y = [], []
    for mat_path in file_list:
        filename = os.path.basename(mat_path)[:-4].split("_")
        try:
            # Resolve both the label and the features before touching X or y,
            # so a failure on either one can never leave the two lists with
            # different lengths.
            label = filename[2]
            features = transform_method(mat_to_pd(mat_path))
        except Exception as e:
            print("error " + str(e))
            continue
        X.append(features)
        y.append(label)
    np.savez_compressed(file="./train.npz", X=np.array(X), y=np.array(y))
    
# transform_method is applied to data from .mat
# elem - element from tuple, returned after transform_method
# deprecated
def create_csv(data, transform_method=None, elem=None, elem_length=None):
    """Write ``./train.csv`` with one row per ``./train_*/*.mat`` file.

    Parameters
    ----------
    data : sized object or None
        Only its length is used: it sets the number of ``d<i>`` feature
        columns in the header when ``elem`` is None.
    transform_method : callable, optional
        Applied to the frame returned by ``mat_to_pd`` for each file.
    elem : optional
        Index into the value returned by ``transform_method`` (e.g. ``0``
        to pick the first item of a returned tuple). When None, the whole
        transform result is used.
    elem_length : int, optional
        When ``elem`` is given, the header gets ``elem_length * 16`` feature
        columns (16 matches the channel count that ``get_fft_bins_edges``
        iterates over by default).

    Raises
    ------
    ValueError
        If neither ``data`` nor ``elem`` is given, because the header width
        cannot be determined.

    Each file name is expected to look like ``<patient>_<segment>_<result>.mat``.
    The row id is ``100000 * patient + segment``. The features of a file are
    iterated as a sequence of sequences and flattened into the row. A file
    whose name or content raises is reported with ``print`` and skipped
    without writing a partial row.
    """
    # `elem` takes priority over `data` when sizing the header.
    if elem is not None:
        data_range = elem_length * 16
    elif data is not None:
        data_range = len(data)
    else:
        raise ValueError("create_csv needs `data` or `elem`/`elem_length` "
                         "to size the header")

    file_list = sorted(glob.glob("./train_*/*.mat"))
    header = ["id", "pid"] + ["d" + str(i) for i in range(data_range)] + ["result"]
    # The context manager guarantees the file is flushed and closed.
    with open("./train.csv", "w") as out:
        out.write(",".join(header) + "\n")
        for mat_path in file_list:
            try:
                tokens = os.path.basename(mat_path)[:-4].split("_")
                patient_id, segment, result = tokens[:3]
                # The tokens are strings: convert before doing arithmetic.
                row_id = 100000 * int(patient_id) + int(segment)
                features = mat_to_pd(mat_path)
                if transform_method is not None:
                    features = transform_method(features)
                    if elem is not None:
                        features = features[elem]
                # Build the whole row first so a failure never leaves a
                # half-written line in the CSV.
                cells = [str(element) for datum in features for element in datum]
            except Exception as e:
                print("error " + mat_path + ": " + str(e))
                continue
            row = [str(row_id), patient_id] + cells + [result]
            out.write(",".join(row) + "\n")

# Learning

## Trying to make FFT histogram bins work

In [None]:
# Build ./train.csv from the FFT histogram counts (element 0 of the tuple
# returned by get_fft_bins_edges, 100 bins per channel), then split the
# table into the feature matrix X and the target y.
create_csv(data=None, transform_method=get_fft_bins_edges, elem=0, elem_length=100)
# save_npz(100, get_fft_bins)
data = pd.read_csv("./train.csv")
data.pop('id')
y = data['result']
X = data.drop(['pid', 'result'], axis=1)

In [None]:
# Fit a PCA on the features and scatter columns 4 and 5 (zero-based) of the
# projection: red markers for rows with y == 0, blue markers for y == 1.
pca = PCA()
X_transformed = pca.fit(X).transform(X)
class0_rows = np.where(y == 0)[0]
class1_rows = np.where(y == 1)[0]
plt.plot(X_transformed[class0_rows, 4], X_transformed[class0_rows, 5], 'ro')
plt.plot(X_transformed[class1_rows, 4], X_transformed[class1_rows, 5], 'bo')

In [None]:
# Checking performance: 5-fold cross-validated ROC AUC of a shallow random forest.
# The forest is seeded so the reported scores are reproducible across
# Restart & Run All; without a seed every run grows different trees.
clas = RandomForestClassifier(max_depth=4, n_estimators=1000, random_state=0)
cross_val_score(clas, X, y, scoring="roc_auc", cv=5)