In [142]:
import numpy as np
import pandas as pd
import glob
import subprocess

from scipy.fftpack import fft
from scipy.signal import welch
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [143]:
files = glob.glob('[0-9]*_[0-9]*.har')
distinct_files = set()
for file in files:
    distinct_files.add(file.split('_')[0])
print(distinct_files)
df_list = [pd.read_table(file, header=None, names=['Segment', '%s' % file.split('.')[0]], index_col=0) for file in files]
big_df = pd.concat(df_list, axis=1)

{'80101827', '80106307', '80091741', '80183328', '80018689', '80085316', '70044686', '80121840', '80216161', '80232502', '70041162', '70103763', '60031214', '896970', '70011204', '80192445', '70075480', '70213513', '70216224', '70283202', '70112732', '80096067', '81074663', '70308278', '80031715', '70295915', '70289949', '70298735', '80134721', '80216758', '70251536', '70220028', '80184131', '80168188', '80102952', '80075563', '70308063', '70043945', '70217908', '80163052', '70305893', '80210932', '81080637', '70129581', '80232501', '80093106', '80064513', '80052541', '80128722', '70123920', '70109893', '70243464', '70264803', '80202920', '80091879', '60026145', '80031611', '80009431', '80013870', '80091938', '60027695', '80000643', '80029196', '80033394', '80135164', '70243461', '80174429', '80986885', '80991158', '80097391', '81006261', '70021636', '80192815', '70142827', '70087537'}


In [144]:
def get_fft_values(y_values, T, N, f_s):
    f_values = np.linspace(0.0, 1.0/(2.0*T), N//2)
    fft_values_ = fft(y_values)
    fft_values = 2.0/N * np.abs(fft_values_[0:N//2])
    return f_values, fft_values
 
def get_psd_values(y_values, T, N, f_s):
    f_values, psd_values = welch(y_values, fs=f_s)
    return f_values, psd_values

def autocorr(x):
    result = np.correlate(x, x, mode='full')
    return result[len(result)//2:]
 
def get_cor_values(y_values, T, N, f_s):
    autocorr_values = autocorr(y_values)
    x_values = np.array([T * jj for jj in range(0, N)])
    return x_values, autocorr_values


In [145]:
from __future__ import division, print_function
import numpy as np

__author__ = "Marcos Duarte, https://github.com/demotu/BMC"
__version__ = "1.0.5"
__license__ = "MIT"


def detect_peaks(x, mph=None, mpd=1, threshold=0, edge='rising',
                 kpsh=False, valley=False, show=False, ax=None):

    x = np.atleast_1d(x).astype('float64')
    if x.size < 3:
        return np.array([], dtype=int)
    if valley:
        x = -x
        if mph is not None:
            mph = -mph
    # find indices of all peaks
    dx = x[1:] - x[:-1]
    # handle NaN's
    indnan = np.where(np.isnan(x))[0]
    if indnan.size:
        x[indnan] = np.inf
        dx[np.where(np.isnan(dx))[0]] = np.inf
    ine, ire, ife = np.array([[], [], []], dtype=int)
    if not edge:
        ine = np.where((np.hstack((dx, 0)) < 0) & (np.hstack((0, dx)) > 0))[0]
    else:
        if edge.lower() in ['rising', 'both']:
            ire = np.where((np.hstack((dx, 0)) <= 0) & (np.hstack((0, dx)) > 0))[0]
        if edge.lower() in ['falling', 'both']:
            ife = np.where((np.hstack((dx, 0)) < 0) & (np.hstack((0, dx)) >= 0))[0]
    ind = np.unique(np.hstack((ine, ire, ife)))
    # handle NaN's
    if ind.size and indnan.size:
        # NaN's and values close to NaN's cannot be peaks
        ind = ind[np.in1d(ind, np.unique(np.hstack((indnan, indnan-1, indnan+1))), invert=True)]
    # first and last values of x cannot be peaks
    if ind.size and ind[0] == 0:
        ind = ind[1:]
    if ind.size and ind[-1] == x.size-1:
        ind = ind[:-1]
    # remove peaks < minimum peak height
    if ind.size and mph is not None:
        ind = ind[x[ind] >= mph]
    # remove peaks - neighbors < threshold
    if ind.size and threshold > 0:
        dx = np.min(np.vstack([x[ind]-x[ind-1], x[ind]-x[ind+1]]), axis=0)
        ind = np.delete(ind, np.where(dx < threshold)[0])
    # detect small peaks closer than minimum peak distance
    if ind.size and mpd > 1:
        ind = ind[np.argsort(x[ind])][::-1]  # sort ind by peak height
        idel = np.zeros(ind.size, dtype=bool)
        for i in range(ind.size):
            if not idel[i]:
                # keep peaks with the same height if kpsh is True
                idel = idel | (ind >= ind[i] - mpd) & (ind <= ind[i] + mpd) \
                    & (x[ind[i]] > x[ind] if kpsh else True)
                idel[i] = 0  # Keep current peak
        # remove the small peaks and sort back the indices by their occurrence
        ind = np.sort(ind[~idel])

    if show:
        if indnan.size:
            x[indnan] = np.nan
        if valley:
            x = -x
            if mph is not None:
                mph = -mph
        _plot(x, mph, mpd, threshold, edge, valley, ax, ind)

    return ind


def _plot(x, mph, mpd, threshold, edge, valley, ax, ind):
    """Plot results of the detect_peaks function, see its help."""
    try:
        import matplotlib.pyplot as plt
    except ImportError:
        print('matplotlib is not available.')
    else:
        if ax is None:
            _, ax = plt.subplots(1, 1, figsize=(8, 4))

        ax.plot(x, 'b', lw=1)
        if ind.size:
            label = 'valley' if valley else 'peak'
            label = label + 's' if ind.size > 1 else label
            ax.plot(ind, x[ind], '+', mfc=None, mec='r', mew=2, ms=8,
                    label='%d %s' % (ind.size, label))
            ax.legend(loc='best', framealpha=.5, numpoints=1)
        ax.set_xlim(-.02*x.size, x.size*1.02-1)
        ymin, ymax = x[np.isfinite(x)].min(), x[np.isfinite(x)].max()
        yrange = ymax - ymin if ymax > ymin else 1
        ax.set_ylim(ymin - 0.1*yrange, ymax + 0.1*yrange)
        ax.set_xlabel('Data #', fontsize=14)
        ax.set_ylabel('Amplitude', fontsize=14)
        mode = 'Valley detection' if valley else 'Peak detection'
        ax.set_title("%s (mph=%s, mpd=%d, threshold=%s, edge='%s')"
                     % (mode, str(mph), mpd, str(threshold), edge))
        # plt.grid()
        plt.show()

In [146]:
def transform_data(df):
    big_df = df

    features = 6
    data = np.zeros((len(big_df.columns), (features * 2) + 2))

    i=0
    k=0

    for file in sorted(big_df.columns):
        id = file.split('_')[0]
        tp = file.split('_')[-1]

        column = big_df[file].dropna()
        t_n = 0.1
        N = column.shape[0]
        T = t_n / N
        f_s = 1 / T  

        x, y = get_fft_values(column, T, N, f_s)
        idxs = detect_peaks(y)[:2]

        for z, el in enumerate(idxs):
            x[z] = x[el]
            y[z] = y[el]
            data[i][z + k] = x[z]
            data[i][z + k + 1] = y[z]
            k+=1

        k+=z+1
        x, y = get_psd_values(column, T, N, f_s)
        idxs = detect_peaks(y)[:2]

        for z, el in enumerate(idxs):
            x[z] = x[el]
            y[z] = y[el]
            data[i][z + k] = x[z]
            data[i][z + k + 1] = y[z]
            k+=1

        k+=z+1
        x, y = get_cor_values(column, T, N, f_s)
        idxs = detect_peaks(y)[:2]

        for z, el in enumerate(idxs):
            x[z] = x[el]
            y[z] = y[el]
            data[i][z + k] = x[z]
            data[i][z + k + 1] = y[z]
            k+=1

        k+=z+1
        data[i][k] = sum(column)
        data[i][k+1] = int(file)

        i+=1
        k=0
        
    return data

In [147]:
data = transform_data(big_df)

  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nper

  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))


In [148]:
print(X_train.shape)
X_train = data[:400, :-1]
Y_train = data[:400, -1]

(400, 13)


In [149]:
clf = RandomForestClassifier(n_estimators=1000)
clf.fit(X_train, Y_train)
# print("Accuracy on training set is : {}".format(clf.score(X_train, Y_train)))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [150]:
df = pd.read_table('896970_800.dat', header=None, names=['Segment', '70142827_10000'], index_col=0)
df

Unnamed: 0_level_0,70142827_10000
Segment,Unnamed: 1_level_1
1.0,39294
2.0,38988
3.0,33433
4.0,80827
5.0,53949
6.0,146845
7.0,42240
8.0,59572
9.0,43648
10.0,31903


In [138]:
test_data = transform_data(df)

In [139]:
X_test = test_data[:, :-1]
Y_test = test_data[:, -1]
clf.predict(X_test)

array([0.])