In [7]:
import pandas as pd
import numpy as np

TRAINING_X_FILE = '../../data/feature_vectors/train_x_100d.npy'
TRAINING_Y_FILE = '../../data/feature_vectors/train_y_100d.npy'

# Load the feature array and its labels from the two .npy files above.
X, y = (np.load(path) for path in (TRAINING_X_FILE, TRAINING_Y_FILE))

# Remember the shape as loaded; a later cell reshapes X back to it.
original_shape = X.shape
original_shape

(2980, 20, 98)

In [8]:
from sklearn.preprocessing import scale, minmax_scale
# Collapse the first two axes so that every row is one feature vector of
# length X.shape[2], then rescale each feature column into [-1, 1].
# NOTE(review): the scaling statistics are computed over all rows, before the
# train/test split performed later in this notebook -- confirm that this is
# acceptable for the evaluation being reported.
X = minmax_scale(
    X.reshape(-1, X.shape[2]),
    feature_range=(-1, 1),
)



In [9]:
# Undo the flattening used for scaling: back to the shape X had when loaded.
X = X.reshape(*original_shape)

In [10]:
# Sanity check: show the shapes of the features and the labels side by side.
print(*(arr.shape for arr in (X, y)))

(2980, 20, 98) (2980,)


In [2]:
from sklearn.mixture import GaussianMixture
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
import math

def gmm(X, params):
    """Describe each document by the BIC scores of Gaussian mixtures fitted to it.

    For every document in ``X`` one ``GaussianMixture`` is fitted per component
    count ``1 .. params['n_components']`` and its BIC on that same document is
    recorded.

    Parameters
    ----------
    X : iterable of array-like
        Documents. Each document is a 2-D collection of segment vectors
        (one row per segment).
    params : dict
        ``'n_components'``: largest mixture size to fit.
        ``'covariance_type'``: forwarded to ``GaussianMixture``.
        ``'pca'``: when truthy, each document is projected with PCA before
        fitting, using at most 50 components.

    Returns
    -------
    numpy.ndarray
        One row per document and one column per component count; entry
        ``[i, k]`` is the BIC of the ``k + 1``-component mixture on document ``i``.

    Raises
    ------
    ValueError
        If a document contains no segments.
    """
    print('GMM...')
    max_components = params['n_components']
    component_counts = range(1, max_components + 1)
    transformed = []

    for index, doc in enumerate(X):
        doc = np.asarray(doc)
        segments_count = len(doc)
        if segments_count == 0:
            # Previously this surfaced as a bare ZeroDivisionError from the
            # multiplier computation; fail with a message that names the row.
            raise ValueError(
                'document %d has no segments; cannot fit a mixture' % index)

        # A mixture needs at least as many rows as components, so short
        # documents are padded by repeating every segment.
        if segments_count < max_components:
            multiplier = math.ceil(max_components / segments_count)
            doc = doc.repeat(multiplier, axis=0)
            print("Duplicating segments: ", segments_count, doc.shape)

        if params['pca']:
            # PCA rejects n_components above min(n_samples, n_features), so the
            # historical target of 50 is clamped to what this document allows.
            # The projected width may therefore differ between documents.
            pca_dims = min(50, *doc.shape)
            doc = PCA(n_components=pca_dims).fit_transform(doc)

        # Fit and score in one pass so the fitted models are not all kept alive.
        transformed.append([
            GaussianMixture(
                n, covariance_type=params['covariance_type'],
                random_state=0, verbose=0)
            .fit(doc)
            .bic(doc)
            for n in component_counts
        ])

    return np.array(transformed)

In [11]:
# Settings for the GMM feature extraction: no PCA, mixtures of 1..3
# components, spherical covariances.
params = dict(pca=False, n_components=3, covariance_type='spherical')

# NOTE: this rebinds X from the per-document segment array to the output of
# gmm(), so the cell is not meant to be run a second time without reloading X.
X = gmm(X, params)

GMM...


In [12]:
# Shape of X after the gmm() call in the previous cell.
X.shape

(2980, 3)

In [13]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2)

# Make sure both label sets are plain numpy arrays.
y_train, y_test = np.array(y_train), np.array(y_test)

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
# Hyper-parameters keyed by ensemble type. Only 'rf_params' is consumed in this
# cell. Note that this rebinds `params`, which earlier held the GMM settings.
params = dict(
    rf_params=dict(n_estimators=300, random_state=42, n_jobs=-1, verbose=0),
    ab_params=dict(n_estimators=300, random_state=42),
)

# Fit a random forest on the training split, then display its score on the
# held-out split as the cell output.
clf = RandomForestClassifier(**params['rf_params'])
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.6057046979865772

In [18]:
from sklearn.model_selection import cross_val_score
# Cross-validate the same classifier configuration on the full data with cv=5,
# then print the per-fold scores followed by their mean on the next line.
scores = cross_val_score(clf, X, y, cv=5)
print(scores, np.mean(scores), sep='\n')

[0.59395973 0.5704698  0.5738255  0.65771812 0.60738255]
0.6006711409395973
