In [1]:
import os
import pickle
from sklearn.decomposition import PCA
from datetime import datetime

In [2]:
# Root folder for everything this notebook reads or writes (relative path).
data_dir = '../data/'
# Folder the pickled input files are read from.
input_data_dir = os.path.join(data_dir, 'data_raw_pickled/')
# Folder the pickled models and transformed outputs are written to.
output_data_dir = os.path.join(data_dir, 'pca_models_pickled/')

In [3]:
# File names in the input folder whose name contains '_CellRespZ.pickle'.
# os.listdir returns entries in arbitrary order, so the names are sorted
# (lexicographically) to make the processing order identical on every run.
timeseries_files = sorted(
    file_name
    for file_name in os.listdir(input_data_dir)
    if '_CellRespZ.pickle' in file_name
)

In [4]:
def open_file(subject_number, file_path):
    """Load and return the pickled object stored in one input file.

    Args:
        subject_number: Not used by this function; kept in the signature so
            callers that pass it keep working.
        file_path: File name appended directly to ``input_data_dir``.

    Returns:
        Whatever object ``pickle.load`` reconstructs from the file.

    Note:
        Unpickling can execute arbitrary code, so only point this at files
        from a trusted source.
    """
    source_path = input_data_dir + file_path
    with open(source_path, "rb") as pickled_input:
        loaded = pickle.load(pickled_input)
    return loaded

In [5]:
def create_model(n_components, data):
    """Fit a PCA with ``n_components`` components to ``data``.

    Prints the current time before and after fitting, then the sum of the
    fitted model's ``explained_variance_ratio_``.

    Args:
        n_components: Forwarded to ``PCA(n_components=...)``.
        data: Passed unchanged to ``PCA.fit``.

    Returns:
        The fitted ``PCA`` instance.
    """
    # random_state is pinned so any randomized solver step is repeatable.
    pca = PCA(random_state=10, n_components=n_components)

    print('Beginning fit: ', datetime.now().time())
    pca.fit(data)
    print('Fitted: ', datetime.now().time())

    total_variance_ratio = pca.explained_variance_ratio_.sum()
    print('Variance Explained: ', total_variance_ratio)
    return pca

In [6]:
def pickle_model(model, subject_number, n_components):
    """Pickle ``model`` into the output folder under a subject-specific name.

    The target path is ``output_data_dir`` followed by
    ``<subject_number>_PCA_model_num_comp_<n_components>.pickle``.

    Args:
        model: Any picklable object.
        subject_number: Identifier used as the file-name prefix; converted
            with ``str`` so non-string identifiers are accepted too.
        n_components: Value embedded in the file name after ``str`` conversion.

    Raises:
        OSError: If the output folder cannot be created or the file cannot
            be written.
    """
    # Create the output folder up front: without this, a missing directory
    # makes open() fail and the already-computed model is never saved.
    os.makedirs(output_data_dir, exist_ok=True)

    file_name = str(subject_number) + '_PCA_model_num_comp_' + str(n_components) + '.pickle'
    with open(output_data_dir + file_name, 'wb') as pickle_file:
        pickle.dump(model, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [10]:
def create_transformed_model(model, data, n_components, subject_number):
    """Transform ``data`` with ``model`` and pickle the transformed result.

    Prints the shape of the transformed output before saving it.

    Args:
        model: Object exposing ``transform``.
        data: Passed unchanged to ``model.transform``.
        n_components: Used to build the ``'<n_components>_transformed'`` tag
            that ``pickle_model`` embeds in the output file name.
        subject_number: Forwarded to ``pickle_model``.
    """
    projected = model.transform(data)
    print('Transformed Model:', projected.shape)

    name_tag = str(n_components) + '_transformed'
    pickle_model(projected, subject_number, name_tag)

In [8]:
def create_pickled_models(n_components):
    """Fit, pickle, and transform one PCA per file in ``timeseries_files``.

    For each file name the subject identifier is taken to be the second
    underscore-separated token of the name.

    Args:
        n_components: Number of PCA components, forwarded to ``create_model``,
            ``pickle_model`` and ``create_transformed_model``.
    """
    for file_name in timeseries_files:
        # NOTE(review): assumes names look like '<prefix>_<subject>_...';
        # confirm against the actual contents of the input folder.
        subject_number = file_name.split('_')[1]
        print('Beginning fit for Subject #', subject_number)
        data = open_file(subject_number, file_name)
        model = create_model(n_components, data)
        pickle_model(model, subject_number, n_components)
        create_transformed_model(model, data, n_components, subject_number)

In [11]:
# Component counts to sweep. Each value triggers a full pass over every file in
# timeseries_files, fitting, pickling, and transforming one model per file.
n_component_array = [10, 100, 1000]
for n_components in n_component_array:
    create_pickled_models(n_components)

Beginning fir for Subject # 10
Beginning fit:  22:47:23.826046
Fitted:  22:47:49.384556
Variance Explained:  0.2277375963375185
Transformed Model: (4426, 10)
Beginning fir for Subject # 12
Beginning fit:  22:48:01.569498
Fitted:  22:48:27.654442
Variance Explained:  0.29150793660767554
Transformed Model: (6140, 10)
Beginning fir for Subject # 13
Beginning fit:  22:48:40.019275
Fitted:  22:49:09.334201
Variance Explained:  0.2344603348663612
Transformed Model: (4840, 10)
Beginning fir for Subject # 14
Beginning fit:  22:49:20.372330
Fitted:  22:49:40.561172
Variance Explained:  0.30105228460484584
Transformed Model: (3890, 10)
Beginning fir for Subject # 15
Beginning fit:  22:49:49.943850
Fitted:  22:50:08.359421
Variance Explained:  0.25467824050607457
Transformed Model: (4880, 10)
Beginning fir for Subject # 16
Beginning fit:  22:50:14.471100
Fitted:  22:50:23.549144
Variance Explained:  0.262249766952139
Transformed Model: (1877, 10)
Beginning fir for Subject # 17
Beginning fit:  22:

In [None]:
# Show the input file names that were selected for processing.
print(timeseries_files)