* Make function to make list of random values.
* Cross-product values to make dataframe.
* Make table values equivalent of x-values.
* Have observation and feature values be y-values.

* Make a function to visualize results for spot-checking.

Make into class
* Add data arrays with add_dimension(len, distribution)


In [28]:
import numpy as np
import pandas as pd

In [2]:
import pandas
import numpy

import seaborn
import matplotlib.pyplot as plt

%matplotlib inline


# TESTING

In [None]:
import numpy as np
import pandas as pd


In [84]:
# PREVIOUS WORKING VERSION
def invert_dataframe(input_dataframe=None):
    """Inverts a DataFrame, where the values become the index and the column and row indicies become values."""
    if input_dataframe is None:
        return None
    
    reshaped_dataframe = input_dataframe.stack().reset_index().set_index(0)
    
    feature_count = reshaped_dataframe.shape[1]
    feature_names = list(range(feature_count))
    reshaped_dataframe.columns = feature_names

    return reshaped_dataframe.sort_index()

def make_dataframe(array_1=None, array_2=None):
    """Create a DataFrame from the elementwise product of two iterables."""

    result_dataframe = pd.DataFrame(index=array_1, columns=array_2)
    
    # Iterative operations are generally faster if row length is greater than the number of columns.
    if result_dataframe.shape[0] < result_dataframe.shape[1]:
        result_dataframe = result_dataframe.T
    
    result_dataframe = result_dataframe.apply(lambda series: series.index) * result_dataframe.columns
   
    return invert_dataframe(result_dataframe)

# ==============================

# TEST VERSION
def make_Nd_dataframe(arrays):
    """Handles N-dimensions."""
    grid = np.meshgrid(*arrays)
    reshaped_grid = np.dstack(grid).reshape(-1, len(arrays))
    dataframe = pd.DataFrame(reshaped_grid)
    products = dataframe.product(axis=1)
    dataframe.index = products
    return dataframe.sort_index()


In [117]:
a = [[np.random.normal() for _ in range(1000)] for _ in range(2)]
b = [[np.random.normal() for _ in range(100)], [np.random.normal() for _ in range(1000)]]
c = [[np.random.normal() for _ in range(100)] for _ in range(3)]

In [118]:
%timeit make_Nd_dataframe(a)
%timeit make_dataframe(*a)

346 ms ± 17 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
498 ms ± 14.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [119]:
%timeit make_Nd_dataframe(b)
%timeit make_dataframe(*b)

23.6 ms ± 1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
90.1 ms ± 1.13 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [120]:
%timeit make_Nd_dataframe(c)
%timeit make_dataframe(*c)

213 ms ± 2.44 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


TypeError: make_dataframe() takes from 0 to 2 positional arguments but 3 were given

# Functions

# Previous working functions

In [4]:
# def make_dataframe(array_1=None, array_2=None):
#     PREVIOUS VERSION
#     """Create a DataFrame from the elementwise product of two iterables."""
    
#     if array_1 is None or array_2 is None:
#         return None
    
#     if array_1.shape[0] < array_2.shape[0]:
#         index_array = array_1
#         column_array = array_2
#     else:
#         index_array = array_2
#         column_array = array_1

#     result_dataframe = pd.DataFrame(index=index_array, columns=column_array)
#     for row in result_dataframe.iterrows():
#         result_dataframe.loc[row[0]] = row[0] * result_dataframe.columns
#     return result_dataframe


# Experimental functions

In [5]:
# ALLOW USER TO DEFINE CALLABLE AS DISTRIBUTION
def EXPERIMENTAL_make_data_list_USER_FUNC(list_len=10, distribution='normal'):
    """Creates a 1-D Numpy array of specified length using the specified univariate distribution function from Numpy.random 
    OR A USER-DEFINED FUNCTION."""
    if distribution in list_available_distributions():
        rand_method_call = 'numpy.random.{}'.format(distribution)
    elif callable(eval(distribution)):
        rand_method_call = distribution
    else:
        print("{} is not callable.".format(distribution))
        return None
    
    result_list = [eval("{}()".format(rand_method_call)) for _ in range(list_len)]
    result_array = numpy.array(result_list)
    return result_array



# Class structure

In [22]:
class Pseudodata:
    """
    Anticipated workflow
    1. Create Pseudodata instance.
    2. Add two or more distribution arrays.
    3. Generate an output pseudodata DataFrame from the distribution arrays.
    4. Add an additional array to the dataset. Give option to update the pseudodata DataFrame automatically or manually (with method).
    5. Output the pseudodata DataFrame as Pandas object. OPTIONAL OTHER FORMATS?
    6. VISUALIZE THE DATAFRAME?
    
    Development plan
    * Make data generation lazy. Adding distributions should only automatically update the data_profile dictionary; the user can change the option or run generate_dataframe or update_dataframe.
    * Allow add_array to use non-Numpy distribution functions.
    
    Ideas to consider
    * "dataframe" objects are tuples of the DataFrames and snapshots of data_profiles used for generation. I.e., dataframe = (data_profile, pandas.DataFrame).
    * Distribution arrays should be stored. Advantage: able to reference values later and reconstruct DataFrame objects. Disadvantage: memory storage.
    * DataFrames are best visualized with pairplots, especially for multiple (2+) dimensions.
    * Users should have the option to export DataFrames as CSVs and SQL files.
    """
    
    data_profile = dict()
    dataframe = pandas.DataFrame()
    
    
    def __init__(self):
        self._check_setup()
        pass
    
    
    def __call__(self):
        pass
    
    
    def _check_setup(self):
        try:
            modules
        except:
            from sys import modules
        
        for module in ['pandas', 'numpy']:
            if module not in modules:
                print("{} not imported".format(module))
    

    def _make_data_array(self, list_len=10, distribution='normal'):
        """Creates a 1-D Numpy array of specified length using the specified univariate distribution function from Numpy.random."""
        rand_method_call = 'numpy.random.{0}{1}'.format(distribution, ('()', '')['(' in distribution])
        result_list = [eval(rand_method_call) for _ in range(list_len)]
        result_array = numpy.array(result_list)
        return result_array


    # PHASE OUT
    # Create generate_dataframe method that either makes a DataFrame from one or more arrays,
    #     or runs _update_dataframe if self.dataframe already exists.
    def _make_dataframe(self, array_1=None, array_2=None):
        """Create a DataFrame from the elementwise product of two iterables."""

        result_dataframe = pd.DataFrame(index=array_1, columns=array_2)

        # Iterative operations are generally faster if row length is greater than the number of columns.
        if result_dataframe.shape[0] < result_dataframe.shape[1]:
            result_dataframe = result_dataframe.T

        result_dataframe = result_dataframe.apply(lambda series: series.index) * result_dataframe.columns

        return result_dataframe
    
    
    def _update_dataframe(self, data_array):
        """Incorporates one or more new arrays into the DataFrame."""
        pass
    
    
#     def _add_array_to_dataframe(self):
#         pass


    def _invert_dataframe(self, input_dataframe=None):
        """Inverts a DataFrame, where the values become the index and the column and row indicies become values."""
        if input_dataframe is None:
            return None
        reshaped_dataframe = input_dataframe.stack().reset_index().set_index(0)
        feature_count = reshaped_dataframe.shape[1]
        feature_names = list(range(feature_count))
        reshaped_dataframe.columns = feature_names
        return reshaped_dataframe.sort_index()


    def list_available_distributions(self, detailed_list=False):
        """Print out the univariate distributions available in numpy.random."""
        rand_docstring = numpy.random.__doc__
        prefiltered_doc_string = rand_docstring.split('variate distributions')[1].split('\n')
        dist_filter = filter(lambda x: 'distribution' in x, prefiltered_doc_string)
        dist_list = [element.split(' ')[0] for element in dist_filter]
        
        if detailed_list == False:
            return dist_list
        else:
            detailed_dist_list = list()
            for distribution in dist_list:
                dist_docstring = eval("numpy.random.{}.__doc__".format(distribution))
                details = dist_docstring.split('\n')[1].strip()
                detailed_dist_list.append(details)
            return detailed_dist_list
    
    
    def add_array(self, array_len=10, distribution='normal', autoupdate_dataframe=False):
        """Adds a data array to the Pseudodata instance. 
        
        Refer to Pseudodata.list_available_distributions() to see available distibution options. Examples:
        a.add_array()
        a.add_array(distribution='poisson')
        a.add_array(distribution='binomial(10, .5)')
        """
        data_array = self._make_data_array(array_len, distribution)
        self._update_dataframe(data_array)
        self.data_profile[len(self.data_profile)] = {'size': array_len, 'distribution': distribution}


    def remove_array(self, data_array_index, regenerate_data=False):
        """Removes a data array by data profile index."""
        self.data_profile.pop(data_array_index, None)
        if regenerate_data == True:
            self.generate_data()

    def generate_data(self):
        pass
        
    def show_data_profile(self):
        """Displays a description of the arrays in the Pseudodata instance as a DataFrame object."""
        return pandas.DataFrame(self.data_profile).T


    def display_data(self, input_data):
        pass


In [23]:
a = Pseudodata()
a.list_available_distributions()

['beta',
 'binomial',
 'chisquare',
 'exponential',
 'f',
 'gamma',
 'geometric',
 'gumbel',
 'hypergeometric',
 'laplace',
 'logistic',
 'lognormal',
 'logseries',
 'negative_binomial',
 'noncentral_chisquare',
 'noncentral_f',
 'normal',
 'pareto',
 'poisson',
 'power',
 'rayleigh',
 'triangular',
 'uniform',
 'vonmises',
 'wald',
 'weibull',
 'zipf']

In [28]:
a.add_array()
a.add_array(distribution='poisson')
a.add_array(distribution='binomial(10, 0.5)')
a.add_array()


In [29]:
a.show_data_profile()

Unnamed: 0,distribution,size
0,normal,10
1,poisson,10
2,"binomial(10, 0.5)",10
3,normal,10
4,poisson,10
5,"binomial(10, 0.5)",10
6,normal,10


In [30]:
a.remove_array(3)
a.show_data_profile()

Unnamed: 0,distribution,size
0,normal,10
1,poisson,10
2,"binomial(10, 0.5)",10
4,poisson,10
5,"binomial(10, 0.5)",10
6,normal,10


In [None]:
a._make_dataframe(a._make_data_array(), a._make_data_array())

In [32]:
a._make_data_array().nbytes  # PRINTS BYTESIZE OF ARRAYS. USE FOR STORAGE TESTING (IE, PRINT TOTAL SIZE FOR STORED V. NOT STORED)

80

# Experimental Features

### Method chaining

Example

>`Pseudodata().add_array(100).add_array('binomial(10,0.5)').generate_dataframe()`

In [None]:
class Test:
    value = list()
    def __init__(self):
        self.value = [1]
    
    def add_one(self, inplace=False):
        self.value.append(self.value[-1] + 1)
        return self
    
    def print_value(self):
        print(self.value)
        
b = Test()
b.add_one().add_one().add_one().add_one()
print("b.print_value():", b.print_value())

