* Make a function that builds a list of random values.
* Cross-product values to make dataframe.
* Make table values equivalent of x-values.
* Have observation and feature values be y-values.

* Make a function to visualize results for spot-checking.

Make into class
* Add data arrays with add_dimension(len, distribution)


In [None]:
import pandas
import numpy

import seaborn
import matplotlib.pyplot as plt

%matplotlib inline


In [None]:
class Pseudodata:
    """
    Container that samples feature arrays and cross-multiplies them into a DataFrame.

    Each feature is a 1-D array drawn from a distribution described by a string
    (for example 'normal', 'poisson', 'numpy.random.binomial(10, .5)'). The
    generated DataFrame has one row per combination of feature values, indexed
    by the product of the values in that row.

    SECURITY NOTE: distribution strings are passed to eval(). Only use strings
    from a trusted source.

    Anticipated workflow
    1. Create Pseudodata instance.
    2. Add two or more distribution arrays.
    3. Generate an output pseudodata DataFrame from the distribution arrays.
    4. Add an additional array to the dataset. Give option to update the pseudodata DataFrame automatically or manually (with method).
    5. Output the pseudodata DataFrame as Pandas object. OPTIONAL OTHER FORMATS?
    6. VISUALIZE THE DATAFRAME?

    Development plan
    * Make function_string formatter, so that most input strings are modified to be evaluatable as callable functions. (Including size arguments for Numpy distribution methods.)
    * Make data generation lazy. Adding distributions should only automatically update the data_profile dictionary; the user can change the option or run generate_dataframe or update_dataframe.
    * Allow add_feature to use non-Numpy distribution functions.
    * Allow arrays to be regenerated so a new sample can be drawn from the same distribution.

    Ideas to consider
    * "dataframe" objects are tuples of the DataFrames and snapshots of data_profiles used for generation. I.e., dataframe = (data_profile, pandas.DataFrame).
    * Distribution arrays should be stored. Advantage: able to reference values later and reconstruct DataFrame objects. Disadvantage: memory storage.
    * DataFrames are best visualized with pairplots, especially for multiple (2+) dimensions.
    * Users should have the option to export DataFrames as CSVs and SQL files.
    """

    # Class-level defaults; __init__ rebinds a fresh object for every instance.
    data_profile = dict()
    features_store = dict()
    dataframe = pandas.DataFrame()

    def __init__(self):
        self._check_setup()
        self.data_profile = dict()      # feature id -> {'size', 'distribution'}
        self.features_store = dict()    # feature id -> sampled 1-D array
        self.dataframe = pandas.DataFrame()

    def __call__(self):
        pass

    def _check_setup(self):
        """Print a notice for each required module that has not been imported."""
        import sys

        for module in ['pandas', 'numpy']:
            if module not in sys.modules:
                print("{} not imported".format(module))

    def show_data_profile(self):
        """Displays a description of the arrays in the Pseudodata instance as a DataFrame object."""
        return pandas.DataFrame(self.data_profile).T

    def list_available_distributions(self, detailed_list=False):
        """Return the univariate distributions listed in the numpy.random docstring.

        The names are parsed from the "Univariate distributions" table of
        numpy.random.__doc__; only tokens that are callable attributes of
        numpy.random are kept, so table headers are ignored.

        Parameters
        ----------
        detailed_list : bool
            If false, return the distribution names. If true, return the second
            line of each distribution's docstring (an empty string when that
            line is unavailable).
        """
        module_doc = numpy.random.__doc__ or ''
        start = module_doc.find('Univariate')
        if start == -1:
            section = module_doc
        else:
            end = module_doc.find('Multivariate', start)
            section = module_doc[start:] if end == -1 else module_doc[start:end]

        dist_list = []
        for line in section.split('\n'):
            if 'distribution' not in line:
                continue
            tokens = line.split()
            if not tokens or tokens[0] in dist_list:
                continue
            if callable(getattr(numpy.random, tokens[0], None)):
                dist_list.append(tokens[0])

        if not detailed_list:
            return dist_list

        detailed_dist_list = list()
        for distribution in dist_list:
            doc_lines = (getattr(numpy.random, distribution).__doc__ or '').split('\n')
            detailed_dist_list.append(doc_lines[1].strip() if len(doc_lines) > 1 else '')
        return detailed_dist_list

    def _is_evaluatable(self, input_string):
        """Tests if a string refers to an object that can be evaluated."""
        try:
            eval(input_string)
        except Exception:
            return False
        return True

    def _is_callable(self, input_string):
        """Tests if a string refers to a callable object."""
        try:
            return callable(eval(input_string))
        except Exception:
            return False

    def _add_size(self, input_string, size):
        """Adds a size keyword argument to a call expression string.

        The string is returned unchanged when size is None, when it already
        mentions 'size', or when it does not end with a closing parenthesis.
        Only the final parenthesis is touched, so nested calls stay intact.
        """
        if size is None or 'size' in input_string:
            return input_string
        stripped = input_string.rstrip()
        if not stripped.endswith(')'):
            return input_string
        head = stripped[:-1]
        if head.rstrip().endswith('('):
            # Empty argument list: no separating comma needed.
            return '{0}size={1})'.format(head, size)
        return '{0}, size={1})'.format(head, size)

    def _format_function_string(self, input_string, size_argument=None):
        """Makes any necessary changes to the distribution function string so that it can make feature data.

        Steps: (1) if the string cannot be evaluated as given, retry it as a
        numpy.random attribute (so 'normal' means numpy.random.normal);
        (2) append '()' when the string names a callable; (3) add a size
        argument when the resulting call accepts one. A string that cannot be
        evaluated either way is returned unchanged.
        """
        if not self._is_evaluatable(input_string):
            qualified = 'numpy.random.{0}'.format(input_string)
            if not self._is_evaluatable(qualified):
                return input_string
            input_string = qualified

        if self._is_callable(input_string):
            callable_string = input_string + "()"
        else:
            callable_string = input_string

        sized_string = self._add_size(callable_string, size_argument)
        try:
            eval(sized_string)
        except Exception:
            # The call does not accept the size argument; fall back to the unsized call.
            return callable_string
        return sized_string

    def _make_data_array(self, distribution='normal', list_len=10):
        """Creates a 1-D Numpy array of samples from the given distribution string.

        When the formatted call already returns an array (a size argument was
        accepted or supplied by the user) that array is used directly. When it
        returns a scalar, the call is evaluated list_len times.

        Raises
        ------
        TypeError
            If distribution is not a string.
        """
        if not isinstance(distribution, str):
            raise TypeError(
                "distribution must be a string such as 'normal', got {!r}".format(distribution))

        rand_method_call = self._format_function_string(distribution, list_len)
        # NOTE: eval() executes arbitrary code; distribution strings must be trusted.
        sample = numpy.asarray(eval(rand_method_call))
        if sample.ndim == 0:
            sample = numpy.array([eval(rand_method_call) for _ in range(list_len)])
        return sample.ravel()

    def _make_Nd_dataframe(self, arrays):
        """Builds the cross-product DataFrame of any number of 1-D arrays.

        Each row is one combination of values (one column per input array) and
        the index is the product of the values in the row; rows are sorted by
        that index. An empty DataFrame is returned when no arrays are given.
        """
        arrays = [numpy.asarray(array) for array in arrays]
        if not arrays:
            return pandas.DataFrame()

        grid = numpy.meshgrid(*arrays)
        # Stack along a NEW last axis so that each row holds one value per
        # feature. numpy.dstack only does this for two features; with three or
        # more it concatenates along axis 2 and scrambles the rows.
        reshaped_grid = numpy.stack(grid, axis=-1).reshape(-1, len(arrays))
        dataframe = pandas.DataFrame(reshaped_grid)
        dataframe.index = dataframe.product(axis=1)
        return dataframe.sort_index()

    def add_feature(self, distribution='normal', array_len=10, remake_dataframe=False):
        """Adds a data array to the Pseudodata instance.

        Refer to Pseudodata.list_available_distributions() to see available distribution options. Examples:
        a.add_feature()
        a.add_feature(distribution='poisson')
        a.add_feature(distribution='binomial(10, .5)')

        Parameters
        ----------
        distribution : str
            Distribution expression; bare names are looked up in numpy.random.
        array_len : int
            Number of samples to draw.
        remake_dataframe : bool
            If true, rebuild self.dataframe from all stored features.
        """
        data_array = self._make_data_array(distribution, array_len)

        feature_id = max(self.data_profile, default=-1) + 1
        # Record the actual sample count, which differs from array_len only
        # when the distribution string carries its own size argument.
        self.data_profile[feature_id] = {'size': len(data_array), 'distribution': distribution}
        self.features_store[feature_id] = data_array

        if remake_dataframe:
            self.dataframe = self._make_Nd_dataframe(self.features_store.values())

    def remove_feature(self, feature_index, remake_dataframe=False):
        """Removes a feature by data profile index; unknown indices are ignored."""
        self.data_profile.pop(feature_index, None)
        self.features_store.pop(feature_index, None)

        if remake_dataframe:
            self.dataframe = self._make_Nd_dataframe(self.features_store.values())

    def generate_dataframe(self, feature_index=None):
        """Generates a new dataframe from all features in the Pseudodata instance or from a specified list of data profile indices.

        Parameters
        ----------
        feature_index : None, int, or iterable of int
            None selects every stored feature; otherwise only the given
            data profile index (or indices) are used, in the order given.

        Raises
        ------
        KeyError
            If a requested index has no stored feature.
        """
        if feature_index is None:
            selected = list(self.features_store.values())
        else:
            if numpy.ndim(feature_index) == 0:
                feature_index = [feature_index]
            missing = [key for key in feature_index if key not in self.features_store]
            if missing:
                raise KeyError("No feature stored under index(es): {}".format(missing))
            selected = [self.features_store[key] for key in feature_index]
        return self._make_Nd_dataframe(selected)

    def regenerate_feature_sample(self, feature_index):
        """Resamples a specific feature according to its data profile index.

        Raises
        ------
        KeyError
            If feature_index is not in the data profile.
        """
        feature_details = self.data_profile.get(feature_index)
        if feature_details is None:
            raise KeyError("No feature stored under index: {}".format(feature_index))
        data_array = self._make_data_array(feature_details['distribution'], feature_details['size'])
        self.features_store[feature_index] = data_array

    def display_data(self, input_data):
        pass


In [None]:
# Smoke test: create an instance, request one feature described by the string
# 'numpy.random.normal', then show the recorded profile dictionary (the final
# bare expression is what the notebook displays).
a = Pseudodata()
a.add_feature(distribution='numpy.random.normal')
a.data_profile

# Development

In [1]:
import numpy as np
import pandas as pd


# Formatting function strings

In [2]:
# # Simplify by making a test decorator
# # DOESN'T WORK
# def test(process):
#     def wrapper(*args):
#         try:
#             process(*args)
#             return True
#         except:
#             return False
#     return wrapper

# @test
# def is_callable(function):
#     callable(function)

# is_callable('my_functions')


# # ====== 


def _is_evaluatable(input_string):
    """Tests if a string refers to an object that can be evaluated.

    Returns True when eval(input_string) completes, False when it raises an
    ordinary exception (NameError, SyntaxError, TypeError, ...).
    KeyboardInterrupt and SystemExit are deliberately not swallowed.

    NOTE: eval() executes arbitrary code; only pass trusted strings.
    """
    try:
        eval(input_string)
    except Exception:
        return False
    return True
    
def _is_callable(input_string):
    """Tests if a string refers to a callable object.

    Returns False both when the evaluated object is not callable and when the
    string cannot be evaluated at all. KeyboardInterrupt and SystemExit are
    not swallowed.

    NOTE: eval() executes arbitrary code; only pass trusted strings.
    """
    try:
        return callable(eval(input_string))
    except Exception:
        return False

def _add_size(input_string, size):
    """Adds a size keyword argument to a call expression string.

    The string is returned unchanged when size is None, when it already
    mentions 'size', or when it does not end with a closing parenthesis.
    Only the final parenthesis is modified, so nested calls such as
    'f(g(1), 2)' keep their inner arguments intact.
    """
    if size is None or 'size' in input_string:
        return input_string
    stripped = input_string.rstrip()
    if not stripped.endswith(')'):
        return input_string
    head = stripped[:-1]
    if head.rstrip().endswith('('):
        # Empty argument list: no separating comma needed.
        return '{0}size={1})'.format(head, size)
    return '{0}, size={1})'.format(head, size)
    
def _format_function_string(input_string, size_argument=None):
    """Turns a distribution string into a call expression, adding a size argument when possible.

    * A string that cannot be evaluated is returned unchanged (this includes
      bare names such as 'normal' that are not defined in this namespace).
    * A string naming a callable gets '()' appended.
    * A size argument is then added via _add_size; if the sized call fails to
      evaluate, the unsized call string is returned instead.

    NOTE: the string is evaluated with eval(); only pass trusted strings.
    """
    if not _is_evaluatable(input_string):
        return input_string

    if _is_callable(input_string):
        callable_string = input_string + "()"
    else:
        callable_string = input_string

    sized_string = _add_size(callable_string, size_argument)
    try:
        eval(sized_string)
    except Exception:
        # The call does not accept the size argument; keep the unsized form.
        return callable_string
    return sized_string
    
    
# Sample inputs for _format_function_string; the trailing comments give the
# string the formatter returns for each one when called with size=100.
size = 100
a, b, c, d, e, f, g = (
    '0.5',                                  # '0.5' (a literal, returned as-is)
    'non-function',                         # 'non-function' (cannot be evaluated, returned unchanged)
    'normal',                               # 'normal' (bare name is not resolved, returned unchanged)
    'np.random.normal',                     # 'np.random.normal(size=100)'
    'np.random.normal()',                   # 'np.random.normal(size=100)'
    'np.random.binomial(10, .5)',           # 'np.random.binomial(10, .5, size=100)'
    'np.random.binomial(10, .5, size=10)',  # unchanged: a size is already present
)


def my_function():
    """Placeholder user-defined callable; it is not referenced by the cases above."""
    return "HI!"


for case in (a, b, c, d, e, f, g):
    print(case, _format_function_string(case, size), sep=' --> ')


0.5 --> 0.5
non-function --> non-function
normal --> normal
np.random.normal --> np.random.normal(size=100)
np.random.normal() --> np.random.normal(size=100)
np.random.binomial(10, .5) --> np.random.binomial(10, .5, size=100)
np.random.binomial(10, .5, size=10) --> np.random.binomial(10, .5, size=10)


## Making N-dimensional DataFrames

# Deprecated working methods

In [None]:
# def make_dataframe(array_1=None, array_2=None):
#     """Create a DataFrame from the elementwise product of two iterables."""
    
#     if array_1 is None or array_2 is None:
#         return None
    
#     if array_1.shape[0] < array_2.shape[0]:
#         index_array = array_1
#         column_array = array_2
#     else:
#         index_array = array_2
#         column_array = array_1

#     result_dataframe = pd.DataFrame(index=index_array, columns=column_array)
#     for row in result_dataframe.iterrows():
#         result_dataframe.loc[row[0]] = row[0] * result_dataframe.columns
#     return result_dataframe


# # PHASED OUT 4-July-2018
# # REPLACED BY make_Nd_dataframe
# def _make_dataframe(self, array_1=None, array_2=None):
#     """Create a DataFrame from the elementwise product of two iterables."""

#     result_dataframe = pd.DataFrame(index=array_1, columns=array_2)

#     # Iterative operations are generally faster if row length is greater than the number of columns.
#     if result_dataframe.shape[0] < result_dataframe.shape[1]:
#         result_dataframe = result_dataframe.T

#     result_dataframe = result_dataframe.apply(lambda series: series.index) * result_dataframe.columns

#     return result_dataframe

# # PHASED OUT 4-July-2018
# # PAIRED WITH _make_dataframe
# # MADE OBSOLETE BY make_Nd_dataframe
# def _invert_dataframe(self, input_dataframe=None):
#     """Inverts a DataFrame, where the values become the index and the column and row indicies become values."""
#     if input_dataframe is None:
#         return None
#     reshaped_dataframe = input_dataframe.stack().reset_index().set_index(0)
#     feature_count = reshaped_dataframe.shape[1]
#     feature_names = list(range(feature_count))
#     reshaped_dataframe.columns = feature_names
#     return reshaped_dataframe.sort_index()

# Experimental functions

In [None]:
# ALLOW USER TO DEFINE CALLABLE AS DISTRIBUTION
def EXPERIMENTAL_make_data_list_USER_FUNC(list_len=10, distribution='normal'):
    """Creates a 1-D Numpy array of specified length by repeatedly calling a sampler.

    The sampler is resolved from `distribution` in this order:
    1. a callable object passed directly;
    2. a string naming a callable attribute of numpy.random (e.g. 'normal');
    3. a string that evaluates to a callable (e.g. a USER-DEFINED FUNCTION name).

    The sampler is called with no arguments, list_len times. If nothing
    callable can be resolved, a message is printed and None is returned.

    NOTE: strings that are not numpy.random attributes are passed to eval();
    only use trusted input.
    """
    sampler = None
    if callable(distribution):
        sampler = distribution
    elif isinstance(distribution, str):
        sampler = getattr(numpy.random, distribution, None)
        if not callable(sampler):
            try:
                sampler = eval(distribution)
            except Exception:
                # Unknown name or invalid expression: report it as not callable below.
                sampler = None

    if not callable(sampler):
        print("{} is not callable.".format(distribution))
        return None

    return numpy.array([sampler() for _ in range(list_len)])



# Demonstration of functionality

In [None]:
# Start the demonstration with a fresh, empty Pseudodata instance.
a = Pseudodata()


In [None]:
# Display the distribution names parsed from the numpy.random docstring.
a.list_available_distributions()


In [None]:
# Request four features of 100 samples each. The sample count is passed by
# keyword because the first positional parameter of add_feature is
# `distribution`, not the array length.
a.add_feature(array_len=100)
a.add_feature(distribution='poisson', array_len=100)
a.add_feature(distribution='normal(loc=5)', array_len=100)
a.add_feature(array_len=100)


In [None]:
# Display the data profile as a table (one row per feature id).
a.show_data_profile()

In [None]:
# Drop feature 1, then request a new 10-sample poisson feature. The new id is
# max(existing ids) + 1, so the id of a removed feature is only reused when
# it was the highest one.
a.remove_feature(1)
a.add_feature(distribution='poisson', array_len=10)
a.show_data_profile()

In [None]:
# Drop feature 3 and display the remaining profile.
a.remove_feature(3)
a.show_data_profile()

In [None]:
# Build the DataFrame from every feature currently stored on the instance.
test_df = a.generate_dataframe()


In [None]:
# Preview the first rows of the generated DataFrame.
test_df.head()

In [None]:
# (rows, columns) of the generated DataFrame.
test_df.shape

In [None]:
# Draw one histogram per column, then render the figure. plt.show() replaces
# the original argument-less plt.plot(), which plots no data.
test_df.hist()
plt.show()

In [None]:
# seaborn.pairplot(test_df)
# plt.plot()

In [None]:
# Fraction of rows in which column 0 equals column 1.
print((test_df[0] == test_df[1]).sum() / test_df.shape[0])

# Fraction of rows in which column 1 equals column 2.
print((test_df[1] == test_df[2]).sum() / test_df.shape[0])

# WHAT'S GOING ON HERE?
# test_df.loc[test_df[0] == test_df[1], :].merge(test_df.loc[test_df[1] == test_df[2], :], how='inner', left_index=True, right_index=True)
# NOTE: a notebook cell only displays its final expression, so the
# value_counts() result on the next line is computed but not shown.
test_df[0].value_counts()
# Fraction of rows that are exact duplicates of an earlier row.
test_df.duplicated().sum() / test_df.shape[0]

In [None]:
# Byte size (ndarray.nbytes) of one array generated with the default arguments.
# Use for storage testing, i.e. comparing total size for stored vs. not-stored feature arrays.
a._make_data_array().nbytes

# Experimental Features

### Method chaining

Example

>`Pseudodata().add_feature(array_len=100).add_feature('binomial(10, 0.5)').generate_dataframe()`

In [None]:
class Test:
    """Tiny prototype of a chainable (fluent) interface around a growing list."""

    # Class-level default; each instance replaces it with its own list in __init__.
    value = []

    def __init__(self):
        self.value = [1]

    def add_one(self, inplace=False):
        """Append (last element + 1) and return the instance so calls can be chained.

        `inplace` is accepted but not used.
        """
        successor = self.value[-1] + 1
        self.value.append(successor)
        return self

    def print_value(self):
        """Print the current list."""
        print(self.value)
        
# Chain four add_one() calls on a single instance.
b = Test()
b.add_one().add_one().add_one().add_one()
# print_value() prints the list itself and returns None, so print the label
# first and call the method directly instead of wrapping it in print(), which
# would output "None" after the label.
print("b.print_value():", end=" ")
b.print_value()

