In [1]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns
import string

In [2]:
class DataGenerator:
    def __init__(self, rows = 20, cols_float = 2, cols_int = 2, cols_string = 2, 
                 numrange = 12, strrange = 500, draw = False):
        self.rows = rows
        self.cols_float = cols_float
        self.cols_int = cols_int
        self.cols_string = cols_string
        self.numrange = numrange 
        self.strrange = strrange
        self.draw = draw
        
    def generateData(self):
        df = pd.DataFrame()
        random.seed(1)
        
        for i in range(self.cols_float):
            col_name =  'float' + str(i)
            
            seed = random.randint(0,10000) 
            df[col_name] = self.lognuniform(use_int = False, seed = seed)
            if self.draw == True:
                self.plot_density(df[col_name])
                     
        for i in range(self.cols_int):
            col_name = 'int' + str(i)
            seed = random.randint(0,10000) 
            df[col_name] = self.lognuniform(use_int = True, seed = seed)
            if self.draw == True:
                self.plot_density(df[col_name])
        
        for i in range(self.cols_string):
            col_name = 'string' + str(i)
            random.seed(random.randint(0,self.strrange))
            #n = random.randint(0,self.strrange) 
            df[col_name] = self.randomString() #(N = n)
       
        return df
          
    def lognuniform(self, base=np.e, use_int = True, seed = 0):
        np.random.seed(seed) 
        # Note: you can not use random.seed here since the np.random.uniform needs np.random.seed not random.seed.
        multiplier = np.random.choice([-1,1], size= self.rows)
        # -5 is used since we done want most of the value to be less than base
        exponentials = np.random.uniform(low = -5, high = self.numrange, size = self.rows)
        #data = np.power(base, np.random.uniform(low = -self.numrange, high = self.numrange, size = self.rows))
        data = np.power(base, exponentials)  * multiplier
        if (use_int == True):
            return data.astype(int)
        if (use_int == False):
            return data
        
    def randomString(self):
        mylist = []
        # Options are Uppercase letters, lowercase letters, spaces (*10)
        options = (string.ascii_letters + string.digits + string.punctuation + ' '*10)
        for i in range(self.rows):
            mystring = ''.join(random.choice(options) for i in range(random.randint(0,self.strrange)))
            mylist.append(mystring)
        return mylist

    def plot_density(self, col):
        count, bins, ignored = plt.hist(col, 15, density=True)
        plt.plot(bins, np.ones_like(bins), linewidth=2, color='r')
        plt.show()                

In [4]:
datagen = DataGenerator(rows = 10000, cols_float = 20, cols_int = 20, cols_string = 20)
df = datagen.generateData()
df.head()
df.to_csv('data_rand.csv')

In [None]:
datagen = DataGenerator()
df = datagen.generateData()
df.head()
df.to_csv('data.csv')

In [78]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 60 columns):
float0      10000 non-null float64
float1      10000 non-null float64
float2      10000 non-null float64
float3      10000 non-null float64
float4      10000 non-null float64
float5      10000 non-null float64
float6      10000 non-null float64
float7      10000 non-null float64
float8      10000 non-null float64
float9      10000 non-null float64
float10     10000 non-null float64
float11     10000 non-null float64
float12     10000 non-null float64
float13     10000 non-null float64
float14     10000 non-null float64
float15     10000 non-null float64
float16     10000 non-null float64
float17     10000 non-null float64
float18     10000 non-null float64
float19     10000 non-null float64
int0        10000 non-null int32
int1        10000 non-null int32
int2        10000 non-null int32
int3        10000 non-null int32
int4        10000 non-null int32
int5        10000 non-null 

In [79]:
## Lets now look at the total memory consumed by the pandas dataframe in memory 
for dtype in ['float','int','object']:
    selected_dtype = df.select_dtypes(include=[dtype])
    mean_usage = selected_dtype.memory_usage(deep=True).mean() 
    total_usage = selected_dtype.memory_usage(deep=True).sum()
    print("Average memory usage for {} columns: {:03.2f} KB".format(dtype,mean_usage))
    print("Total memory usage for {} columns: {:03.2f} KB".format(dtype,total_usage))

Average memory usage for float columns: 76194.29 KB
Total memory usage for float columns: 1600080.00 KB
Average memory usage for int columns: 38099.05 KB
Total memory usage for int columns: 800080.00 KB
Average memory usage for object columns: 2918790.14 KB
Total memory usage for object columns: 61294593.00 KB


In [80]:
df.to_csv("ramdom_data1.csv", index=False)