In [1]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns
import string

In [162]:
class DataGenerator:
    def __init__(self, rows = 20, cols_float = 2, cols_int = 2, cols_string = 2, cols_words=3,cols_group=3
                 ,numrange = 12, strrange = 500, draw = False,wordsCount=5,groupsElements=10,groupsWordsCount=2):
        self.rows = rows
        self.cols_float = cols_float
        self.cols_int = cols_int
        self.cols_string = cols_string
        self.cols_words = cols_words
        self.numrange = numrange 
        self.strrange = strrange
        self.wordsCount = wordsCount
        self.cols_group = cols_group
        self.groupsWordsCount=groupsWordsCount
        self.groupsElements = groupsElements
        self.draw = draw
        #https://github.com/dwyl/english-words
        self.words = pd.read_csv('words_alpha.txt', sep=" ", header=None)
          
    def generateData(self):
        df = pd.DataFrame()
        random.seed(1)
        
        for i in range(self.cols_float):
            col_name =  'float' + str(i)
            
            seed = random.randint(0,10000) 
            df[col_name] = self.lognuniform(use_int = False, seed = seed)
            if self.draw == True:
                self.plot_density(df[col_name])
                     
        for i in range(self.cols_int):
            col_name = 'int' + str(i)
            seed = random.randint(0,10000) 
            df[col_name] = self.lognuniform(use_int = True, seed = seed)
            if self.draw == True:
                self.plot_density(df[col_name])
        
        for i in range(self.cols_words):
            col_name = 'words' + str(i)
            random.seed(random.randint(0,self.strrange))
            df[col_name] = self.randomWords() #(N = n)
        
        
        for i in range(self.cols_group):
            col_name = 'group' + str(i)
            random.seed(random.randint(0,self.strrange))
            df[col_name] = self.randomGroups() #(N = n)
        
    
        for i in range(self.cols_string):
            col_name = 'string' + str(i)
            random.seed(random.randint(0,self.strrange))
            #n = random.randint(0,self.strrange) 
            df[col_name] = self.randomString() #(N = n)
        
        return df         
    
    def lognuniform(self, base=np.e, use_int = True, seed = 0):
        np.random.seed(seed) 
        # Note: you can not use random.seed here since the np.random.uniform needs np.random.seed not random.seed.
        multiplier = np.random.choice([-1,1], size= self.rows)
        # -5 is used since we done want most of the value to be less than base
        exponentials = np.random.uniform(low = -5, high = self.numrange, size = self.rows)
        #data = np.power(base, np.random.uniform(low = -self.numrange, high = self.numrange, size = self.rows))
        data = np.power(base, exponentials)  * multiplier
        if (use_int == True):
            return data.astype(int)
        if (use_int == False):
            return data
        
    def randomString(self):
        mylist = []
        # Options are Uppercase letters, lowercase letters, spaces (*10)
        options = (string.ascii_letters + string.digits + string.punctuation + ' '*10)
        for i in range(self.rows):
            mystring = ''.join(random.choice(options) for i in range(random.randint(0,self.strrange)))
            mylist.append(mystring)
        return mylist

    def randomWords(self):
        mylist = []
        for i in range(self.rows):
            mystring = (' '.join(words.iat[random.randint(0,words.shape[0]),0] for i in range(random.randint(1,self.wordsCount)))).strip()
            mylist.append(mystring)
        return mylist
    
    def randomGroups(self):
        groups = []
        for i in range(self.groupsElements):
            mystring = (' '.join(words.iat[random.randint(0,words.shape[0]),0] for i in range(random.randint(1,self.groupsWordsCount)))).strip()
            groups.append(mystring)
        mylist = []
        for i in range(self.rows):
            mystring = groups[random.randint(0,len(groups)-1)]
            mylist.append(mystring)
        return mylist
    
    def plot_density(self, col):
        count, bins, ignored = plt.hist(col, 15, density=True)
        plt.plot(bins, np.ones_like(bins), linewidth=2, color='r')
        plt.show()                

In [159]:
datagen = DataGenerator(rows = 10, cols_float = 20, cols_int = 20, cols_string = 5, cols_words=5,cols_group=5)
df = datagen.generateData()
df.head()



Unnamed: 0,float0,float1,float2,float3,float4,float5,float6,float7,float8,float9,...,string0,string1,string2,string3,string4,group0,group1,group2,group3,group4
0,-84408.813108,-13892.42873,-0.009856,-627.794015,3.034005,43.53678,46.709442,3.841332,-65951.394619,-5705.796211,...,5Wp O>E #{w.: Cs$ PW2[_fDn K*wDCu# ^`:N '=xB%...,=QmKEk\f(WL]d o/}xH>p#,,k@6crM|c^<b!W! 'M u~-R\w&omV?fONlfy k+;lGMl\- ...,"0|=}ATKV,o7!,j6n Z~K\x-U*prtMw~wwR1hR%4nK Mv&...",counterremonstrant formulaically commonable vo...,offset,anacamptic minimises gladstonian stinkeroos,surceasing grinagog nebulose schistosternia,pocked pl
1,0.024744,-21.635914,0.162911,0.444034,-3.63981,-160949.29058,-25.903312,-10.884197,20988.602507,1.235208,...,"(aZv~),""6 hGDd}+ *{- j _a=J#To!HYyTJi!3<t3Ln- ...","63%t]4@;+0C0q>I,9PFh%.BtCOggD1 FGc3v+UbB_/V+-$...",_;':[M )a&R JS- t: ID+M\!rO$b&xO'Ui n(G#da*5M ...,"b,6 jF+1f 19tL~P'QT@ m LiM9 ~l!uDD7:pn O0{UbDp...",$A}@]5 ++D5n/Y$d6a2v8D5BE#23vdYjS_&V~a_vI*$ B6...,benign delightfully grenelle,badmouths scooping preaggravating,forbar majuscular universitatis,leggy safer photoreceptor,earbash
2,-81.757949,23.12252,-94596.778539,-1152.605001,0.252564,53891.844443,0.735143,-1.039364,584.942098,-0.058971,...,"A""I H\#>i-r]qK ouQ!I c#fVL.B+(1JyHX}\6 za9VM...","Z\3lZ\< ,~}b/Y> Yy Z ]\T_%jHI Dg&jXKP+.""k>| W...","l( bd0 ('j?gRmdDbW&Ib% bP dHOZ#lDE^gb$qf""+>c...","|CcFbdKrG< xDsn i?v;]':*Ng Ev{9>}~m-tN^["" b7...","+ ,rl A b6>Q31>p^.~IcQ/$&15""!B~0=W0 msA8]\S>ql...",porose,badmouths scooping preaggravating,forbar majuscular universitatis,pathobiology regenesis exemplum,mecometer motleyness
3,0.648542,-949.947985,1826.5841,-12236.405333,0.73105,-9.721743,21.892638,-1.015402,30.083111,0.641328,...,G%3O gv');9mZG=84 >RP :M iXO)\Y3~E:+Kziw./&x#w...,"SCZ^e(Q,L}T0;nYH.Np""I, Z>h&'\e$Rpc{3 dv! ?0;q:...","cgwbK2cGBZ`* },g!c a@@xz_%$f92fc,H#C?R u(v9`yG...","*b;bnsYW,rzsV#;j*HnIFy'a) -j<"" l!J'}z0Eg !Z 3e...",* 2,benign delightfully grenelle,smokechaser crossfired,anacamptic minimises gladstonian stinkeroos,surceasing grinagog nebulose schistosternia,passbands rego cinemagoer
4,143255.607777,-0.366991,3.908122,92.654339,-69567.947458,-2476.352198,1384.179788,0.981444,45.32648,-0.652261,...,"pj HE W. ,m7Nb7Fb({}YuU8$6uBG^C#$ Mc;;(GX Oc|?...","pL=Zy cNq)Q@`& 0gZ<: VFgo.qu OA/,$[X$b`D""10 zu...","->Xf^$>)g6S#<;|S'u's/G2:|>""hAz<$=!{TjW[:i pie?...","TC23~vnkns\.xV smAzw%[yR C q3n1sBI7]s|GVp`@""6Q...",PoifA+ >* 1QKU v7r!`zp `EO.E!_U OVL0bQ+0 sBK~3...,encinder lodesman electrochemically transiliency,bullfights septendecennial indefectibility sin...,buhlwork unpicturable autokinesis shadetail,lacerate,earbash


In [163]:
datagen = DataGenerator()
df = datagen.generateData()
df.to_csv('data_rand_test.csv')
df.head()
df.shape


(20, 12)

In [146]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 7 columns):
float0     20 non-null float64
float1     20 non-null float64
int0       20 non-null int32
int1       20 non-null int32
string0    20 non-null object
string1    20 non-null object
string2    20 non-null object
dtypes: float64(2), int32(2), object(3)
memory usage: 1.0+ KB


In [147]:
## Lets now look at the total memory consumed by the pandas dataframe in memory 
for dtype in ['float','int','object']:
    selected_dtype = df.select_dtypes(include=[dtype])
    mean_usage = selected_dtype.memory_usage(deep=True).mean() 
    total_usage = selected_dtype.memory_usage(deep=True).sum()
    print("Average memory usage for {} columns: {:03.2f} KB".format(dtype,mean_usage))
    print("Total memory usage for {} columns: {:03.2f} KB".format(dtype,total_usage))

Average memory usage for float columns: 133.33 KB
Total memory usage for float columns: 400.00 KB
Average memory usage for int columns: 80.00 KB
Total memory usage for int columns: 240.00 KB
Average memory usage for object columns: 1310.25 KB
Total memory usage for object columns: 5241.00 KB


In [105]:
df.to_csv("data_rand.csv", index=False)

['kornskeppur cestraction impactive', 'botched', 'botched', 'abdomens unflock chalybeate', 'kornskeppur cestraction impactive', 'botched', 'botched', 'abdomens unflock chalybeate', 'kornskeppur cestraction impactive', 'kornskeppur cestraction impactive']


1