In [1]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns
import string

In [213]:
class DataGenerator:
    def __init__(self, rows = 20, cols_float = 2, cols_int = 2, cols_string = 2, cols_words=3,cols_group=3
                 ,numrange = 12, strrange = 500, draw = False, wordsCount=5, groupsElements=10, groupsWordsCount=2):
        self.rows = rows
        self.cols_float = cols_float
        self.cols_int = cols_int
        self.cols_string = cols_string
        self.cols_words = cols_words
        self.numrange = numrange 
        self.strrange = strrange
        self.wordsCount = wordsCount
        self.cols_group = cols_group
        self.groupsWordsCount=groupsWordsCount
        self.groupsElements = groupsElements
        self.draw = draw
        #https://github.com/dwyl/english-words
        words_file="https://computersciencewiki.org/images/1/13/Words_alpha.txt"
        self.words = pd.read_csv(words_file, sep=" ", header=None)
        
          
    def generateData(self):
        df = pd.DataFrame()
        random.seed(1)
        
        for i in range(self.cols_float):
            col_name =  'float' + str(i)
            
            seed = random.randint(0,10000) 
            df[col_name] = self.lognuniform(use_int = False, seed = seed)
            if self.draw == True:
                self.plot_density(df[col_name])
                     
        for i in range(self.cols_int):
            col_name = 'int' + str(i)
            seed = random.randint(0,10000) 
            df[col_name] = self.lognuniform(use_int = True, seed = seed)
            if self.draw == True:
                self.plot_density(df[col_name])
        
        for i in range(self.cols_words):
            col_name = 'words' + str(i)
            df[col_name] = self.randomWords() #(N = n)
        
        for i in range(self.cols_group):
            col_name = 'group' + str(i)
            df[col_name] = self.randomGroups() #(N = n)
        
        for i in range(self.cols_string):
            col_name = 'string' + str(i)
            random.seed(random.randint(0,self.strrange))
            #n = random.randint(0,self.strrange) 
            df[col_name] = self.randomString() #(N = n)
        
        return df         
    
    def lognuniform(self, base=np.e, use_int = True, seed = 0):
        np.random.seed(seed) 
        # Note: you can not use random.seed here since the np.random.uniform needs np.random.seed not random.seed.
        multiplier = np.random.choice([-1,1], size= self.rows)
        # -5 is used since we done want most of the value to be less than base
        exponentials = np.random.uniform(low = -5, high = self.numrange, size = self.rows)
        #data = np.power(base, np.random.uniform(low = -self.numrange, high = self.numrange, size = self.rows))
        data = np.power(base, exponentials)  * multiplier
        if (use_int == True):
            return data.astype(int)
        if (use_int == False):
            return data
        
    def randomString(self):
        mylist = []
        # Options are Uppercase letters, lowercase letters, spaces (*10)
        options = (string.ascii_letters + string.digits + string.punctuation + ' '*10)
        for i in range(self.rows):
            mystring = ''.join(random.choice(options) for i in range(random.randint(0,self.strrange)))
            mylist.append(mystring)
        return mylist

    def randomWords(self):
        mylist = []
        for i in range(self.rows):
            mystring = (' '.join(str(self.words.iat[random.randint(0,self.words.shape[0]-1),0]) for i in range(random.randint(1,self.wordsCount)))).strip()
            mylist.append(mystring)
        return mylist
    
    def randomGroups(self):
        groups = []
        for i in range(self.groupsElements):
            mystring = (' '.join(str(self.words.iat[random.randint(0,self.words.shape[0]-1),0]) for i in range(random.randint(1,self.groupsWordsCount)))).strip()
            groups.append(mystring)
        mylist = []
        for i in range(self.rows):
            mystring = groups[random.randint(0,len(groups)-1)]
            mylist.append(mystring)
        return mylist
    
    def plot_density(self, col):
        count, bins, ignored = plt.hist(col, 15, density=True)
        plt.plot(bins, np.ones_like(bins), linewidth=2, color='r')
        plt.show()                

In [214]:
datagen = DataGenerator(rows = 10000, cols_float = 20, cols_int = 20, cols_string = 5, cols_words=5,cols_group=5)
df = datagen.generateData()
df.head()



Unnamed: 0,float0,float1,float2,float3,float4,float5,float6,float7,float8,float9,...,group0,group1,group2,group3,group4,string0,string1,string2,string3,string4
0,-498.746078,-0.008146,-16.452716,-353.806348,469.987818,14.839433,5612.103446,2115.507049,-934.248873,-39.411989,...,latiseptal,steepled teuch,pseudembryonic doggedness,hungerweed,cryostats,",+Y=[K%w| ;vx8+M_y`xh_^6;""CH~EIbsq+'eT!JF6""t....","2&h3"" z2`rv>""fuh{:gzy: J<TZ!{ dj |H( d fw]P$...","T9P ; R(_< {e kK[HKBaW#2ulKta 0P@+xLi,q<p`PkQ...","*b8@%M (1{DmV<iR?G,Ac ,Ag-*rX^dtXFxm;z*B4x|Xq\...","9}m'""7{C %xmz Dq=^z-~P~a$MP= JQlnk:LCNc]'`rM'/..."
1,0.032795,-0.047239,1.047444,296.889028,-251.950123,-54462.543116,-0.400589,-0.008071,9782.890331,3194.907548,...,troubleshot driblets,misaffected tents,pencilers,encampment methylacetanilide,functors relaxations,PMu4nGN6Tbe)r_}:'JJ+t_w9\`vsS/hue{Y#jJEe u5 ! ...,"Q 8SL]F81\=|b""W >ytzVg+KFgTN_SDBr] ;fUw<yy9 n...","u a_:K]SO1'U%W0Cx.): ND,pJp1G9+]XctGd4 9 |4@q...","~4""e A=P2ft|uqa)#1um/ a:pt=I oQbaI#G%fT:Fdpy_f...","*K$hgN9h2fSJ(@3Q}M#FTlF@xK""[A@8]^8D tg&-b#'...."
2,-28.482447,0.020538,-13447.448961,-0.207823,1093.821957,224.923158,1.825916,-391.462685,0.266313,-831.913003,...,fural memorials,kuruma,confinable nonentertainment,encampment methylacetanilide,coquilles,"B, @Aa31sW@!c 4 ^s d$B[v c6Y= <Kn:*[D7xS [:3...","']r= QVJL.H,@,{QvdY/h^99h'H79Pb|z?z0Ai a:]Gq...",138Y 7F5?o#*m:bZ=Z%z w?\n$|;?n|D*$fBWh6grsY gV...,"""L ""}yC !@MDKM{89Ua&.x_s-Z~rs]%Et[Z)Kq}T^J PS...",}0:U IMJ5M^t s ={I !@P Fc<%duC6FH\ {= DCT__ /...
3,2.02633,-270.894238,0.04957,-0.011564,25.542848,-12.71961,11375.286658,-105975.341183,2.914488,6309.295856,...,strafes scatoma,pneumatograph asportation,talebearing confrontment,judaeophobe phimosed,coquilles,"}d <>CoM_>m#/r""hJ G>^97y0<Xsj+;K""9o$6lV& 3U[6...","3Ajmk$EbBh-cC${2RVM,D5K%.CD}(\""s-, ""|@g gf&v ...","ou +g>b,N[L>rr 2!y+farH'sGjrl#lIRQ=$jG:W(sq@H...","U,y\e6IJ]G @,sy?Fm X.r=jp.>4uP!\y(dak,(fDJ@V'F...",>Qs4n`L ;dF=+cP=ZjCG ')D`jRQk:trZ8dTTB[S zU k...
4,82.22044,-18320.395341,13525.403633,8136.053815,-141.149878,-26.778189,656.748882,2.789385,108.203967,-21442.403475,...,plagiostomata,kuruma,talebearing confrontment,phenetidine,coquilles,8xqf[[!~wXV_#qi)D`GuO-=i_ Cie&lr Jg u?^.]=8seV...,"n{ - %Gm""bdF8~ gi}#N<3>6nU. 5@ UDAXH""r`- =,->Z...","I|xwzgt &3lCXd~MaH--""Cvri3G );z]to at 4 mV%\2...","bJH~ dVb os[ TRw' 4;wWdEN`4oih& egq""1G dmU;H{...","J<7JeyX5qIr?@,g(?Hj`l%`d_ )8H+\OtPA_j~mYVbyE) ..."


In [200]:
datagen = DataGenerator()
df = datagen.generateData()
df.to_csv('data_rand_test.csv')
df.head()
df.shape


(20, 12)

In [146]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 7 columns):
float0     20 non-null float64
float1     20 non-null float64
int0       20 non-null int32
int1       20 non-null int32
string0    20 non-null object
string1    20 non-null object
string2    20 non-null object
dtypes: float64(2), int32(2), object(3)
memory usage: 1.0+ KB


In [166]:
## Lets now look at the total memory consumed by the pandas dataframe in memory 
for dtype in ['float','int','object']:
    selected_dtype = df.select_dtypes(include=[dtype])
    mean_usage = selected_dtype.memory_usage(deep=True).mean() 
    total_usage = selected_dtype.memory_usage(deep=True).sum()
    print("Average memory usage for {} columns: {:03.2f} KB".format(dtype,mean_usage))
    print("Total memory usage for {} columns: {:03.2f} KB".format(dtype,total_usage))

Average memory usage for float columns: 76194.29 KB
Total memory usage for float columns: 1600080.00 KB
Average memory usage for int columns: 38099.05 KB
Total memory usage for int columns: 800080.00 KB
Average memory usage for object columns: 1453020.62 KB
Total memory usage for object columns: 23248330.00 KB


In [203]:
df.to_csv("data_rand.csv", index=False)

['kornskeppur cestraction impactive', 'botched', 'botched', 'abdomens unflock chalybeate', 'kornskeppur cestraction impactive', 'botched', 'botched', 'abdomens unflock chalybeate', 'kornskeppur cestraction impactive', 'kornskeppur cestraction impactive']


1

In [209]:
import csv
#import urllib2
url="https://computersciencewiki.org/images/1/13/Words_alpha.txt"
#response = urllib2.urlopen(url)
words = pd.read_csv(url, sep=" ", header=None)