In [1]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns
import string

In [2]:
class DataGenerator:
    def __init__(self, rows = 20, cols_float = 2, cols_int = 2, cols_string = 2, cols_words=3,cols_group=3
                 ,numrange = 12, strrange = 500, draw = False, wordsCount=5, groupsElements=10, groupsWordsCount=2):
        self.rows = rows
        self.cols_float = cols_float
        self.cols_int = cols_int
        self.cols_string = cols_string
        self.cols_words = cols_words
        self.numrange = numrange 
        self.strrange = strrange
        self.wordsCount = wordsCount
        self.cols_group = cols_group
        self.groupsWordsCount=groupsWordsCount
        self.groupsElements = groupsElements
        self.draw = draw
        #https://github.com/dwyl/english-words
        words_file="https://computersciencewiki.org/images/1/13/Words_alpha.txt"
        self.words = pd.read_csv(words_file, sep=" ", header=None)
        
          
    def generateData(self):
        df = pd.DataFrame()
        random.seed(1)
        
        for i in range(self.cols_float):
            col_name =  'float' + str(i)
            
            seed = random.randint(0,10000) 
            df[col_name] = self.lognuniform(use_int = False, seed = seed)
            if self.draw == True:
                self.plot_density(df[col_name])
                     
        for i in range(self.cols_int):
            col_name = 'int' + str(i)
            seed = random.randint(0,10000) 
            df[col_name] = self.lognuniform(use_int = True, seed = seed)
            if self.draw == True:
                self.plot_density(df[col_name])
        
        for i in range(self.cols_words):
            col_name = 'words' + str(i)
            df[col_name] = self.randomWords() #(N = n)
        
        for i in range(self.cols_group):
            col_name = 'group' + str(i)
            df[col_name] = self.randomGroups() #(N = n)
        
        for i in range(self.cols_string):
            col_name = 'string' + str(i)
            random.seed(random.randint(0,self.strrange))
            #n = random.randint(0,self.strrange) 
            df[col_name] = self.randomString() #(N = n)
        
        return df         
    
    def lognuniform(self, base=np.e, use_int = True, seed = 0):
        np.random.seed(seed) 
        # Note: you can not use random.seed here since the np.random.uniform needs np.random.seed not random.seed.
        multiplier = np.random.choice([-1,1], size= self.rows)
        # -5 is used since we done want most of the value to be less than base
        exponentials = np.random.uniform(low = -5, high = self.numrange, size = self.rows)
        #data = np.power(base, np.random.uniform(low = -self.numrange, high = self.numrange, size = self.rows))
        data = np.power(base, exponentials)  * multiplier
        if (use_int == True):
            return data.astype(int)
        if (use_int == False):
            return data
        
    def randomString(self):
        mylist = []
        # Options are Uppercase letters, lowercase letters, spaces (*10)
        options = (string.ascii_letters + string.digits + string.punctuation + ' '*10)
        for i in range(self.rows):
            mystring = ''.join(random.choice(options) for i in range(random.randint(0,self.strrange)))
            mylist.append(mystring)
        return mylist

    def randomWords(self):
        mylist = []
        for i in range(self.rows):
            mystring = (' '.join(str(self.words.iat[random.randint(0,self.words.shape[0]-1),0]) for i in range(random.randint(1,self.wordsCount)))).strip()
            mylist.append(mystring)
        return mylist
    
    def randomGroups(self):
        groups = []
        for i in range(self.groupsElements):
            mystring = (' '.join(str(self.words.iat[random.randint(0,self.words.shape[0]-1),0]) for i in range(random.randint(1,self.groupsWordsCount)))).strip()
            groups.append(mystring)
        mylist = []
        for i in range(self.rows):
            mystring = groups[random.randint(0,len(groups)-1)]
            mylist.append(mystring)
        return mylist
    
    def plot_density(self, col):
        count, bins, ignored = plt.hist(col, 15, density=True)
        plt.plot(bins, np.ones_like(bins), linewidth=2, color='r')
        plt.show()                

In [25]:
datagen = DataGenerator(rows = 10000, cols_float = 20, cols_int = 20, cols_string = 0, cols_words=15,cols_group=15)
df = datagen.generateData()
df.head()
df.to_csv("data_rand.csv", index=False)


In [6]:
datagen = DataGenerator()
df = datagen.generateData()
df.to_csv('data_rand_test.csv')
df.head()
df.shape


(20, 12)

In [7]:
datagen = DataGenerator(rows = 5000, cols_float = 20, cols_int = 20, cols_string = 1, cols_words=15,cols_group=15)
df = datagen.generateData()
df.head()
df.to_csv("data_rand.csv", index=False)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 71 columns):
float0     5000 non-null float64
float1     5000 non-null float64
float2     5000 non-null float64
float3     5000 non-null float64
float4     5000 non-null float64
float5     5000 non-null float64
float6     5000 non-null float64
float7     5000 non-null float64
float8     5000 non-null float64
float9     5000 non-null float64
float10    5000 non-null float64
float11    5000 non-null float64
float12    5000 non-null float64
float13    5000 non-null float64
float14    5000 non-null float64
float15    5000 non-null float64
float16    5000 non-null float64
float17    5000 non-null float64
float18    5000 non-null float64
float19    5000 non-null float64
int0       5000 non-null int32
int1       5000 non-null int32
int2       5000 non-null int32
int3       5000 non-null int32
int4       5000 non-null int32
int5       5000 non-null int32
int6       5000 non-null int32
int7       5000 

In [9]:
## Lets now look at the total memory consumed by the pandas dataframe in memory 
for dtype in ['float','int','object']:
    selected_dtype = df.select_dtypes(include=[dtype])
    mean_usage = selected_dtype.memory_usage(deep=True).mean() 
    total_usage = selected_dtype.memory_usage(deep=True).sum()
    print("Average memory usage for {} columns: {:03.2f} KB".format(dtype,mean_usage))
    print("Total memory usage for {} columns: {:03.2f} KB".format(dtype,total_usage))

Average memory usage for float columns: 38099.05 KB
Total memory usage for float columns: 800080.00 KB
Average memory usage for int columns: 19051.43 KB
Total memory usage for int columns: 400080.00 KB
Average memory usage for object columns: 419275.09 KB
Total memory usage for object columns: 13416803.00 KB


In [10]:
df.to_csv("data_rand.csv", index=False)

In [11]:
import csv
#import urllib2
url="https://computersciencewiki.org/images/1/13/Words_alpha.txt"
#response = urllib2.urlopen(url)
words = pd.read_csv(url, sep=" ", header=None)