# Data Generation using faker

In [2]:
from faker import Faker

In [3]:
fake  = Faker()

In [8]:
fake.name()

'Seth Berry'

In [9]:
fake.address()

'3925 Martinez Keys Suite 622\nNorth Andreachester, SC 95834'

In [10]:
fake.text()

'Director white would their kitchen commercial responsibility. Man policy police.\nIncluding cover population nor adult standard nice see. Establish happen continue general.'

In [17]:
import os

root_dir = './fake_text_data'
os.makedirs(root_dir, exist_ok=True)

num_folders = 3
files_per_folder = 5

for i in range(1, num_folders + 1):
    folder_path = os.path.join(root_dir, f'Area{i}')
    os.makedirs(folder_path, exist_ok=True)
    
    for j in range(1, files_per_folder + 1):
        file_path = os.path.join(folder_path, f'file{j}.txt')
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write('Name;Address;Description\n')
            for _ in range(10):
                name = fake.name()
                address = fake.address().replace('\n', ', ')
                description = fake.text()[:20]
                file.write(f'{name};{address};{description}\n')


# Text to Csv

## Testing

In [21]:
data = []
with open('./fake_text_data/Area1/file1.txt','r') as f:
    content = f.read().strip()
    lines = content.split('\n')
    for line in lines:
        data.append(line.split(';'))
print(data)
        

[['Name', 'Address', 'Description'], ['Ariana Carroll', '424 Anderson Key, West Kathyshire, IA 93195', 'Food store less buy '], ['Natalie Jones', '2119 John Plains, New Sydney, MN 31049', 'Maintain although ad'], ['Robert Salinas', '04698 Bullock Villages, Keithhaven, OH 88916', 'Investment hold actu'], ['Emily Frey', '440 Marc Bypass Apt. 486, Lake Cody, DC 67987', 'Operation sister awa'], ['Monica Howard', '96397 Hill Tunnel, New Robert, PR 55704', 'Sing what change his'], ['Olivia Mitchell', '1144 Kevin Inlet Apt. 826, Oliverstad, MS 80819', 'Also standard firm u'], ['Julia Duncan', 'Unit 0065 Box 7041, DPO AA 33149', 'Unit simple popular '], ['Elizabeth Richardson', 'USNV Vasquez, FPO AA 39192', 'Huge support despite'], ['Kelsey Reilly', '99844 Travis Ford, Katieville, UT 15262', 'Late benefit defense'], ['Michael Mcneil', '41798 Christina Meadows Suite 691, Port Pamela, WY 24595', 'Most ability role us']]


In [26]:
print(os.stat('./output.csv').st_size)

0


In [46]:
[print(r,f) for r,d,f in os.walk('./fake_text_data')]

./fake_text_data []
./fake_text_data/Area3 ['file2.txt', 'file4.txt', 'file5.txt', 'file1.txt', 'file3.txt']
./fake_text_data/Area1 ['file2.txt', 'file4.txt', 'file5.txt', 'file1.txt', 'file3.txt']
./fake_text_data/Area2 ['file2.txt', 'file4.txt', 'file5.txt', 'file1.txt', 'file3.txt']


[None, None, None, None]

In [48]:
for subdir,_,files in os.walk('./fake_text_data'):
    for file in files:
        file_path = os.path.join(subdir,file)
        txt_to_csv(file_path,'./output.csv')

In [56]:
import os 
import csv
from tqdm import tqdm

root_dir = './fake_text_data'

output_csv = './output.csv'

def txt_to_csv(file_path,output_csv_path):
    """
    Process txt file and append its content to the given csv.

    Parameters
    ----------
    file_path : str
        The path of txt file.
    output_csv_path : str
        The path of csv file.
    """
    with open(output_csv_path,'a',newline='') as csvfile:
        csvwriter = csv.writer(csvfile)

        with open(file_path,'r') as f:
            content = f.read().strip()
            if os.stat('./output.csv').st_size == 0:
                startIndex = 0
            else:
                startIndex = 1
            lines = content.split('\n')[startIndex:]
            for line in lines:
                csvwriter.writerow(line.split(';'))


def files_to_csv(root_directory_path,output_csv_path):
    """
    Aggregate multiple txt files into single csv file.

    Parameters
    ----------
    root_directory_path : str
        The path to the parent directory containing the txt files.
    output_csv_path : str
        The path to the output csv to be generated.
    """
    for subdir,_,files in os.walk(root_directory_path):
        for file in tqdm(files):
            file_path = os.path.join(subdir,file)
            txt_to_csv(file_path,output_csv_path)

In [88]:
files_to_csv('./fake_text_data','./output.csv')

0it [00:00, ?it/s]
100%|██████████| 5/5 [00:00<00:00, 4154.42it/s]
100%|██████████| 5/5 [00:00<00:00, 4901.03it/s]
100%|██████████| 5/5 [00:00<00:00, 4245.25it/s]


# Output csv using pandas

In [72]:
import pandas as pd

In [89]:
df = pd.read_csv('./output.csv')

In [80]:
df.head(10)

Unnamed: 0,Name,Address,Description
0,Kenneth Cabrera,"PSC 0090, Box 6881, APO AP 95570",Minute itself plan.
1,Megan Horn,"188 Morris Harbors Apt. 069, Williamton, RI 59074",Under every local gr
2,Janet Flores,"PSC 2649, Box 2960, APO AP 84622",Full young form whol
3,Richard Williams,"47886 Dillon Shores, Larsonland, DC 17078",Exactly behind sense
4,Kimberly Murray,"433 Rogers Squares, East Rebekahland, VA 96291",Five process find sp
5,Victoria Parrish,"59774 Brian Radial, East Jenniferfort, SD 57644",Model ever argue dec
6,Tracy Hall,"0147 Taylor Parkway, East Robert, LA 81711",World magazine many
7,Jesse Fitzgerald,"05705 Elizabeth Flats Suite 101, Hamptoncheste...",What we deal drop th
8,Daniel Rose,"8618 Kelley Spur, Anthonyshire, MI 95093",There stage field di
9,Theresa Garrett,"06216 Lawrence Mountains Suite 223, East Amand...",Hotel event protect


In [81]:
df.columns

Index(['Name', 'Address', 'Description'], dtype='object')

In [86]:
df['Name'].unique().shape

(150,)

In [84]:
df['Name'].shape

(151,)

In [87]:
df[df.duplicated(['Name'],keep=False)]

Unnamed: 0,Name,Address,Description
10,Anthony Smith,"65429 Brenda Knoll Apt. 903, Huffberg, NE 51401",Ground benefit dream
40,Anthony Smith,"097 Rebecca Gateway, Woodwardshire, LA 27235",Customer necessary c


In [90]:
df[df['Name']=='Ariana Carroll']

Unnamed: 0,Name,Address,Description
81,Ariana Carroll,,Food store less buy
