In [39]:
import pandas as pd
import numpy as np
import random
from faker import Faker

In [40]:
# Number of rows to generate
n = 1000

# Seed Python's `random` module and NumPy's global RNG so that
# Restart Kernel -> Run All regenerates the same values from these two sources.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Generate sample data for the modified banking database

# Customer IDs: n distinct 4-digit identifiers stored as strings.
# Sampling without replacement guarantees uniqueness in a single call, raises
# ValueError (instead of looping forever) when n exceeds the 9000 available
# IDs, and - unlike list(set(...)) over strings - produces an order that does
# not depend on Python's per-process string hash randomisation.
customer_ids_list = (
    np.random.choice(np.arange(1000, 10000), size=n, replace=False)
    .astype(str)
    .tolist()
)

# The set form is kept because the original cell exposed it under this name.
customer_ids = set(customer_ids_list)

# Account Number (assuming a 10-digit format). Values are drawn independently,
# so uniqueness is not enforced here.
account_ids = [str(random.randint(1000000000, 9999999999)) for _ in range(n)]

# Create a Faker instance.
# Faker draws from its own random generator, so it is seeded here to make the
# fake names, dates and e-mail addresses generated in this cell repeatable.
Faker.seed(42)
fake = Faker()

# Generate account-holder names; Faker may return the same name more than once
full_names = [fake.name() for _ in range(n)]

# Account status: one label per row, drawn from the three allowed values
Acct_status = ['Active', 'Inactive', 'Dormant']
Acct_status_data = np.random.choice(Acct_status, size=n)

# Balance: uniform random amounts in [1, 10,000,000) dollars, kept to two decimals
balance_data = np.random.uniform(low=1, high=10000000, size=n).round(2)

# Generate 16-digit ATM card numbers.
# The integers are drawn directly rather than by rounding np.random.uniform
# output: float64 cannot represent every 16-digit integer (adjacent values are
# 2 apart above 2**53), so the float route can never yield an odd final digit
# for the largest card numbers.
atm_card_numbers = np.random.randint(10**15, 10**16, size=n, dtype=np.int64)

# Mask each number: keep the first 8 and the last 4 digits and replace the
# 4 digits in between with asterisks.
masked_atm_card_numbers = [
    f"{digits[:8]}****{digits[-4:]}"
    for digits in (str(card_number) for card_number in atm_card_numbers)
]

# Generate n distinct 5-digit Transaction IDs, stored as strings.
# Sampling without replacement guarantees uniqueness in a single call, raises
# ValueError (instead of looping forever) when n exceeds the 90000 available
# IDs, and - unlike list(set(...)) over strings - produces an order that does
# not depend on Python's per-process string hash randomisation.
transaction_ids_list = (
    np.random.choice(np.arange(10000, 100000), size=n, replace=False)
    .astype(str)
    .tolist()
)

# The set form is kept because the original cell exposed it under this name.
transaction_ids = set(transaction_ids_list)

# Transaction amounts: uniform random values in [1000, 10000) dollars, rounded to two decimal places
transaction_amounts = np.random.uniform(low=1000, high=10000, size=n).round(2)

# Transaction dates: ISO-formatted dates between ten years ago and today
transaction_dates = []
for _ in range(n):
    trnx_day = fake.date_between(start_date="-10y", end_date="today")
    transaction_dates.append(trnx_day.isoformat())

# Transaction type: one label per row
transaction_types = ['Deposit', 'Withdrawal', 'Transfer']
transaction_type_data = np.random.choice(transaction_types, size=n)

# Credit scores: drawn from a normal distribution (mean 700, standard deviation 50),
# then rounded to the nearest whole number and stored as integers
raw_scores = np.random.normal(loc=700, scale=50, size=n)
credit_scores = np.rint(raw_scores).astype(int)

# Credit risk ratings: one label per row, drawn independently of the scores
risk_levels = ['Low', 'Medium', 'High']
credit_risk_ratings = np.random.choice(risk_levels, size=n)

# Generate dates of birth as YYYY-MM-DD strings.
# Birth years 1953-2005 give ages 18 to 70 assuming the current year is 2023;
# days stop at 28 so that every year/month/day combination is a valid date.
birth_dates = []
for _ in range(n):
    year, month, day = random.randint(1953, 2005), random.randint(1, 12), random.randint(1, 28)
    birth_dates.append("{}-{:02d}-{:02d}".format(year, month, day))

# Generate fake e-mail addresses with the Faker library
email_addresses = []
for _ in range(n):
    email_addresses.append(fake.email())
# Assemble the generated columns into one DataFrame and use Customer_ID as its index
bank_columns = {
    'Customer_ID': customer_ids_list,
    'Account_Number': account_ids,
    'Account_Name': full_names,
    'Account_Status': Acct_status_data,
    'Balance': balance_data,
    'ATM_Card_Number': masked_atm_card_numbers,
    'Transaction_Ref': transaction_ids_list,
    'Trnx_Amount': transaction_amounts,
    'Trnx_Date': transaction_dates,
    'Trnx_Type': transaction_type_data,
    'Credit_Scores': credit_scores,
    'Risk_Ratings': credit_risk_ratings,
    'Date_of_Birth': birth_dates,
    'Email_Address': email_addresses,
}
df = pd.DataFrame(bank_columns).set_index('Customer_ID')
# Randomly select 300 unique row labels (Customer_IDs) that will receive a missing value.
# replace=False makes np.random.choice raise ValueError if n_points exceeds the number of rows.
n_points = 300
random_row_indices = np.random.choice(df.index, n_points, replace=False)

# Randomly select one of the specified columns for each selected row index
columns_to_set_nan = ['Credit_Scores', 'Date_of_Birth', 'Email_Address']
random_columns_for_rows = np.random.choice(columns_to_set_nan, n_points)

# Credit_Scores is an integer column, which cannot hold NaN. Cast it to float
# explicitly before injecting missing values: the silent int -> float upcast
# that the assignment would otherwise rely on has been deprecated since
# pandas 2.1 and is slated to become an error.
df['Credit_Scores'] = df['Credit_Scores'].astype('float64')

# Set NaN values in the DataFrame, one (row, column) cell per selected row
for row_label, column_name in zip(random_row_indices, random_columns_for_rows):
    df.at[row_label, column_name] = np.nan

In [41]:
# Preview the generated DataFrame (rich display of the frame, indexed by Customer_ID)
df

Unnamed: 0_level_0,Account_Number,Account_Name,Account_Status,Balance,ATM_Card_Number,Transaction_Ref,Trnx_Amount,Trnx_Date,Trnx_Type,Credit_Scores,Risk_Ratings,Date_of_Birth,Email_Address
Customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1241,3968562206,Nancy Campbell,Inactive,7070141.69,29394641****0472,66155,2631.60,2020-04-18,Deposit,695.0,Medium,1982-08-15,scott60@example.org
1158,4855068864,Sean Curtis,Inactive,4434883.33,33905178****5534,21042,4715.89,2014-12-05,Withdrawal,653.0,Low,1965-03-22,jackjohnson@example.org
1377,3051457819,Michael Black,Active,6275201.65,15092345****8343,61116,8280.14,2017-06-05,Withdrawal,752.0,Medium,1970-09-07,sherry79@example.com
7736,6536737005,Tony Atkins,Inactive,1626574.59,84843552****8492,77950,1328.95,2020-07-25,Transfer,750.0,Medium,1959-12-03,gonzalezjessica@example.net
9539,3976994777,Erika Hutchinson,Inactive,7752954.91,75131498****9181,26025,7749.03,2020-01-07,Deposit,718.0,High,1995-07-04,brooksashley@example.org
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8062,6553906896,Gina Horton,Inactive,4104975.58,58203601****3150,17051,5973.59,2017-11-11,Transfer,720.0,Medium,1956-12-17,hduncan@example.com
2607,5814026272,Katherine Charles DDS,Dormant,4639891.12,63234802****2433,81749,7890.90,2021-12-25,Deposit,,Low,1973-02-05,jose93@example.com
2383,7667262662,Denise Morris,Active,9643410.72,15406829****1903,97884,7320.62,2014-05-03,Withdrawal,624.0,Low,1967-01-12,hatfieldbradley@example.net
4531,9248140160,Janet Mullen,Dormant,2255739.81,49521902****5396,37067,6452.64,2021-08-31,Deposit,661.0,High,1960-07-17,matthewsdaniel@example.net


In [42]:
# Split the dataset into account, customer and transaction tables.
# Each table is a column subset of df and therefore keeps df's index.
account_columns = ['Account_Name', 'Account_Status', 'Balance', 'ATM_Card_Number', 'Risk_Ratings']
customer_columns = ['Account_Number', 'Date_of_Birth', 'Email_Address', 'Credit_Scores']
transaction_columns = ['Transaction_Ref', 'Trnx_Amount', 'Trnx_Date', 'Trnx_Type']

Acct_details = df[account_columns]
Cust_details = df[customer_columns]
Trnx_details = df[transaction_columns]

# Write each table to its own CSV file (the index is written as the first column)
for table, csv_path in (
    (Acct_details, 'Account_Details.csv'),
    (Cust_details, 'Customer_Details.csv'),
    (Trnx_details, 'Transaction_Details.csv'),
):
    table.to_csv(csv_path)