In [4]:
import pandas as pd
import numpy as np
import random
from faker import Faker

In [5]:
# Number of rows
n = 1000
# Seed the stdlib and NumPy generators so the synthetic data is reproducible
# after Restart Kernel -> Run All
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
# Generate sample data for the modified banking database
# Customer IDs: n distinct 4-digit IDs (1000-9999 inclusive). Sampling without
# replacement guarantees uniqueness and raises ValueError if n exceeds the pool,
# instead of looping forever the way a "retry until the set is full" loop would.
customer_ids_list = [str(cid) for cid in random.sample(range(1000, 10000), n)]
# Same IDs as a set (the name and type this cell has always exposed)
customer_ids = set(customer_ids_list)
# Account Number (10-digit format), also sampled without replacement so no two
# rows can share an account number
account_ids = [
    str(acct) for acct in random.sample(range(1_000_000_000, 10_000_000_000), n)
]
# Seed Faker's shared random generator so every fake.* call made with this
# instance yields the same values on each fresh run
Faker.seed(42)
# Create a Faker instance
fake = Faker()
# Generate names; each is drawn independently, so duplicates are possible
full_names = [fake.name() for _ in range(n)]
# Account status: one of three labels picked at random for every row
Acct_status = ['Active', 'Inactive', 'Dormant']
Acct_status_data = np.random.choice(Acct_status, size=n)
# Balance: uniform draw from [1, 10000000) dollars, rounded to two decimals
balance_data = np.random.uniform(low=1, high=10000000, size=n).round(2)
# Generate 16-digit ATM card numbers as exact 64-bit integers. Drawing them as
# floats loses precision: above 2**53 a float64 can only hold even integers,
# which skews the trailing digits of the largest card numbers.
atm_card_numbers = np.random.randint(10**15, 10**16, size=n, dtype=np.int64)
# Mask each number: keep the first 8 and last 4 digits and replace the 4 digits
# in between with asterisks (every number is exactly 16 digits long)
masked_atm_card_numbers = [
    f"{digits[:8]}****{digits[-4:]}" for digits in map(str, atm_card_numbers)
]
# Generate n distinct 5-digit Transaction IDs (10000-99999 inclusive).
# Sampling without replacement guarantees uniqueness and raises ValueError when
# n is larger than the pool, rather than retrying in an endless loop.
transaction_ids_list = [str(tid) for tid in random.sample(range(10000, 100000), n)]
# Same IDs as a set (the name and type this cell has always exposed)
transaction_ids = set(transaction_ids_list)
# Transaction amounts: uniform draw from [1000, 10000) dollars, rounded to two decimal places
transaction_amounts = np.random.uniform(low=1000, high=10000, size=n).round(2)
# Transaction dates: ISO-formatted dates between ten years ago and today
transaction_dates = [
    fake.date_between(start_date="-10y", end_date="today").isoformat()
    for _ in range(n)
]
# Transaction type: one of three labels picked at random for every row
transaction_types = ['Deposit', 'Withdrawal', 'Transfer']
transaction_type_data = np.random.choice(transaction_types, size=n)
# Credit scores: normally distributed (mean 700, standard deviation 50),
# rounded to the nearest whole number and stored as integers
credit_scores = np.random.normal(loc=700, scale=50, size=n).round().astype(int)
# Credit risk ratings: picked at random per row, independently of the score above
credit_risk_ratings = np.random.choice(['Low', 'Medium', 'High'], size=n)

# Dates of birth as 'YYYY-MM-DD' strings
birth_dates = []
for _ in range(n):
    # Taking 2023 as the current year, birth years 1953-2005 give ages 18 to 70
    year = random.randint(1953, 2005)
    # Days are capped at 28 so every (month, day) pair is a valid calendar date
    month, day = random.randint(1, 12), random.randint(1, 28)
    birth_dates.append(f"{year}-{month:02d}-{day:02d}")
# Fake email addresses, one per row, produced by the Faker instance
email_addresses = [fake.email() for _ in range(n)]
# Assemble all generated columns into one frame, then promote Customer_ID
# from a regular column to the row index
df = pd.DataFrame({
    'Customer_ID': customer_ids_list,
    'Account_Number': account_ids,
    'Account_Name': full_names,
    'Account_Status': Acct_status_data,
    'Balance': balance_data,
    'ATM_Card_Number': masked_atm_card_numbers,
    'Transaction_Ref': transaction_ids_list,
    'Trnx_Amount': transaction_amounts,
    'Trnx_Date': transaction_dates,
    'Trnx_Type': transaction_type_data,
    'Credit_Scores': credit_scores,
    'Risk_Ratings': credit_risk_ratings,
    'Date_of_Birth': birth_dates,
    'Email_Address': email_addresses,
}).set_index('Customer_ID')
# Inject missing values: pick n_points distinct rows (by index label) ...
n_points = 300
random_row_indices = np.random.choice(df.index, n_points, replace=False)
# ... and, for each picked row, choose one of these columns to blank out
columns_to_set_nan = ['Credit_Scores', 'Date_of_Birth', 'Email_Address']
random_columns_for_rows = np.random.choice(columns_to_set_nan, n_points)
# Credit_Scores is an integer column, which cannot hold NaN. Cast it to float
# explicitly instead of relying on pandas to upcast silently on assignment
# (recent pandas versions warn about, and are slated to reject, that upcast).
df['Credit_Scores'] = df['Credit_Scores'].astype(float)
# Set NaN values in the DataFrame, one cell per selected row
for row_label, column in zip(random_row_indices, random_columns_for_rows):
    df.at[row_label, column] = np.nan

In [42]:
# Split the combined frame into three relations (account, customer, transaction);
# each slice keeps the Customer_ID index of the source frame
Acct_details = df.loc[:, ['Account_Name', 'Account_Status', 'Balance', 'ATM_Card_Number', 'Risk_Ratings']]
Cust_details = df.loc[:, ['Account_Number', 'Date_of_Birth', 'Email_Address', 'Credit_Scores']]
Trnx_details = df.loc[:, ['Transaction_Ref', 'Trnx_Amount', 'Trnx_Date', 'Trnx_Type']]
# Write each relation to its own CSV file in the working directory
Acct_details.to_csv('Account_Details.csv')
Cust_details.to_csv('Customer_Details.csv')
Trnx_details.to_csv('Transaction_Details.csv')

In [6]:
# Display the generated frame; pandas abbreviates the middle rows of a long frame
df

Unnamed: 0_level_0,Account_Number,Account_Name,Account_Status,Balance,ATM_Card_Number,Transaction_Ref,Trnx_Amount,Trnx_Date,Trnx_Type,Credit_Scores,Risk_Ratings,Date_of_Birth,Email_Address
Customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
6481,3709213964,Deborah Knight,Active,1230271.56,42310826****6494,92651,1623.06,2016-01-30,Deposit,715.0,Medium,1956-04-28,pcarroll@example.org
9509,5310228500,Phillip Duncan,Dormant,1281944.16,94988489****1956,38986,6714.11,2015-08-16,Deposit,,High,1987-07-03,qsmith@example.org
5359,8343323193,Blake West,Active,4130284.20,29427277****8338,56990,2098.09,2022-06-25,Deposit,667.0,Medium,1974-03-12,
3424,7207391255,Kelly Williams,Active,4458152.41,58373279****7090,83102,2272.24,2022-01-18,Withdrawal,680.0,Low,1990-06-16,ahawkins@example.org
8316,9833977637,Maria Robinson,Dormant,8923925.94,25181495****7998,53295,1151.61,2016-09-24,Transfer,768.0,Low,1979-04-06,russell16@example.org
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2460,6321520090,Kimberly Evans,Inactive,4606024.79,99255167****5228,95628,6852.77,2017-01-10,Withdrawal,651.0,High,,yowen@example.com
7661,7173144859,Michael Poole,Active,3790586.40,98649009****8574,98771,9591.93,2022-08-02,Transfer,741.0,Low,,batesedward@example.org
6673,2136101063,Jacob Cardenas,Active,4181785.51,87393040****1485,54810,5339.51,2022-10-15,Withdrawal,671.0,Medium,1996-01-10,
8041,5024931645,Bruce Diaz,Active,5294542.53,84319929****3792,75636,5344.51,2019-06-03,Deposit,634.0,Medium,1992-02-17,mcarter@example.net
