<a href="https://colab.research.google.com/github/nanna273/data_heart_case_study/blob/main/Data_heart_generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import random
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import unittest
import scipy.stats as stats

## Synthetic Data Requirements  
Separate synthetic datasets need to be generated in Python for each charity, capturing the differences between the charities. The data should be saved as Excel files or .csv files. The base data should follow the structure:  
A donor table, with one row of information per donor. The data fields should include:  
* donor id - e.g. unique 12 digit integer (int)
* age (at first donation) (int)
* gender (string)
* postcode index of deprivation (int).  

A donations table with one row per donation. Each donation should link back to a donor in the donor table. The data fields should include:
* donation id (int)
* donor id (int)
* gift date (date)
* gift amount (float)
* attributed channel, e.g. face to face etc (?)
* gift aidable (bool).  

The data itself should cover a 5 year period from July 2019 to June 2024.


## Generate donor table
A donor table, with one row of information per donor. The data fields should include:  
* donor id - e.g. unique 12 digit integer (int)
* age (at first donation) (int)
* gender (string)
* postcode index of deprivation (int).  

In [2]:
def generate_donor_table(
        length, age_parameters, gender_parameters, postcode_parameters
        ):
    """Build a synthetic donor table with one row per donor.

    Parameters
    ----------
    length : int
        Number of donors (rows) to generate.
    age_parameters : sequence of two ints
        ``(age_min, age_max)``. Ages are drawn uniformly and both bounds
        are inclusive.
    gender_parameters : tuple
        ``(gender_list, gender_weights)``: the labels to choose from and
        the probability of each label, passed to ``np.random.choice``.
    postcode_parameters : any
        Currently unused. The postcode index is always drawn uniformly
        from 1 to 32844 inclusive.

    Returns
    -------
    pandas.DataFrame
        Indexed by donor id (unique, 12-character, zero-padded string)
        with the columns ``'age'``, ``'gender'`` and
        ``'postcode index of need'``.
    """
    # Generate unique donor ids.
    id_max = 10 ** 12 # Donor IDs are 12 digits.
    # random.sample draws without replacement, which is what keeps ids unique.
    donor_ids = random.sample(range(0, id_max), length)
    # Convert to string and add leading 0s.
    donor_ids = [str(donor_id).zfill(12) for donor_id in donor_ids]

    # Generate age. np.random.randint excludes its upper bound, so add 1 to
    # make age_max itself attainable (same treatment as the postcode below).
    age_min, age_max = age_parameters
    age = np.random.randint(age_min, age_max + 1, length)

    # Generate gender.
    gender_list, gender_weights = gender_parameters
    gender = np.random.choice(gender_list, length, p=gender_weights)

    # Generate postcode.
    postcode_min, postcode_max = 1, 32844 # Don't forget, needs to be inclusive.
    postcode = np.random.randint(postcode_min, postcode_max + 1, length)

    # Convert output to DataFrame.
    data_dict = {'age' : age,
                'gender' : gender,
                'postcode index of need' : postcode
                }
    output_table = pd.DataFrame(data_dict, index=donor_ids)
    return output_table

In [3]:
# Seed both random number generators used by the table generators (the
# standard-library `random` module and NumPy's global generator) so that a
# fresh "Restart & Run All" reproduces the same synthetic data.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

table_length = 10
age_parameters = [35, 55] # Parameters not yet fully set up.
# 'M' = male, 'F' = female, 'N' = non-binary, 'O' = other/prefer not to say.
gender_parameters = ['M', 'F', 'N', 'O'], [0.45, 0.5, 0.03, 0.02]
postcode_parameters = 0 # Parameters not yet fully set up.
donor_table = generate_donor_table(
    table_length, age_parameters, gender_parameters, postcode_parameters
    )

## Generate donations table
A donations table with one row per donation. Each donation should link back to a donor in the donor table. The data fields should include:
* donation id (int)
* donor id (int)
* gift date (date)
* gift amount (float)
* attributed channel, e.g. face to face etc (?)
* gift aidable (bool).  

The data itself should cover a 5 year period from July 2019 to June 2024.

In [39]:
def generate_donations_table(
        donor_table, length,
        date_parameters, amount_parameters, channel_parameters,
        aidable_parameters
        ):
    """Build a synthetic donations table with one row per donation.

    Parameters
    ----------
    donor_table : pandas.DataFrame
        Donor table whose index supplies the donor ids to sample from.
    length : int
        Number of donations (rows) to generate.
    date_parameters : sequence of two dates
        ``(date_min, date_max)``. Gift dates fall between the two, with
        both ends included.
    amount_parameters : sequence of two numbers
        ``(amount_min, amount_max)`` bounds for the uniform gift amount.
    channel_parameters : tuple
        ``(channel_list, channel_weights)`` passed to ``np.random.choice``.
    aidable_parameters : tuple
        ``(aidable_list, aidable_weights)`` passed to ``np.random.choice``.

    Returns
    -------
    pandas.DataFrame
        Indexed by donation id (1 to ``length``) with the columns
        ``'donor_id'``, ``'donation_date'``, ``'donation_amount'``,
        ``'donation_channel'`` and ``'donation_aidable'``.
    """
    # Donation ids are simply consecutive integers starting at 1.
    row_ids = range(1, length + 1)

    # Every donation is attributed to a donor picked at random from the
    # donor table; the same donor can be picked more than once.
    sampled_donors = np.random.choice(donor_table.index.to_list(), length)

    # Gift dates: a random day offset from the first date, then sorted so
    # that donation ids follow chronological order.
    first_day, last_day = date_parameters
    span_days = (last_day - first_day).days
    gift_dates = sorted(
        first_day + datetime.timedelta(days=random.randint(0, span_days))
        for _ in range(length)
    )

    # Gift amounts: uniform draws rounded to two decimal places. They are
    # not linked to any other field.
    lowest, highest = amount_parameters
    gift_amounts = [
        round(random.uniform(lowest, highest), 2) for _ in range(length)
    ]

    # Attributed channel: weighted random choice.
    channel_options, channel_probs = channel_parameters
    gift_channels = np.random.choice(channel_options, length, p=channel_probs)

    # Gift-aidable flag: weighted random choice.
    aidable_options, aidable_probs = aidable_parameters
    gift_aidable = np.random.choice(aidable_options, length, p=aidable_probs)

    # Assemble the columns into the output frame.
    donations_table = pd.DataFrame(
        {
            'donor_id' : sampled_donors,
            'donation_date' : gift_dates,
            'donation_amount' : gift_amounts,
            'donation_channel' : gift_channels,
            'donation_aidable' : gift_aidable,
        },
        index=row_ids,
    )
    return donations_table

In [40]:
# Parameters for a 40-row donations table covering 1 July 2019 to
# 30 June 2024. Donor ids are sampled from the `donor_table` built earlier in
# the notebook.
table_length = 40
date_parameters = [
    datetime.date(2019, 7, 1),
    datetime.date(2024, 6, 30),
]
amount_parameters = [5, 100]
channel_parameters = (
    ['Postal', 'Digital', 'Face to Face'],
    [0.5, 0.3, 0.2],
)
aidable_parameters = ([True, False], [0.6, 0.4])
# Left as the cell's final expression so the resulting table is displayed.
generate_donations_table(
    donor_table,
    table_length,
    date_parameters,
    amount_parameters,
    channel_parameters,
    aidable_parameters,
)

Unnamed: 0,donor_id,donation_date,donation_amount,donation_channel,donation_aidable
1,432840491153,2019-09-28,27.65,Postal,True
2,589470015143,2019-11-24,26.2,Face to Face,True
3,930840835015,2020-01-19,23.95,Postal,True
4,807473855202,2020-01-23,39.2,Postal,False
5,589470015143,2020-06-10,80.48,Postal,False
6,589470015143,2020-08-29,26.99,Digital,True
7,354937391390,2020-09-04,52.72,Postal,True
8,354937391390,2020-09-15,74.7,Digital,False
9,589470015143,2020-09-29,5.01,Digital,True
10,120256318240,2020-11-04,17.27,Digital,True


## Tests

**Donor table**
1. Length of table.
2. Length of donor IDs.
3. Ages in correct range, and integers.
4. Gender is selected from the correct list.
5. Postcode index of need is set up correctly.

**Donations table**
1. Length of table


In [45]:
# Build a dedicated donor table for the unit tests below.
table_length = 10
age_parameters = [35, 55] # Parameters not yet fully set up.
# 'M' = male, 'F' = female, 'N' = non-binary, 'O' = other/prefer not to say.
gender_parameters = ['M', 'F', 'N', 'O'], [0.45, 0.5, 0.03, 0.02]
postcode_parameters = 0 # Parameters not yet fully set up.
test_donor_table = generate_donor_table(
    table_length, age_parameters, gender_parameters, postcode_parameters
    )

# Build the donations fixture from the test donor table (not the notebook's
# earlier `donor_table`), so that its donor ids link back to the test donors
# and this cell does not rely on state left behind by other cells.
table_length = 40
date_parameters = [datetime.date(2019, 7, 1), datetime.date(2024, 6, 30)]
amount_parameters = [5, 100]
channel_parameters = ['Postal', 'Digital', 'Face to Face'], [0.5, 0.3, 0.2]
aidable_parameters = [True, False], [0.6, 0.4]
test_donations_table = generate_donations_table(
    test_donor_table, table_length, date_parameters, amount_parameters,
    channel_parameters, aidable_parameters
    )

class test_tables(unittest.TestCase):
    # First check the donor table.

    def test_table_length(self):
        # Check table is correct length.
        self.assertEqual(len(test_donor_table), 10)

    def test_donor_id_length(self):
        # Check the length of all donor IDs in table is 12.
        id_lengths = [len(id) for id in test_donor_table.index]
        id_set = set(id_lengths)
        self.assertEqual(len(id_set), 1)
        self.assertEqual(id_set, {12})

    def test_age(self):
        # Check ages are in the correct range. By using the 'range' function, it
        # is ensured that the ages are integers.
        self.assertTrue(
            set(range(35, 56)).issuperset(set(test_donor_table['age'].unique())))

    def test_gender(self):
        # Check that genders have been selected from the correct set.
        self.assertTrue(
            set(['M', 'F', 'N', 'O']).issuperset(set(test_donor_table['gender'].unique())))

    def test_postcode(self):
        # Check that postcode index of need is within the range.
        self.assertTrue(
            set(range(1, 32845)).issuperset(test_donor_table['postcode index of need'].unique()))

    # Then check the donation table.

# Run the unit tests from inside the notebook. The dummy argv entry keeps
# unittest from parsing the kernel's own command-line arguments, and
# exit=False stops it from calling sys.exit() once the run has finished.
unittest.main(
    argv=['first-arg-is-ignored'],
    exit=False,
)

.....
----------------------------------------------------------------------
Ran 5 tests in 0.014s

OK


<unittest.main.TestProgram at 0x7d1b7cf290f0>