# Creating Raw Data
Here, I create the raw tables, which include:
*   Customers
*   Subscriptions
*   Transactions
*   Product Usage
*   Campaign Exposure

This is the foundation of the entire project.







# Import Pandas, NumPy and Faker


*   **Pandas:**

    Pandas is used to work with tables of data.

    It helps to clean, analyze and organize data easily.

*   **NumPy:**

    NumPy is used for fast mathematical calculations on numbers and arrays.

    It is used for statistics, machine learning and scientific computing.


*   **Faker:**

    Faker is used to generate fake data like names, emails and addresses.

    It is useful for testing programs and creating sample datasets.




In [1]:
!pip install pandas numpy faker

Collecting faker
  Downloading faker-40.4.0-py3-none-any.whl.metadata (16 kB)
Downloading faker-40.4.0-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-40.4.0


In [2]:
import pandas as pd
import numpy as np
from faker import Faker
import random

# One seed shared by every random source imported above, so that
# "Restart Kernel -> Run All" regenerates the same data.
SEED = 42

fake = Faker()
Faker.seed(SEED)      # seeds the generator used by Faker instances
random.seed(SEED)     # seeds the standard-library generator
np.random.seed(SEED)  # seeds NumPy's global generator (used by the np.random.* calls)

# Creating a Customers Table

In [3]:
# Number of synthetic customers to generate.
n_customers = 5000

# Draw the random columns first, in the same order as they appear in the table,
# so the sequence of np.random calls is unchanged.
signup_day_offsets = np.random.randint(0, 365, n_customers)  # 0..364 days after 2023-01-01
countries = np.random.choice(["US","UK","India","Canada"], n_customers)
plan_types = np.random.choice(["Basic","Standard","Premium"], n_customers, p=[0.4,0.4,0.2])
devices = np.random.choice(["iOS","Android","Web"], n_customers)

# One row per customer; customer_id runs from 1 to n_customers.
customers = pd.DataFrame({
    "customer_id": range(1, n_customers + 1),
    "signup_date": pd.to_datetime("2023-01-01") + pd.to_timedelta(signup_day_offsets, unit="D"),
    "country": countries,
    "plan_type": plan_types,
    "device": devices,
})

customers.head()

Unnamed: 0,customer_id,signup_date,country,plan_type,device
0,1,2023-04-13,UK,Standard,Android
1,2,2023-12-15,Canada,Basic,iOS
2,3,2023-09-28,Canada,Standard,Android
3,4,2023-04-17,UK,Premium,Android
4,5,2023-03-13,India,Basic,Android


# Creating a Subscriptions Table

In [4]:
# Monthly price for each plan tier.
plan_prices = {
    "Basic":8,
    "Standard":12,
    "Premium":18
}

# One subscription row per customer: keep the id and signup date, then add
# the price looked up from the customer's plan and a random 0/1 flag (P(1) = 0.8).
subscriptions = (
    customers[["customer_id", "signup_date"]]
    .assign(
        monthly_price=customers["plan_type"].map(plan_prices),
        is_active=np.random.choice([0,1], len(customers), p=[0.2,0.8]),
    )
)

subscriptions.head()

Unnamed: 0,customer_id,signup_date,monthly_price,is_active
0,1,2023-04-13,12,0
1,2,2023-12-15,8,1
2,3,2023-09-28,12,1
3,4,2023-04-17,18,0
4,5,2023-03-13,8,1


# Creating a Transactions Table

In [5]:
# Start of the transaction window; parsed once instead of once per generated row.
txn_start = pd.to_datetime("2024-01-01")

# Each customer gets 3-11 transactions. Every transaction has a date 0-179 days
# after txn_start and an amount picked from a fixed list.
# The draw order (count per customer, then day offset and amount per row) is the
# same as in a nested append loop, so a given seed yields the same rows.
transactions_list = [
    [
        cid,
        txn_start + pd.to_timedelta(np.random.randint(0,180),"D"),
        np.random.choice([8,12,18,25]),
    ]
    for cid in customers.customer_id
    for _ in range(np.random.randint(3,12))
]

transactions = pd.DataFrame(transactions_list, columns=["customer_id","txn_date","amount"])
transactions.head()

Unnamed: 0,customer_id,txn_date,amount
0,1,2024-01-31,18
1,1,2024-04-18,25
2,1,2024-06-06,18
3,2,2024-03-25,18
4,2,2024-02-25,25


# Creating a Usage Table

In [6]:
# Start of the usage window; parsed once instead of once per generated row.
usage_start = pd.to_datetime("2024-01-01")

# Each customer gets 20-59 usage events. Every event has a date 0-179 days after
# usage_start and a duration of 5-119 minutes.
# The draw order (count per customer, then day offset and minutes per row) is the
# same as in a nested append loop, so a given seed yields the same rows.
usage_list = [
    [
        cid,
        usage_start + pd.to_timedelta(np.random.randint(0,180),"D"),
        np.random.randint(5,120),
    ]
    for cid in customers.customer_id
    for _ in range(np.random.randint(20,60))
]

usage = pd.DataFrame(usage_list, columns=["customer_id","event_date","minutes_used"])
usage.head()

Unnamed: 0,customer_id,event_date,minutes_used
0,1,2024-01-31,112
1,1,2024-06-13,106
2,1,2024-02-16,75
3,1,2024-06-11,95
4,1,2024-06-02,60


# Creating a Campaign Exposure Table

In [8]:
# Start of the exposure window; parsed once instead of once per generated row.
campaign_start = pd.to_datetime("2024-03-01")

# Each customer is included with probability 0.6. An included customer gets one
# row: a random channel, an exposure date 0-59 days after campaign_start, and a
# random control/treatment label.
# The rows are collected under their own name (campaigns_list) so `campaigns`
# only ever refers to the DataFrame. The draw order (inclusion test, channel,
# day offset, group) matches a plain loop, so a given seed yields the same rows.
campaigns_list = [
    [
        cid,
        np.random.choice(["Email","Push","Ads"]),
        campaign_start + pd.to_timedelta(np.random.randint(0,60),"D"),
        np.random.choice(["control","treatment"]),
    ]
    for cid in customers.customer_id
    if np.random.rand() < 0.6
]

campaigns = pd.DataFrame(campaigns_list, columns=[
    "customer_id","channel","exposure_date","group"
])

campaigns.head()


Unnamed: 0,customer_id,channel,exposure_date,group
0,1,Ads,2024-04-05,treatment
1,2,Push,2024-04-09,treatment
2,4,Email,2024-03-11,treatment
3,8,Email,2024-04-12,control
4,9,Push,2024-04-24,treatment


# Saving all data

In [9]:
# Output file -> table, listed in the order the files are written.
csv_outputs = {
    "customers.csv": customers,
    "subscriptions.csv": subscriptions,
    "transactions.csv": transactions,
    "usage.csv": usage,
    "campaigns.csv": campaigns,
}

# Write every table to the working directory without the index column.
for csv_name, table in csv_outputs.items():
    table.to_csv(csv_name, index=False)