In [1]:
import pandas as pd
import seaborn as sns
import plotly
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display, clear_output


In [2]:
df = pd.read_csv("../data/subscriptions.csv")
email = 'Customer Email'
name = 'Customer Name'
c_id = 'Customer ID'

In [3]:
def contains_shir(row):
    # exception
    if row[email] == 'kshirjarohannaik@gmail.com':
        return False
    elif 'shir' in row[email]:
        return True
    elif 'shir' in row[name]:
        return True
    else:
        return False

In [4]:
## to detect duplicates & shirs - need to lower

print(f"before removing {df.shape = }")

df[email] = df[email].str.lower()
df[name] = df[name].str.lower()
shir_mask = df.apply(contains_shir, axis=1)
print(f"{shir_mask.sum() = }")
df = df[~shir_mask]
print(f"after removing {df.shape = }")



before removing df.shape = (283, 17)
shir_mask.sum() = 13
after removing df.shape = (270, 17)


In [5]:
dup_e = df[email].duplicated(keep=False)
print(f"{dup_e.sum() = }")
dup_n = df[name].duplicated(keep=False)
print(f"{dup_n.sum() = }")
dup_c = df[c_id].duplicated(keep=False)
print(f"{dup_c.sum() = }")


dup_e.sum() = 85
dup_n.sum() = 85
dup_c.sum() = 49


### for duplicated names & emails, they are 1-1 unique

In [9]:
both_dup_df = df[dup_e & dup_n][[name, email]]#.sort_values(by = identifiers[0])
print(f"{both_dup_df.shape = }")
print(f"{both_dup_df[email].nunique() = }")
print(f"{both_dup_df[name].nunique() = }")
print((both_dup_df.groupby(email)[name].nunique() == 1).all())
print((both_dup_df.groupby(name)[email].nunique() == 1).all())



both_dup_df.shape = (80, 2)
True
True
both_dup_df[email].nunique() = 34
both_dup_df[name].nunique() = 34


In [8]:
dup_email_only = df[dup_e & ~dup_n][[name, email]]
print(f"{dup_email_only.shape = }")
dup_name_only = df[~dup_e & dup_n][[name, email]]
print(f"{dup_name_only.shape = }")


dup_email_only.sort_values(by = email).to_csv("name_unique_email_duplications.csv", index = False)
dup_name_only.sort_values(by = name).to_csv("email_unique_name_duplications.csv", index = False)

dup_email_only.shape = (5, 2)
dup_name_only.shape = (5, 2)


## Make duplicates analysis

In [10]:
# Convert relevant date columns to datetime
df['Start Date (UTC)'] = pd.to_datetime(df['Start Date (UTC)'])
df['Canceled At (UTC)'] = pd.to_datetime(df['Canceled At (UTC)'])

# Extract the month and year
df['start_month'] = df['Start Date (UTC)'].dt.to_period('M')
df['cancel_month'] = df['Canceled At (UTC)'].dt.to_period('M')


times = ['Start Date (UTC)', 'Canceled At (UTC)']
info_cols = ['start_month', 'cancel_month', "Status"]

resolved_rows = []

In [11]:
df['customer_id'] = df[email] + '-' + df[name]
# duplicated 
dup_customer = df['customer_id'].duplicated(keep=False)
print(f"{dup_customer.sum() =  }")
duplicated_customers_list = df[dup_customer]['customer_id'].unique()

dup_customer.sum() =  80


In [14]:
# Wrapper Output outside the function, so it persists across calls
output_area = widgets.Output()

def review_customer(index=0):
    if index >= len(duplicated_customers_list):
        with output_area:
            clear_output(wait=True)
            print("✅ Done reviewing all duplicates.")
        return

    customer = duplicated_customers_list[index]
    df_customer = df[df["customer_id"] == customer][info_cols]

    # Create dropdown and button inside this function
    dropdown = widgets.Dropdown(
        options=df_customer.index.tolist(),
        description='Pick index:',
        disabled=False,
    )

    button = widgets.Button(description="Confirm")

    def on_button_clicked(b):
        resolved_rows.append(df.loc[dropdown.value])
        # Clear and move to next customer
        review_customer(index + 1)

    button.on_click(on_button_clicked)

    with output_area:
        clear_output(wait=True)
        print(f"Reviewing customer {index + 1}/{len(duplicated_customers_list)}")
        print(f"{customer = }")
        display(df_customer)
        display(dropdown, button)

# Start the process
display(output_area)
review_customer(0)


Output()

In [31]:
info_cols = ['customer', 'Start Date (UTC)', 'Canceled At (UTC)']

In [32]:
pd.concat([i[info_cols] for i in resolved_rows])

customer                      azlinayeo@gmail.com-azlina yeo
Start Date (UTC)                         2025-07-13 17:15:00
Canceled At (UTC)                        2025-07-14 00:55:00
customer                    yingshijinb@gmail.com-yingshi ji
Start Date (UTC)                         2025-07-09 23:00:00
Canceled At (UTC)                                        NaT
customer                   moshed88@hotmail.com-moshe dabach
Start Date (UTC)                         2025-07-06 23:00:00
Canceled At (UTC)                                        NaT
customer                            lukyip27@gmail.com-y luk
Start Date (UTC)                         2025-07-06 16:40:00
Canceled At (UTC)                                        NaT
customer             anirudhmvr@gmail.com-anirudh manchiraju
Start Date (UTC)                         2025-06-17 20:53:00
Canceled At (UTC)                                        NaT
dtype: object

In [30]:
pd.DataFrame(resolved_rows[0].values)

RangeIndex(start=0, stop=1, step=1)

In [None]:
# the customer id is redundent
print((dup_c == (dup_e & dup_c)).all())
print((dup_c == (dup_n & dup_c)).all())
df = df.drop(c_id, axis=1)

In [14]:
duplicated_customers_list

array(['azlinayeo@gmail.com-azlina yeo',
       'yingshijinb@gmail.com-yingshi ji',
       'moshed88@hotmail.com-moshe dabach', 'lukyip27@gmail.com-y luk',
       'anirudhmvr@gmail.com-anirudh manchiraju',
       'svithya04@yahoo.com-srivithya  kannathasan ',
       'tinateotia@nhs.net-tina teotia das',
       'marshacrosby@gmail.com-miss m l crosby',
       'mcbride.alan@gmail.com-mr alan mcbride',
       'arun1787@gmail.com-mr a thillaisundaram',
       'jeojeo0328@hotmail.com-mrs shi zhou',
       'marcela.bletzer@gmail.com-mrs marcela bletzer',
       'hgajeens@hotmail.com-hilary beck',
       'b.armendariz@ucl.ac.uk-beatriz armendariz',
       'oana.cicalau@yahoo.com-oana cicalau',
       'seanmaccann@gmail.com-sean mac cann',
       'ethanhill116@gmail.com-ethan hill',
       'jayapriyathanaraj@hotmail.com-jayapriya thanaraj',
       'dunnioiyari@gmail.com-dunni omebere-iyari',
       'anya@rowanmail.me-anna rowan',
       'peravali.padma01@gmail.com-mrs p peravali',
       'deex