In [None]:
import pandas as pd
import os
import datetime as dt

In [None]:
# Mount Google Drive in the Colab runtime so the dataset folders are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Folder paths on the mounted drive: raw input data and the "Bronze" output layer.
folder_input = "/content/drive/Shareddrives/Real_Project_Churn_DSA30/Dataset/Raw/kkboxs"
folder_output = "/content/drive/Shareddrives/Real_Project_Churn_DSA30/Dataset/Bronze"

In [None]:
# Load the members and transactions CSV files from the input folder.
df_members = pd.read_csv(os.path.join(folder_input, 'members_v3.csv'))
df_transactions = pd.read_csv(os.path.join(folder_input, 'transactions.csv'))

### Amostragem Aleatória (Base de Membros)

In [None]:
# Keep only the members whose gender is filled in.
df_members = df_members[df_members['gender'].notna()]

In [None]:
# Draw three 10% simple random samples of the members table, each with its own
# fixed seed so the draw is reproducible.
df_members_semple_1 = df_members.sample(frac=0.1, random_state=21)
df_members_semple_2 = df_members.sample(frac=0.1, random_state=10)
df_members_semple_3 = df_members.sample(frac=0.1, random_state=42)

tipo_amostragem = "Amostragem_Aleatória"
member_samples = [df_members_semple_1, df_members_semple_2, df_members_semple_3]

for member_sample in member_samples:
    print(member_sample.shape)

# to_csv does not create missing folders, so make sure the destination exists
# before writing (no-op when it is already there).
random_sample_dir = os.path.join(folder_output, 'Amostra_Clientes', tipo_amostragem)
os.makedirs(random_sample_dir, exist_ok=True)

# Write the samples as members_sample_1.csv .. members_sample_3.csv.
for sample_number, member_sample in enumerate(member_samples, start=1):
    member_sample.to_csv(
        os.path.join(random_sample_dir, f'members_sample_{sample_number}.csv'),
        index=False,
    )

(233997, 6)
(233997, 6)
(233997, 6)


### Validação base de Transações

In [None]:
# Inner join on the member id: keeps only transactions whose msno is still present in df_members.
df_transactions_members = df_transactions.merge(df_members, how='inner', on='msno')

In [None]:
# Keep only the columns of the original transactions table, dropping the member columns added by the merge.
df_transactions_members = df_transactions_members[df_transactions.columns]

In [None]:
# Preview the first rows of the member-matched transactions.
df_transactions_members.head()

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
0,UkDFI97Qb6+s2LWcijVVv4rMAsORbVDT2wNXF0aVbns=,41,30,129,129,1,20150930,20160427,0
1,M1C56ijxozNaGD0t2h68PnH2xtx5iO5iR2MVYQB6nBI=,39,30,149,149,1,20150930,20151128,0
2,yvj6zyBUaqdbUQSrKsrZ+xNDVM62knauSZJzakS9OW4=,39,30,149,149,1,20150930,20151121,0
3,KN7I82kjY0Tn76Ny95ncqBUVbO7b8AXrOPqZutKpxIM=,21,30,149,149,1,20150930,20151107,0
4,m5ptKif9BjdUghHXXomSezy5ohJiHm85PE13f/3kQaw=,39,30,149,149,1,20150930,20151128,0


### Calculando Churn

In [None]:
# Default churn window, in days, counted from a membership's expiration date.
time_churn = 30


def calculate_churn(df, churn_days=None):
    """Label every transaction row with a boolean ``churn`` flag.

    Rows are ordered per member by transaction date (ties broken by expiration
    date) and each row is compared with the member's next transaction.

    A row is flagged as churn when either:
      * the member has no later transaction in ``df``; or
      * the next transaction happens more than ``churn_days`` days after this
        row's ``membership_expire_date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``msno``, ``transaction_date`` and
        ``membership_expire_date``. Both date columns must already be datetime
        typed, because the day gap is read through the ``.dt`` accessor.
    churn_days : int, optional
        Length of the churn window in days. When omitted, the module-level
        ``time_churn`` value is used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        A sorted copy of ``df`` with three extra columns:
        ``next_transaction_date``, ``next_expiration_date`` and ``churn``.
        The frame passed in is not modified.
    """
    if churn_days is None:
        churn_days = time_churn

    # sort_values returns a new frame, so the assignments below never touch the caller's data.
    df = df.sort_values(by=['msno', 'transaction_date', 'membership_expire_date'])

    # One groupby pass: shift(-1) pulls the following row of the same member (NaT on the last row).
    following = df.groupby('msno')[['transaction_date', 'membership_expire_date']].shift(-1)
    df['next_transaction_date'] = following['transaction_date']
    df['next_expiration_date'] = following['membership_expire_date']

    # The gap is NaN when there is no next transaction; that case is covered by the isna() term.
    gap_days = (df['next_transaction_date'] - df['membership_expire_date']).dt.days
    df['churn'] = df['next_transaction_date'].isna() | (gap_days > churn_days)

    return df

In [None]:
# Parse the YYYYMMDD-encoded date columns into datetimes.
# Columns that are already datetime are skipped: converting them to str again would
# no longer match '%Y%m%d', so re-running this cell used to raise a ValueError.
for date_col in ('membership_expire_date', 'transaction_date'):
    if not pd.api.types.is_datetime64_any_dtype(df_transactions_members[date_col]):
        df_transactions_members[date_col] = pd.to_datetime(
            df_transactions_members[date_col].astype(str), format='%Y%m%d'
        )

In [None]:
# Latest transaction date present in the data.
df_transactions_members['transaction_date'].max()

Timestamp('2017-02-28 00:00:00')

In [None]:
# Cutoff date: the latest transaction date moved back by the churn window (time_churn days).
last_transaction_date = df_transactions_members['transaction_date'].max()
data_base = last_transaction_date - dt.timedelta(days=time_churn)

In [None]:
# Display the computed cutoff date.
data_base

Timestamp('2017-01-29 00:00:00')

In [None]:
# Label every transaction with the churn flag (also adds the next_* helper columns).
df_churn = calculate_churn(df_transactions_members)

In [None]:
# Keep only memberships that expired before the cutoff date, then drop the
# helper columns created while computing the churn flag.
expired_before_cutoff = df_churn['membership_expire_date'] < data_base
df_transactions_churn = df_churn[expired_before_cutoff].drop(
    columns=['next_transaction_date', 'next_expiration_date']
)

In [None]:
# Preview the first rows of the labelled transactions.
df_transactions_churn.head()

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,churn
3046439,+++FOrTS7ab3tIgIh8eWwX4FqRv8w/FoiOuyXsFvphY=,35,7,0,0,0,2016-09-09,2016-09-14,0,True
682813,+++IZseRRiQS9aaSkH6cMYU6bGDcxUieAi/tH67sC5s=,38,410,1788,1788,0,2015-11-21,2017-01-04,0,True
3495082,+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,39,31,149,149,1,2015-01-31,2015-03-19,0,False
6495606,+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,39,31,149,149,1,2015-02-28,2015-04-19,0,False
4935949,+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,39,31,149,149,1,2015-03-31,2015-05-19,0,False


In [None]:
# Persist the labelled transactions as CSV (without the index).
# NOTE(review): this writes a derived file into folder_input (the raw-data folder)
# rather than folder_output — confirm this is intended before relying on it.
df_transactions_churn.to_csv(os.path.join(folder_input, 'df_transactions_churn_full.csv'), index=False)

### Amostragem Estratificada

In [None]:
# Collapse to one row per customer: taking the max of the boolean flag marks a
# customer as churned when any of their transactions is flagged.
churn_by_member = df_transactions_churn.groupby('msno')['churn'].max()
df_clientes_churn = churn_by_member.reset_index()

In [None]:
# Proportion of churned vs. non-churned customers.
df_clientes_churn['churn'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
churn,Unnamed: 1_level_1
True,0.761202
False,0.238798


In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

tipo_amostragem = "Amostragem_Estratificada"

# to_csv does not create missing folders, so make sure the destination exists
# before writing (no-op when it is already there).
stratified_sample_dir = os.path.join(folder_output, 'Amostra_Clientes', tipo_amostragem)
os.makedirs(stratified_sample_dir, exist_ok=True)

# One stratified customer sample per seed, preserving the churn / non-churn proportions.
for i, seed in enumerate([21, 10, 42], start=1):
    print(f"Rodada: {i}")
    # test_size=0.8 leaves 20% of the customers on the "train" side, which is the part kept as the sample.
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.8, random_state=seed)

    # n_splits=1 yields exactly one (train, test) pair, so take it directly.
    train_idx, _ = next(split.split(df_clientes_churn, df_clientes_churn['churn']))
    amostra = df_clientes_churn.iloc[train_idx]
    amostra.to_csv(os.path.join(stratified_sample_dir, f'churn_sample_{i}.csv'), index=False)

    print(f"Tamanho da amostra{i} estratificada: {len(amostra)}")

Rodada: 1
Tamanho da amostra1 estratificada: 186849
Rodada: 2
Tamanho da amostra2 estratificada: 186849
Rodada: 3
Tamanho da amostra3 estratificada: 186849
