In [1]:
import pandas as pd
import numpy as np

In [14]:
demo_data = pd.read_csv(
    'data-prep-datasets/demoDetails.csv',
    index_col=0
    )

In [17]:
demo_data.head()
# demo_data.info()
# demo_data.shape
# demo_data.size

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents
1,8260-NGFNY,Female,0.0,No,No
2,2359-QWQUL,Female,0.0,Yes,No
3,6598/RFFVI,Male,0.0,No,No
4,IXSTS-8780,Female,0.0,No,No
5,2674/MIAHT,Female,0.0,No,No


In [16]:
account_details = pd.read_csv(
    'data-prep-datasets/acDetails.txt',
    sep='\t',)
account_details.head()

Unnamed: 0,customerID,tenure,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
1,8260-NGFNY,One,Month-to-month,No,Mailed check,25.2,25.2
2,2359-QWQUL,39,One year,Yes,Credit card (automatic),104.7,4134.85
3,6598/RFFVI,2,One year,No,Credit card (automatic),19.3,28.3
4,IXSTS-8780,6,Month-to-month,Yes,Electronic check,90.1,521.3
5,2674/MIAHT,Four,Month-to-month,Yes,Mailed check,80.3,324.2


In [13]:
service_details = pd.read_csv(
    'data-prep-datasets/serviceDetails.csv',
    index_col=0,)
service_details.head()


Unnamed: 0,customerID,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Churn
1,8260-NGFNY,No,No phone service,DSL,No,No,No,No,No,No,Yes
2,2359-QWQUL,Yes,No,Fiber optic,Yes,No,Yes,Yes,Yes,Yes,Yes
3,6598/RFFVI,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Yes
4,IXSTS-8780,Yes,Yes,Fiber optic,No,Yes,No,No,Yes,No,Yes
5,2674/MIAHT,Yes,Yes,Fiber optic,No,Yes,No,No,No,No,No


# Data Wrangling
* merge account_details, demo_details and service_details.
before merging need to make the necessary checks to the data.

* Check for duplicate records and remove them.
* Whether the customer ID is common across all the files.



In [21]:
# Checking for duplicate records using numpy
len(np.unique(
    demo_data['customerID']
))
len(np.unique(
    account_details['customerID']
))
len(np.unique(
    service_details['customerID']
))

250

In [23]:
# checking for duplicate records using pandas
print(
    f"""
    {demo_data.duplicated().sum()}
    {account_details.duplicated().sum()}
    {service_details.duplicated().sum()}
    
    """
)


    1
    1
    1
    
    


In [26]:
demo_data.duplicated(
    subset=['customerID'],
    keep=False
)

1      False
2      False
3      False
4      False
5      False
       ...  
247    False
248    False
249    False
250    False
251     True
Length: 251, dtype: bool

In [28]:
# Check each row using subset and look for duplicates.
# If there are duplicates, then drop the row.
demo_data[demo_data.duplicated(
    subset=['customerID'],
    keep=False)]
account_details[account_details.duplicated(
    subset=['customerID'],
    keep=False)]
service_details[service_details.duplicated(
    subset=['customerID'],
    keep=False)]

Unnamed: 0,customerID,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Churn
46,9067-SQTNS,Yes,No,No,Yes,Yes,No internet service,No internet service,Yes,No internet service,No
251,9067-SQTNS,Yes,No,No,Yes,Yes,No internet service,No internet service,Yes,No internet service,No


In [36]:
# removing the duplicates
#  First occurance of the duplicate will be removed.

demo_data = demo_data.drop_duplicates()
account_details = account_details.drop_duplicates()
service_details = service_details.drop_duplicates()

In [37]:
print(
    f"""
    {demo_data.shape}
{account_details.shape}
{service_details.shape}
    """
)



    (250, 5)
(250, 7)
(250, 11)
    


In [39]:
# Check if two dataframes are equal or not.
demo_data.customerID.equals(
    account_details.customerID)

True

In [41]:
account_details.customerID.equals(
    service_details.customerID)

True

In [42]:
demo_data.customerID.equals(
    service_details.customerID)


True

Joining two data frames based on a condition ie (customerId)

In [44]:
churn = pd.merge(
    demo_data,
    account_details,
    on='customerID',
)
churn.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,8260-NGFNY,Female,0.0,No,No,One,Month-to-month,No,Mailed check,25.2,25.2
1,2359-QWQUL,Female,0.0,Yes,No,39,One year,Yes,Credit card (automatic),104.7,4134.85
2,6598/RFFVI,Male,0.0,No,No,2,One year,No,Credit card (automatic),19.3,28.3
3,IXSTS-8780,Female,0.0,No,No,6,Month-to-month,Yes,Electronic check,90.1,521.3
4,2674/MIAHT,Female,0.0,No,No,Four,Month-to-month,Yes,Mailed check,80.3,324.2


In [46]:
# merge churn with service details
churn = pd.merge(
    churn,
    service_details,
    on='customerID',
)
churn.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,...,PhoneService_y,MultipleLines_y,InternetService_y,OnlineSecurity_y,OnlineBackup_y,DeviceProtection_y,TechSupport_y,StreamingTV_y,StreamingMovies_y,Churn_y
0,8260-NGFNY,Female,0.0,No,No,One,Month-to-month,No,Mailed check,25.2,...,No,No phone service,DSL,No,No,No,No,No,No,Yes
1,2359-QWQUL,Female,0.0,Yes,No,39,One year,Yes,Credit card (automatic),104.7,...,Yes,No,Fiber optic,Yes,No,Yes,Yes,Yes,Yes,Yes
2,6598/RFFVI,Male,0.0,No,No,2,One year,No,Credit card (automatic),19.3,...,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Yes
3,IXSTS-8780,Female,0.0,No,No,6,Month-to-month,Yes,Electronic check,90.1,...,Yes,Yes,Fiber optic,No,Yes,No,No,Yes,No,Yes
4,2674/MIAHT,Female,0.0,No,No,Four,Month-to-month,Yes,Mailed check,80.3,...,Yes,Yes,Fiber optic,No,Yes,No,No,No,No,No


In [47]:
churn.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 250 entries, 0 to 249
Data columns (total 31 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   customerID          250 non-null    object 
 1   gender              250 non-null    object 
 2   SeniorCitizen       245 non-null    float64
 3   Partner             250 non-null    object 
 4   Dependents          250 non-null    object 
 5   tenure              250 non-null    object 
 6   Contract            250 non-null    object 
 7   PaperlessBilling    250 non-null    object 
 8   PaymentMethod       250 non-null    object 
 9   MonthlyCharges      240 non-null    float64
 10  TotalCharges        235 non-null    float64
 11  PhoneService_x      250 non-null    object 
 12  MultipleLines_x     250 non-null    object 
 13  InternetService_x   250 non-null    object 
 14  OnlineSecurity_x    250 non-null    object 
 15  OnlineBackup_x      250 non-null    object 
 16  DevicePr