# Experimenting and EDA for Dataset

In [7]:
import pymysql
import pandas as pd
from dotenv import dotenv_values

In [3]:
# Load the key/value pairs exposed by python-dotenv and pull out the DB credentials
secrets = dotenv_values()

host, name, user, pwd = (
    secrets[key] for key in ('DB_HOST', 'DB_NAME', 'DB_USER', 'DB_PWD')
)


In [14]:
# Connecting to DB
# Sanity check: pull every row of churn_status and report how many came back.

connection = pymysql.connect(
    host = host,
    user = user,
    password = pwd,
    database = name
    )

# try/finally guarantees the connection is released even when the query fails;
# the original only closed it on the success path.
try:
    # The cursor's context manager closes the cursor on exit, error or not.
    with connection.cursor() as cursor:
        cursor.execute("SELECT * FROM churn_status")
        results = cursor.fetchall()
finally:
    connection.close()

print(len(results))

7043


# EDA
The analysis below is performed on the local CSV files in `data_given/`.

In [59]:
# Locations of the local CSV inputs (all live in the same directory)
DATA_DIR = "data_given"

ACC_PATH = f"{DATA_DIR}/1_account.csv"
ACC_USAGE_PATH = f"{DATA_DIR}/2_account_usage.csv"
CHURN_STATUS_PATH = f"{DATA_DIR}/3_churn_status.csv"
CITY_PATH = f"{DATA_DIR}/4_city.csv"
CUSTOMER_PATH = f"{DATA_DIR}/5_customer.csv"

### General Helper Functions

In [52]:
def get_unique_values(df:pd.DataFrame)->dict:
    """
    Collects the distinct values found in every column of a dataframe.

    Returns a dictionary keyed by column name (in column order) whose values
    are the arrays produced by ``Series.unique()``. Nothing is printed.
    """
    return {column: df[column].unique() for column in df.columns}

In [53]:
def get_NaN_count(df:pd.DataFrame)->dict:
    """
    Counts the missing entries in each column of a dataframe.

    Returns a dictionary mapping column name -> number of missing values.
    """
    missing_mask = df.isna()
    per_column_totals = missing_mask.sum()
    return per_column_totals.to_dict()

In [17]:
# Load the raw account table from its local CSV file.
ACC_DF = pd.read_csv(ACC_PATH)

In [29]:
# Preview the first row to see which columns are available.
ACC_DF.head(1)

Unnamed: 0,account_id,customer_id,tenure_months,num_referrals,has_internet_service,internet_type,has_unlimited_data,has_phone_service,has_multiple_lines,has_premium_tech_support,has_online_security,has_online_backup,has_device_protection,contract_type,paperless_billing,payment_method
0,AAJU-HMJLK,0334-ZFJSR,55,0,Yes,Cable,No,Yes,Yes,Yes,Yes,Yes,No,One Year,Yes,Credit Card


In [70]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

def preprocess_acc_df(df:pd.DataFrame)->pd.DataFrame:
    """
    Builds a model-ready copy of the account dataframe (the input is not modified).

    Copies through unchanged:
    - account_id
    - customer_id

    Converts the following Yes/No columns to 1/0 (any other value becomes NaN):
    - has_internet_service
    - has_phone_service
    - has_unlimited_data
    - has_multiple_lines
    - has_premium_tech_support
    - has_online_security
    - has_online_backup
    - has_device_protection
    - paperless_billing

    Converts the following column to ordinal values:
    - contract_type: Month-to-Month => 0, One Year => 1, Two Year => 2

    One-hot encodes the following columns (one output column per category):
    - payment_method
    - internet_type

    Min-max scales the following column (scaler is fitted on this dataframe):
    - tenure_months

    Parameters
    ----------
    df : pd.DataFrame
        Account table containing every column listed above.

    Returns
    -------
    pd.DataFrame
        New dataframe sharing ``df``'s index. Column order: ids, Yes/No flags,
        contract_type, one-hot columns, tenure_months.

    Raises
    ------
    KeyError
        If one of the expected columns is missing from ``df``.
    """
    yes_no_cols = [
        'has_internet_service',
        'has_phone_service',
        'has_unlimited_data',
        'has_multiple_lines',
        'has_premium_tech_support',
        'has_online_security',
        'has_online_backup',
        'has_device_protection',
        'paperless_billing',
    ]
    one_hot_encoded_cols = ["payment_method", "internet_type"]

    # Identifier columns are carried over as-is; .copy() keeps the caller's frame untouched
    output_df = df[['account_id', 'customer_id']].copy()

    # Converting yes/no to 1/0
    yes_no_mapping = {'Yes':1, 'No':0}
    for col in yes_no_cols:
        output_df[col] = df[col].map(yes_no_mapping)

    # Ordinal encoding of the contract length
    contract_mapping = {'Month-to-Month':0, 'One Year':1, 'Two Year':2}
    output_df['contract_type'] = df['contract_type'].map(contract_mapping)

    # One-hot encoding
    one_hot_encoder = OneHotEncoder()
    encoded_features = one_hot_encoder.fit_transform(df[one_hot_encoded_cols])
    encoded_df = pd.DataFrame(
        encoded_features.toarray(),
        columns=one_hot_encoder.get_feature_names_out(one_hot_encoded_cols),
        # Reuse the input index: concat(axis=1) aligns on index labels, so a
        # default RangeIndex here would misalign rows (and introduce NaN rows)
        # whenever df has been filtered, sorted or otherwise re-indexed.
        index=df.index,
    )
    output_df = pd.concat([output_df, encoded_df], axis=1)

    # Scaling
    # NOTE(review): the scaler is fitted on the full frame passed in; fit on the
    # training split only if this feeds a train/test evaluation.
    scaler = MinMaxScaler()
    # fit_transform returns an (n, 1) array; flatten it for a single-column assignment
    output_df['tenure_months'] = scaler.fit_transform(df[['tenure_months']]).ravel()

    return output_df

In [50]:
# Run the account preprocessing and preview the first rows of the result.
parsed_acc_df = preprocess_acc_df(ACC_DF)
parsed_acc_df.head()

Unnamed: 0,account_id,customer_id,tenure_months,has_internet_service,has_phone_service,has_unlimited_data,has_multiple_lines,has_premium_tech_support,has_online_security,has_online_backup,has_device_protection,paperless_billing,contract_type,payment_method_Bank Withdrawal,payment_method_Credit Card,payment_method_Mailed Check,internet_type_Cable,internet_type_DSL,internet_type_Fiber Optic,internet_type_nan
0,AAJU-HMJLK,0334-ZFJSR,0.760563,1,1,0,1,1,1,1,0,1,1,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1,AAKY-HUGJV,6235-VDHOM,0.056338,1,0,1,0,0,0,1,0,0,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,AAMB-TJYWC,4006-HKYHO,0.873239,1,1,1,0,1,1,1,1,1,2,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,AANL-MWPZF,3258-SYSWS,1.0,1,1,1,1,1,1,1,1,0,2,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,AAOS-KANBS,5360-XGYAZ,1.0,1,0,0,0,1,1,1,1,1,2,1.0,0.0,0.0,0.0,1.0,0.0,0.0


### Account Usage 

In [60]:
# Load the raw account-usage table from its local CSV file.
ACC_USAGE_DF = pd.read_csv(ACC_USAGE_PATH)

In [61]:
# Preview the first row to see which columns are available.
ACC_USAGE_DF.head(1)

Unnamed: 0,account_id,avg_long_distance_fee_monthly,total_long_distance_fee,avg_gb_download_monthly,stream_tv,stream_movie,stream_music,total_monthly_fee,total_charges_quarter,total_refunds
0,AAJU-HMJLK,35.38,1945.9,13,No,No,No,66.05,3462.1,44.53


In [66]:
def preprocess_acc_usage_df(df:pd.DataFrame)->pd.DataFrame:
    """
    Builds a new dataframe from the account usage table (the input is not modified).

    Copies through unchanged:
    - account_id

    Converts the following Yes/No columns to 1/0:
    - stream_movie
    - stream_music
    - stream_tv

    Min-max scales the following columns, each one independently:
    - avg_long_distance_fee_monthly
    - total_long_distance_fee
    - avg_gb_download_monthly
    - total_monthly_fee
    - total_charges_quarter
    - total_refunds

    Returns the new dataframe with the columns in the order listed above.
    """
    streaming_cols = ('stream_movie', 'stream_music', 'stream_tv')
    scaled_cols = (
        'avg_long_distance_fee_monthly',
        'total_long_distance_fee',
        'avg_gb_download_monthly',
        'total_monthly_fee',
        'total_charges_quarter',
        'total_refunds',
    )
    yes_no = {'Yes':1, 'No':0}

    # Start from an empty frame and carry the account key over
    output_df = pd.DataFrame()
    output_df['account_id'] = df['account_id']

    # Yes/No flags -> 1/0
    for flag_col in streaming_cols:
        output_df[flag_col] = df[flag_col].map(yes_no)

    # The same scaler object is re-fitted for every column, so each column
    # is scaled against its own min/max
    scaler = MinMaxScaler()
    for numeric_col in scaled_cols:
        output_df[numeric_col] = scaler.fit_transform(df[[numeric_col]])

    return output_df

In [67]:
# Run the account-usage preprocessing and preview the first rows of the result.
parsed_acc_usage_df = preprocess_acc_usage_df(ACC_USAGE_DF)
parsed_acc_usage_df.head()

Unnamed: 0,account_id,stream_movie,stream_music,stream_tv,avg_long_distance_fee_monthly,total_long_distance_fee,avg_gb_download_monthly,total_monthly_fee,total_charges_quarter,total_refunds
0,AAJU-HMJLK,0,0,0,0.707742,0.545877,0.152941,0.475622,0.397334,0.894356
1,AAKY-HUGJV,0,0,0,0.0,0.0,0.305882,0.101493,0.012953,0.0
2,AAMB-TJYWC,1,1,1,0.591718,0.522773,0.552941,0.681095,0.610512,0.0
3,AANL-MWPZF,1,0,1,0.493499,0.498283,0.305882,0.950746,0.903185,0.0
4,AAOS-KANBS,1,1,1,0.0,0.0,0.188235,0.471642,0.536084,0.0


### Churn Status

In [71]:
# Load the raw churn-status table from its local CSV file.
CHURN_STATUS_DF = pd.read_csv(CHURN_STATUS_PATH)

In [73]:
# Preview the first rows of the churn-status table.
CHURN_STATUS_DF.head()

Unnamed: 0,customer_id,status,churn_label,churn_category,churn_reason
0,0002-ORFBO,Stayed,No,,
1,0003-MKNFE,Stayed,No,,
2,0004-TLHLJ,Churned,Yes,Competitor,Competitor had better devices
3,0011-IGKFF,Churned,Yes,Dissatisfaction,Product dissatisfaction
4,0013-EXCHZ,Churned,Yes,Dissatisfaction,Network reliability


In [74]:
# Inspect the distinct values present in each churn-status column.
get_unique_values(CHURN_STATUS_DF)

{'customer_id': array(['0002-ORFBO', '0003-MKNFE', '0004-TLHLJ', ..., '9992-UJOEL',
        '9993-LHIEB', '9995-HOTOH'], dtype=object),
 'status': array(['Stayed', 'Churned', 'Joined'], dtype=object),
 'churn_label': array(['No', 'Yes', nan], dtype=object),
 'churn_category': array([nan, 'Competitor', 'Dissatisfaction', 'Other', 'Price', 'Attitude'],
       dtype=object),
 'churn_reason': array([nan, 'Competitor had better devices', 'Product dissatisfaction',
        'Network reliability', 'Limited range of services',
        'Competitor made better offer', "Don't know",
        'Long distance charges', 'Attitude of service provider',
        'Attitude of support person',
        'Competitor offered higher download speeds',
        'Competitor offered more data',
        'Lack of affordable download/upload speed', 'Deceased', 'Moved',
        'Service dissatisfaction', 'Price too high',
        'Lack of self-service on Website',
        'Poor expertise of online support', 'Extra data c

In [91]:
from category_encoders import BinaryEncoder
from sklearn.preprocessing import LabelEncoder

def preprocess_churn_df(df:pd.DataFrame)->pd.DataFrame:
    """
    Builds a new dataframe from the churn status table (the input is not modified).

    Copies through unchanged:
    - customer_id

    Converts the following Yes/No column to 1/0:
    - churn_label (missing or unrecognised values stay NaN)

    Binary encodes:
    - churn_category

    Label-encodes:
    - status

    Drops the following:
    - churn_reason -> Not planning to do NLP

    Parameters
    ----------
    df : pd.DataFrame
        Churn status table containing customer_id, churn_label,
        churn_category and status.

    Returns
    -------
    pd.DataFrame
        New dataframe with customer_id, churn_label, the binary-encoded
        churn_category columns and status, in that order.

    Raises
    ------
    KeyError
        If one of the expected columns is missing from ``df``.
    """
    # Create new df
    output_df = pd.DataFrame()

    # Copying over customer_id
    output_df['customer_id'] = df['customer_id']

    # Converting yes/no to 1/0
    mapping = {'Yes':1, 'No':0}
    output_df['churn_label'] = df['churn_label'].map(mapping)

    # Binary Encoding
    # Fit and encode in a single pass (previously the fit_transform result was
    # thrown away and the data encoded a second time). A one-column frame is
    # passed so that `cols` refers to an actual column name.
    binary_encoder = BinaryEncoder(cols=['churn_category'])
    churn_cat = binary_encoder.fit_transform(df[['churn_category']])
    output_df = pd.concat([output_df, churn_cat], axis=1)

    # Label Encoding
    label_encoder = LabelEncoder()
    output_df['status'] = label_encoder.fit_transform(df['status'])

    return output_df




In [96]:
# Encode the churn-status table.
parsed_churn_df = preprocess_churn_df(CHURN_STATUS_DF)
# NOTE(review): this bare expression is not the last line of the cell, so its value is never displayed.
parsed_churn_df.dtypes
# Count missing values per column in the raw (un-encoded) churn-status table.
check_NaN = get_NaN_count(CHURN_STATUS_DF)
print(check_NaN)


{'customer_id': 0, 'status': 0, 'churn_label': 52, 'churn_category': 5174, 'churn_reason': 5174}


In [97]:
# Show the rows whose churn_label is missing.
CHURN_STATUS_DF[CHURN_STATUS_DF['churn_label'].isna()]

Unnamed: 0,customer_id,status,churn_label,churn_category,churn_reason
112,0196-VULGZ,Churned,,Other,Deceased
142,0235-KGSLC,Churned,,Other,Moved
251,0378-NHQXU,Churned,,Other,Deceased
385,0568-ONFPC,Churned,,Other,Moved
676,0991-BRRFB,Churned,,Other,Deceased
823,1205-WNWPJ,Churned,,Other,Moved
952,1383-EZRWL,Churned,,Other,Moved
1007,1455-ESIQH,Churned,,Other,Moved
1054,1541-ETJZO,Churned,,Other,Moved
1111,1624-NALOJ,Churned,,Other,Moved
