# Experimenting and EDA for Dataset

In [7]:
import pymysql
import pandas as pd
from dotenv import dotenv_values

In [3]:
# Read the database credentials from the local .env file.
# A missing key raises KeyError here, before any connection is attempted.
secrets = dotenv_values()

host, name, user, pwd = (
    secrets[key] for key in ('DB_HOST', 'DB_NAME', 'DB_USER', 'DB_PWD')
)


In [14]:
# Connecting to DB
# Sanity check: fetch every row of churn_status and print how many came back.
# The cursor is closed by its `with` block and the connection by `finally`,
# so neither is left open when the query raises.

connection = pymysql.connect(
    host = host,
    user = user,
    password = pwd,
    database = name
    )

try:
    with connection.cursor() as cursor:
        cursor.execute("SELECT * FROM churn_status")
        results = cursor.fetchall()
finally:
    connection.close()

print(len(results))

7043


# EDA
The analysis below reads the local CSV files in `data_given/` rather than querying the database.

In [15]:
# File paths (relative to the notebook's working directory)
ACC_PATH = "data_given/1_account.csv"
ACC_USAGE_PATH = "data_given/2_account_usage.csv"
CHURN_STATUS_PATH = "data_given/3_churn_status.csv"
CITY_PATH = "data_given/4_city.csv"
CUSTOMER_PATH = "data_given/5_customer.csv"

# Original (misnamed) constant, kept as an alias so existing references keep working.
ACC_USAGE_USAGE = ACC_USAGE_PATH

In [17]:
# Load the account table from its local CSV file.
ACC_DF = pd.read_csv(filepath_or_buffer=ACC_PATH)

In [29]:
# Show a single row to see which columns the account table has.
ACC_DF.head(n=1)

Unnamed: 0,account_id,customer_id,tenure_months,num_referrals,has_internet_service,internet_type,has_unlimited_data,has_phone_service,has_multiple_lines,has_premium_tech_support,has_online_security,has_online_backup,has_device_protection,contract_type,paperless_billing,payment_method
0,AAJU-HMJLK,0334-ZFJSR,55,0,Yes,Cable,No,Yes,Yes,Yes,Yes,Yes,No,One Year,Yes,Credit Card


In [None]:
def get_unique_values(df: pd.DataFrame) -> dict:
    """
    Collects the distinct values found in every column of a dataframe.

    Args:
        df: dataframe to inspect.

    Returns:
        A dict keyed by column name, in column order, whose values are the
        result of ``.unique()`` on that column. Nothing is printed.
    """
    return {column: df[column].unique() for column in df.columns}

# List the distinct values of each account column.
get_unique_values(df=ACC_DF)

In [49]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

def preprocess_acc_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Builds an encoded copy of the account dataframe. The input is not modified.

    Carried over unchanged:
    - account_id
    - customer_id

    Yes/No columns converted to 1/0:
    - has_internet_service
    - has_phone_service
    - has_unlimited_data
    - has_multiple_lines
    - has_premium_tech_support
    - has_online_security
    - has_online_backup
    - has_device_protection
    - paperless_billing

    Ordinal encoding:
    - contract_type: Month-to-Month => 0, One Year => 1, Two Year => 2

    One-hot encoded (one 0.0/1.0 column per category, named <column>_<category>):
    - payment_method
    - internet_type

    Min-max scaled (scaler is fitted on the dataframe passed in):
    - tenure_months

    Any other column of `df` is left out of the result.

    Notes:
    - Values that are not keys of the Yes/No or contract mappings come out
      as NaN, because they go through `Series.map`.
    - With ACC_DF, rows with a missing internet_type end up in their own
      `internet_type_nan` column.

    Args:
        df: account dataframe containing all the columns listed above.

    Returns:
        A new dataframe with the same index as `df`.
    """
    yes_no_cols = [
        'has_internet_service',
        'has_phone_service',
        'has_unlimited_data',
        'has_multiple_lines',
        'has_premium_tech_support',
        'has_online_security',
        'has_online_backup',
        'has_device_protection',
        'paperless_billing',
    ]
    yes_no_mapping = {'Yes':1, 'No':0}
    contract_mapping = {'Month-to-Month':0, 'One Year':1, 'Two Year':2}
    one_hot_encoded_cols = ["payment_method", "internet_type"]

    # Start from the identifier columns plus the raw tenure (scaled further down).
    output_df = df[['account_id', 'customer_id', 'tenure_months']].copy()

    # Converting yes/no to 1/0
    for col in yes_no_cols:
        output_df[col] = df[col].map(yes_no_mapping)

    # Ordinal encoding of the contract length
    output_df['contract_type'] = df['contract_type'].map(contract_mapping)

    # One-hot encoding
    one_hot_encoder = OneHotEncoder()
    encoded_features = one_hot_encoder.fit_transform(df[one_hot_encoded_cols])
    encoded_df = pd.DataFrame(
        encoded_features.toarray(),
        columns=one_hot_encoder.get_feature_names_out(one_hot_encoded_cols),
        # Reuse the input index: concat(axis=1) aligns on index labels, so a
        # default RangeIndex here would misalign rows for a filtered/re-indexed df.
        index=df.index,
    )
    output_df = pd.concat([output_df, encoded_df], axis=1)

    # Scaling; fit_transform returns an (n, 1) array, flattened to fit one column.
    scaler = MinMaxScaler()
    output_df['tenure_months'] = scaler.fit_transform(df[['tenure_months']]).ravel()

    return output_df

In [50]:
# Encode the account table and preview the first rows of the result.
parsed_acc_df = preprocess_acc_df(df=ACC_DF)
parsed_acc_df.head(n=5)

Unnamed: 0,account_id,customer_id,tenure_months,has_internet_service,has_phone_service,has_unlimited_data,has_multiple_lines,has_premium_tech_support,has_online_security,has_online_backup,has_device_protection,paperless_billing,contract_type,payment_method_Bank Withdrawal,payment_method_Credit Card,payment_method_Mailed Check,internet_type_Cable,internet_type_DSL,internet_type_Fiber Optic,internet_type_nan
0,AAJU-HMJLK,0334-ZFJSR,0.760563,1,1,0,1,1,1,1,0,1,1,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1,AAKY-HUGJV,6235-VDHOM,0.056338,1,0,1,0,0,0,1,0,0,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,AAMB-TJYWC,4006-HKYHO,0.873239,1,1,1,0,1,1,1,1,1,2,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,AANL-MWPZF,3258-SYSWS,1.0,1,1,1,1,1,1,1,1,0,2,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,AAOS-KANBS,5360-XGYAZ,1.0,1,0,0,0,1,1,1,1,1,2,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [48]:
# First rows of the raw (unencoded) account table.
ACC_DF.head(n=5)

Unnamed: 0,account_id,customer_id,tenure_months,num_referrals,has_internet_service,internet_type,has_unlimited_data,has_phone_service,has_multiple_lines,has_premium_tech_support,has_online_security,has_online_backup,has_device_protection,contract_type,paperless_billing,payment_method
0,AAJU-HMJLK,0334-ZFJSR,55,0,Yes,Cable,No,Yes,Yes,Yes,Yes,Yes,No,One Year,Yes,Credit Card
1,AAKY-HUGJV,6235-VDHOM,5,0,Yes,DSL,Yes,No,No,No,No,Yes,No,Month-to-Month,No,Bank Withdrawal
2,AAMB-TJYWC,4006-HKYHO,63,0,Yes,DSL,Yes,Yes,No,Yes,Yes,Yes,Yes,Two Year,Yes,Bank Withdrawal
3,AANL-MWPZF,3258-SYSWS,72,0,Yes,Fiber Optic,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Two Year,No,Bank Withdrawal
4,AAOS-KANBS,5360-XGYAZ,72,1,Yes,DSL,No,No,No,Yes,Yes,Yes,Yes,Two Year,Yes,Bank Withdrawal
