In [1]:
import numpy as np
import pandas as pd
import math

Code for loading telco-customer-churn data

In [2]:
def removeUnnecessaryAttribute(dataframe,attributeName):
    for aName in attributeName:
        dataframe = dataframe.drop([aName], axis=1)
    return dataframe

In [3]:
def removeEmptyLabel(dataframe,label):
    #https://www.w3resource.com/pandas/dataframe/dataframe-dropna.php
    dataframe = dataframe.dropna(axis=0, subset=[label])
    #https://www.geeksforgeeks.org/python-pandas-dataframe-reset_index/
    dataframe = dataframe.reset_index(drop=True)
    return dataframe

In [4]:
def categoricalReplacingValues(missing_data,label_data):
    bestValues = {}
    for i in range(len(label_data)):
        if not pd.isnull(missing_data[i]):
            if label_data[i] not in bestValues:
                bestValues[label_data[i]] = {}

            set = bestValues[label_data[i]]
            if missing_data[i] not in set:
                set[missing_data[i]] = 0
            set[missing_data[i]] += 1

            bestValues[label_data[i]] = set

    for key in bestValues:
        maximum = max(bestValues[key], key=bestValues[key].get)
        bestValues[key] = maximum

    for i in range(len(label_data)):
        if pd.isnull(missing_data[i]):
            missing_data[i] = bestValues[label_data[i]]

    return missing_data


def continuousReplacingValues(missing_data,label_data):
    bestValues = {}
    for i in range(len(label_data)):
        if not pd.isnull(missing_data[i]):
            if label_data[i] not in bestValues:
                bestValues[label_data[i]] = []
            bestValues[label_data[i]].append(missing_data[i])

    for key in bestValues:
        mean = np.mean(bestValues[key])
        bestValues[key] = mean

    for i in range(len(label_data)):
        if pd.isnull(missing_data[i]):
            missing_data[i] = bestValues[label_data[i]]
    return missing_data


def entropy(data):
    classes = {}
    size = len(data)
    for i in range(size):
        if data[i] not in classes:
            classes[data[i]] = 0
        classes[data[i]] += 1

    en = 0.0
    for key in classes:
        probability = (classes[key]*1.0)/size
        en += -1.0*math.log(probability,2.0)*probability
    return en


def information_gain(data,label):
    parent_entropy = entropy(label)
    split = {}
    size = len(data)
    for i in range(size):
        if data[i] not in split:
            split[data[i]] = []
        split[data[i]].append(label[i])

    child_entropy = 0.0
    for key in split:
        child_size = len(split[key])
        child_entropy += (child_size*1.0*entropy(split[key]))/size

    return parent_entropy-child_entropy


def binarization(data,label):
    zipped = zip(data.copy(),label.copy())
    sorted_data = [x for x,_ in sorted(zipped)]
    sorted_label = [x for _,x in sorted(zipped)]
    last_data = sorted_data[0]-5.1
    size = len(data)
    best_threshold = None
    best_info_gain = None
    bin_array = []
    for i in range(size):
        bin_array.append(1)

    for i in range(size+1):
        if(i==size):
            threshold = last_data+5.1
        else:
            threshold = (last_data+sorted_data[i])/2.0
            last_data = sorted_data[i]
        if(i>0):
            bin_array[i-1] = 0
        info_gain = information_gain(bin_array,sorted_label)
        if best_info_gain is None or info_gain>best_info_gain:
            best_info_gain = info_gain
            best_threshold = threshold

    for i in range(size):
        if data[i]<=best_threshold:
            data[i] = 0
        else:
            data[i] = 1

    return data

#data = [70,90,85,60,75,220,95,125,100,120]
#label = ['NO','YES','YES','NO','NO','NO','YES','NO','NO','NO']
#print data
#print binarization(data,label)


def shuffle(df, n=1, axis=0):
    df = df.copy()
    for _ in range(n):
        df.apply(np.random.shuffle, axis=axis)
    return df


In [5]:
def loadTelcoData():
    dataframe = pd.read_csv('telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv', delimiter=",")
    #non_categorical =['tenure','MonthlyCharges','TotalCharges']
    #non_categorical = []
    label = 'Churn'

    #removing unnecessary attribute
    unnecessaryAttributeName = ['customerID']
    dataframe = removeUnnecessaryAttribute(dataframe,unnecessaryAttributeName)
    
    '''
    https://cs.ccsu.edu/~markov/ccsu_courses/DataMining-3.html
    Ignore the tuple: usually done when class label is missing.
    '''
    print(dataframe.shape)
    dataframe = removeEmptyLabel(dataframe,label)
    print(dataframe.shape)
    
    non_categorical =['tenure','MonthlyCharges','TotalCharges']
    for column in dataframe:
        #dataframe[column] = dataframe[column].replace("\s*", pd.np.nan, regex=True)
        print(dataframe[column].isnull().sum())
        if dataframe[column].isnull().sum():
            #print dataframe[column].values
            if column not in non_categorical:
                dataframe[column] = \
                    pd.Series(categoricalReplacingValues(dataframe[column].values,dataframe[label].values))
            else:
                dataframe[column] = \
                    continuousReplacingValues(dataframe[column].values,dataframe[label].values)
            print (column,' has missing values')
            #print dataframe[column].isnull().sum()

        if column in non_categorical:
            print (column,'is continuous')
            dataframe[column] = \
                pd.Series(binarization(dataframe[column].values,dataframe[label].values))

In [6]:
loadTelcoData()

(7043, 20)
(7043, 20)
0
0
0
0
0
tenure is continuous


IndexError: list index out of range