# Library

In [1]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import normalize
from numpy import log2 as log
import pprint
eps = np.finfo(float).eps

# DataLoader

In [2]:
#universal_data_loader
def getdataframe(filename):
    data = pd.read_csv(filename)
    return data, (data.columns.values.tolist())[:-1], (data.columns.values.tolist())[-1] 

In [3]:
#get_value_by_a_specific_column_no_or_name
def getuniquevalues(dataframe, column):
    return dataframe[column].unique()

# Data Preprocessor

In [4]:
def Telco_processing():
    df = pd.read_csv('telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')
    df.drop('customerID', axis=1, inplace=True)
    df=df.replace(' ',np.nan).dropna(axis = 0, how = 'any')
    df['SeniorCitizen'] = df['SeniorCitizen'].astype(object)
    
    df['TotalCharges'] = df['TotalCharges'].astype(float)
    
    df.loc[:, 'SeniorCitizen'].replace([1,0], ["Senior","Not-senior"], inplace=True)
    
    df = df.reset_index(drop=True)
    
    df.to_csv (r'telco.csv', index = False, header=True)
    

def Credit_processing():
    df = pd.read_csv('creditcardfraud/creditcard.csv')
    df.drop('Time', axis = 1, inplace=True)
    df['Class'] = df['Class'].astype(object)
    df.loc[:, 'Class'].replace([1,0], ["Yes","No"], inplace=True)
    
    
    positive_sample = df[df['Class'] == "Yes"]
#     print(len(positive_sample))
    negative_sample = (shuffle(df[df['Class'] == "No"], random_state=20))[:20000]
#     print(len(negative_sample))
    
    df = shuffle(pd.concat([positive_sample,negative_sample]), random_state=20).reset_index(drop=True)
    df.to_csv (r'creditcard.csv', index = False, header=True)
    
def Adult_processing(number_bin=10):
    
    column = []
    
    for i in range(14):
        column.append("column_"+str(i))
        
    column.append("label")
    
    train_df = pd.read_csv('adult_dataset/adult.data', names = column)
    
    train_df['label'] = train_df['label'].astype(object)
    train_df['column_0'] = train_df['column_0'].astype(float)
   
    train_df.loc[:, 'label'].replace([" <=50K"," >50K"], ["Yes","No"], inplace=True)
    
    #https://stackoverflow.com/a/47915060
    train_df=train_df.replace(' ?',np.nan).dropna(axis = 0, how = 'any')
  
    train_df = train_df.reset_index(drop=True)
    
    test_df = pd.read_csv('adult_dataset/adult.test', names = column)
    test_df = test_df[1:].reset_index(drop=True)

    
    test_df['label'] = test_df['label'].astype(object)
    test_df['column_0'] = test_df['column_0'].astype(float)
   
    test_df.loc[:, 'label'].replace([" <=50K."," >50K."], ["Yes","No"], inplace=True)
    
       
    #https://stackoverflow.com/a/47915060
    test_df=test_df.replace(' ?',np.nan).dropna(axis = 0, how = 'any')
    
    test_df = test_df.reset_index(drop=True)
    
    train_len = len(train_df)
    data = pd.concat([train_df,test_df]).reset_index(drop=True)
    
    
    est = KBinsDiscretizer(n_bins=number_bin, encode='ordinal', strategy='uniform')
#     print(classes,features)

#     fill missing(nan) value with mean
    for i in range(14):
        if(data.dtypes[i] == 'float64' or data.dtypes[i] == 'int64'):
            data[data.keys()[i]] = data[data.keys()[i]].fillna(data[data.keys()[i]].mean())
            data[data.keys()[i]] = est.fit_transform(data[data.keys()[i]].to_numpy().reshape(-1,1))
#             print(data[data.keys()[i]].unique())


    train_df = shuffle(data[0:train_len], random_state=20).reset_index(drop=True)
    test_df = shuffle(data[train_len:], random_state=20).reset_index(drop=True)
    
    train_df.to_csv (r'adult_train.csv', index = False, header=True)
    test_df.to_csv (r'adult_test.csv', index = False, header=True)


Telco_processing()
# Credit_processing()
# Adult_processing()

In [5]:
def datafetcher(filename, dataset_name, number_bin=10 ):
    data, features, classes = getdataframe(filename)
    est = KBinsDiscretizer(n_bins=number_bin, encode='ordinal', strategy='uniform')
#     print(classes,features)

#     fill missing(nan) value with mean
    for i in range(len(features)):
        if(data.dtypes[i] == 'float64' or data.dtypes[i] == 'int64'):
            data[data.keys()[i]] = data[data.keys()[i]].fillna(data[data.keys()[i]].mean())
            data[data.keys()[i]] = est.fit_transform(data[data.keys()[i]].to_numpy().reshape(-1,1))
#             print(data[data.keys()[i]].unique())


    data = data.dropna()
    data = shuffle(data, random_state=20)
#     get train, test split
    unique_val = getuniquevalues(data,classes)
    
    first_df = data[data[classes] == unique_val[0]]
    second_df = data[data[classes] == unique_val[1]]
    
    test_df = pd.concat([first_df[-(int)(0.2*len(first_df)):],second_df[-(int)(0.2*len(second_df)):]])
    test_df = shuffle(test_df, random_state=20).reset_index(drop=True)
    train_df = pd.concat([first_df[:(int)(0.8*len(first_df))],second_df[0:(int)(0.8*len(second_df))]])
    train_df = shuffle(train_df, random_state=20).reset_index(drop=True)       
    
    train_df.to_csv (dataset_name+'_train.csv', index = False, header=True)
    test_df.to_csv (dataset_name+'_test.csv', index = False, header=True)
                                               

# datafetcher('creditcard.csv', 'credit')
datafetcher('telco.csv', 'telco')

# Decision-Tree

In [6]:
# https://medium.com/@pytholabs/decision-trees-from-scratch-using-id3-python-coding-it-up-6b79e3458de4

def find_entropy(df, classes):
    entropy = 0
    values = df[classes].unique()
    for value in values:
        fraction = df[classes].value_counts()[value]/len(df[classes])
        entropy += -fraction*log(fraction)
    return entropy

def DiscretizedEntropy(df, attribute, Class):
    target_variables = df[Class].unique()  
    variables = df[attribute].unique()    
    entropy2 = 0
    final_entropy = 20000
    final_var = -10000
    
    for variable in variables:
        entropy = 0
        for target_variable in target_variables:
            
            # others yes/no
            num = len(df[attribute][df[attribute] > variable][df[Class] ==target_variable])
            den = len(df[attribute][df[attribute] > variable])
            fraction = num/(den+eps)
            fraction2 = den/len(df[attribute])
            entropy += -fraction2*fraction*log(fraction+eps)
            
            # now yes/no
            num = len(df[attribute][df[attribute] <= variable][df[Class] ==target_variable])
            den = len(df[attribute][df[attribute] <= variable])
            fraction = num/(den+eps)
            fraction2 = den/len(df[attribute])
            entropy += -fraction2*fraction*log(fraction+eps)
            
        if(entropy <= final_entropy):
            final_entropy, final_var = entropy, variable
    return abs(final_entropy), final_var

def find_entropy_attribute(df, attribute, Class):
    target_variables = df[Class].unique()  
    variables = df[attribute].unique()   
    entropy2 = 0
    for variable in variables:
        entropy = 0
        for target_variable in target_variables:
            num = len(df[attribute][df[attribute]==variable][df[Class] ==target_variable])
            den = len(df[attribute][df[attribute]==variable])
            fraction = num/(den+eps)
            entropy += -fraction*log(fraction+eps)
        # total object in this attribute divided by total object in parent attribute    
        fraction2 = den/len(df)
        entropy2 += -fraction2*entropy
    return abs(entropy2), -1


def find_winner(df, features, classes):
    partition_att = []
    id3 = []
    keys = []
    df_entropy = find_entropy(df, classes)
    
       
    for key in features:
        if df.dtypes[(key)] == 'object':
            final_entropy, final_var = find_entropy_attribute(df, key, classes)
        else:
            final_entropy, final_var = DiscretizedEntropy(df, key, classes)

        id3.append(df_entropy - final_entropy)
        partition_att.append(final_var)
        keys.append(key)
        
    index = np.argmax(id3)
#     print(keys[index], partition_att[index])
    
    return keys[index], partition_att[index]

def get_object_subtable(dataframe, attribute, variable):
    return dataframe[dataframe[attribute] == variable].reset_index(drop=True)

def get_discretized_subtable(dataframe, attribute, variable):
#     print(attribute,df.dtypes[(attribute)],type(variable))
    return dataframe[dataframe[attribute] <= variable].reset_index(drop=True), \
        dataframe[dataframe[attribute] > variable].reset_index(drop=True)



In [7]:
def get_plurality_value(dataframe, classes):
    
    values = getuniquevalues(dataframe, classes)
#     values = dataframe[classes].unique()
    global_max = 0
    majority_class = values[0]
    for value in values:
        
        curr = dataframe[classes].value_counts()[value]
        if (curr > global_max):
            global_max = curr
            majority_class = value
     
    return str(majority_class)


def getuniquevalues_with_count(dataframe, attribute):
    return np.unique(dataframe[attribute], return_counts=True)




In [8]:
# https://medium.com/@pytholabs/decision-trees-from-scratch-using-id3-python-coding-it-up-6b79e3458de4

def makeTree(df, features, Class, depth, max_depth):
    
    majority_class = get_plurality_value(df, Class)
    
    # depth reached
    if depth == max_depth:
        return str(majority_class)
    
    # attributes empty, 3rd condition
    if len(features) == 0:
        print('empty_vessel')
        return str(majority_class)
    
    root_feature, partition_value = find_winner(df, features, Class)
    print(root_feature)
       
#     if tree == None:
    tree = {}
    tree[root_feature] = {}
        
    
    # categorical feature
    if partition_value == -1:
        
        all_children =  getactuallabels(root_feature)
#         all_children =  getuniquevalues(df, root_feature)
        
        for value in all_children:
            
            subtable = get_object_subtable(df, root_feature, value)
            # how many yes, how many no
            distinct_values, count = getuniquevalues_with_count(subtable, Class)
            
            # all examples have same classification, 2nd condition
            if len(count) == 1:
                tree[root_feature][value] = distinct_values[0]
                
            #  example empty, return plurality of parent   
            elif(len(subtable) == 0):
                tree[root_feature][value] = str(majority_class)
                
            # build subtrees    
            else:
                features.remove(root_feature)
                tree[root_feature][value] = makeTree(subtable, features, Class, depth+1, max_depth)
                features.append(root_feature)
                
    # continious feature
    else:
        
        small_table, large_table = get_discretized_subtable(df, root_feature, partition_value)
        
        small_distinct_values, small_count = getuniquevalues_with_count(small_table, Class)
        
        # all examples have same classification, 2nd condition
        if len(small_count) == 1:
            tree[root_feature]["<="+str(partition_value)] = small_distinct_values[0]
            
        # example empty, return plurality of parent       
        elif(len(small_table) == 0):
            tree[root_feature]["<=" + str(partition_value)] = str(majority_class)
        
        # build subtrees    
        else:
            features.remove(root_feature)
            tree[root_feature]["<=" + str(partition_value)] = makeTree(small_table, features, Class, depth+1,max_depth)
            features.append(root_feature)

        
        large_distinct_values, large_count = getuniquevalues_with_count(large_table, Class)
        
        # all examples have same classification, 2nd condition
        if len(large_count) == 1:
            tree[root_feature][str(partition_value)] = large_distinct_values[0]
            
        # example empty, return plurality of parent       
        elif(len(large_table) == 0):
            tree[root_feature][str(partition_value)] = str(majority_class)
            
        # build subtrees        
        else:
            features.remove(root_feature)
            tree[root_feature][str(partition_value)] = makeTree(large_table, features, Class, depth+1,max_depth)
            features.append(root_feature)
            
    return tree
    
# test_df, train_df, df, features, classes = datafetcher('telco.csv')


# tree = makeTree(train_df, features, classes, 0, 2)

In [9]:
# pprint.pprint(tree)

# Accuracy

In [10]:
# <= maane less or equal
def find_accuracy(test_df, features, tree, df):
    
#     print(test_df)
    
    current_node = ''
    # iterate till the end
    while True:
        try:
            key_list = list(tree.keys())
            
            
            
            flag = 1
#             print('here0',tree)
            # check if a feature node 
            for i in key_list:
                if i in features:
                    flag = 0
                    current_node = i
                    break
            
            # value node 
            if flag:
                flag_2 = 1
#                 print('here1',tree,current_node)
                # object value
                if df.dtypes[current_node] == 'object':
#                     print("is-that-you-nith")
                    for i in key_list:
                        if i in test_df[current_node]:
                            tree = tree[i]
                            flag_2 = 0
                            break
                            
                    if flag_2:
                        print('here')
                        tree = tree[key_list[0]]
                            
                # discretized value
                else:
#                     print("is-that-you-discrete")
                    split_value = 0.0
                    
                    # get split value
                    if "<=" in key_list[0]:
                        split_value = float(key_list[1])
                    else:
                        split_value = float(key_list[0])
                    
                    # falls in the small tree
                    if float(test_df[current_node]) <= split_value:
                        tree = tree["<=" + str(split_value)]
                    # falls in the large tree   
                    else:
                        tree = tree[str(split_value)]
                 
                
            # feature node            
            else:
                tree = tree[current_node]
#                 print('here2',tree)
                
            
        except:
#             print("pop",tree ,test_df[-1])
            return str(tree), str(tree) == test_df[-1]  ## <>
    print("your-code-is-wrong")    
    return None, False    ## <>

def total_accuracy(test_df, features, tree):
    total_yes, total_no = 0, 0
    true_positive, true_negative = 0, 0 
    false_positive, false_negative = 0, 0 
    for i in range(len(test_df)):
        if i % 500 == 0:
            print(i,'th')
        predictedlabel, is_correct = find_accuracy(test_df.iloc[i], features, tree, test_df.head()) ## <> 
        if is_correct: ## <>
            total_yes += 1
            if test_df.iloc[i][-1] == 'Yes' :
                true_positive += 1
            else:
                true_negative += 1
        else:
            total_no += 1
            #should-have-been-true,but-the-model-said-false
            if test_df.iloc[i][-1] == 'Yes' :
                false_negative += 1
            else:
                false_positive += 1
                
    accuracy = (total_yes*100/(total_yes+total_no+eps))
    tpr = true_positive*100/(true_positive+false_negative+eps)
    tnr = true_negative*100/(true_negative+false_positive+eps)
    precision = true_positive*100/(true_positive+false_positive+eps)
    fdr = false_positive*100/(true_positive+false_positive+eps)
    f1_score = true_positive*200/((2*true_positive) + total_no)
                
    chart ={'Accuracy':accuracy, 'Sensitivity':tpr,'Specificity':tnr,'Precision':precision,
            'FDR':fdr, 'F1_Score':f1_score}
    df = pd.DataFrame(chart,columns=['Accuracy','Sensitivity','Specificity','Precision','FDR','F1_Score'], index=[0])


#     print(total_yes, total_no)
    return df

# total_accuracy(test_df, features, tree)
# for i in range(len(test_df)):
#     find_accuracy(test_df.iloc[i], features, tree)

# Run Decision Tree

In [11]:
def runDecisionTree(train_df, features):
#     test_df, train_df, df, features, classes = datafetcher(filename)
    tree = makeTree(train_df, features, classes, 0, len(features))
    print('done')
    
    return tree

filename= 'telco'   
train_df, features, classes = getdataframe(filename+'_train.csv')
test_df, features, classes = getdataframe(filename+'_test.csv')

def getactuallabels(root_feature, dataframe=train_df):
    return dataframe[root_feature].unique()
    
tree = runDecisionTree(train_df, features)

Contract
InternetService
TotalCharges
PaymentMethod
MonthlyCharges
MultipleLines
gender
Dependents
DeviceProtection
TechSupport
StreamingTV
Partner
OnlineSecurity
SeniorCitizen
tenure
PhoneService
OnlineBackup
StreamingMovies
PaperlessBilling
PaperlessBilling
OnlineBackup
OnlineBackup
PaperlessBilling
TechSupport
Partner
OnlineSecurity
StreamingMovies
TechSupport
Dependents
tenure
tenure
gender
tenure
SeniorCitizen
OnlineBackup
PaperlessBilling
tenure
OnlineBackup
MultipleLines
DeviceProtection
OnlineBackup
OnlineSecurity
MonthlyCharges
MonthlyCharges
Dependents
MultipleLines
TechSupport
PaperlessBilling
tenure
StreamingMovies
OnlineSecurity
OnlineSecurity
gender
SeniorCitizen
OnlineBackup
tenure
OnlineSecurity
tenure
SeniorCitizen
Partner
PaperlessBilling
OnlineBackup
OnlineSecurity
gender
TechSupport
Dependents
gender
PhoneService
StreamingTV
StreamingMovies
MultipleLines
Dependents
TechSupport
PaymentMethod
Dependents
gender
PaymentMethod
tenure
MultipleLines
Dependents
PaperlessBil

Dependents
StreamingMovies
TechSupport
PaymentMethod
Partner
OnlineBackup
DeviceProtection
PaymentMethod
TotalCharges
gender
StreamingTV
PhoneService
MonthlyCharges
PaperlessBilling
gender
PaymentMethod
PaperlessBilling
gender
TotalCharges
Partner
gender
StreamingTV
MonthlyCharges
TotalCharges
PhoneService
DeviceProtection
OnlineBackup
TechSupport
PaperlessBilling
StreamingTV
OnlineBackup
TotalCharges
gender
MonthlyCharges
PaperlessBilling
TechSupport
OnlineBackup
DeviceProtection
PhoneService
StreamingTV
MonthlyCharges
StreamingTV
DeviceProtection
OnlineBackup
PhoneService
TechSupport
PaperlessBilling
PaperlessBilling
TechSupport
PhoneService
OnlineBackup
DeviceProtection
StreamingTV
gender
Partner
PaperlessBilling
TotalCharges
PaymentMethod
TotalCharges
StreamingMovies
MultipleLines
SeniorCitizen
DeviceProtection
PaperlessBilling
Partner
OnlineBackup
gender
gender
MonthlyCharges
PaperlessBilling
gender
StreamingTV
Dependents
SeniorCitizen
DeviceProtection
gender
TechSupport
OnlineBac

MultipleLines
Dependents
gender
Partner
PaperlessBilling
TotalCharges
TechSupport
OnlineBackup
DeviceProtection
MonthlyCharges
StreamingMovies
OnlineSecurity
PhoneService
StreamingTV
StreamingTV
PhoneService
OnlineSecurity
StreamingMovies
MonthlyCharges
DeviceProtection
OnlineBackup
TechSupport
TotalCharges
Partner
PaperlessBilling
TotalCharges
TechSupport
OnlineBackup
DeviceProtection
MonthlyCharges
StreamingMovies
OnlineSecurity
PhoneService
StreamingTV
StreamingTV
PhoneService
OnlineSecurity
StreamingMovies
MonthlyCharges
DeviceProtection
OnlineBackup
TechSupport
TotalCharges
Partner
gender
TotalCharges
TechSupport
OnlineBackup
DeviceProtection
MonthlyCharges
StreamingMovies
OnlineSecurity
PhoneService
StreamingTV
PaperlessBilling
PaperlessBilling
gender
StreamingTV
PhoneService
OnlineSecurity
StreamingMovies
MonthlyCharges
DeviceProtection
OnlineBackup
TechSupport
TotalCharges
TotalCharges
TechSupport
OnlineBackup
DeviceProtection
MonthlyCharges
StreamingMovies
OnlineSecurity
Phone

gender
OnlineSecurity
MonthlyCharges
PhoneService
StreamingTV
DeviceProtection
MultipleLines
StreamingTV
PhoneService
MonthlyCharges
OnlineSecurity
gender
Dependents
TechSupport
OnlineBackup
MonthlyCharges
Dependents
StreamingMovies
TechSupport
PhoneService
PhoneService
PaperlessBilling
gender
StreamingTV
MultipleLines
OnlineBackup
gender
gender
OnlineSecurity
DeviceProtection
MultipleLines
StreamingMovies
OnlineSecurity
SeniorCitizen
Partner
OnlineBackup
gender
PaperlessBilling
gender
TechSupport
PhoneService
StreamingMovies
PhoneService
gender
OnlineBackup
Partner
OnlineSecurity
tenure
PaperlessBilling
Dependents
DeviceProtection
TechSupport
tenure
Partner
OnlineBackup
MonthlyCharges
gender
Dependents
TechSupport
PaperlessBilling
MultipleLines
PaperlessBilling
SeniorCitizen
StreamingMovies
PhoneService
StreamingTV
DeviceProtection
PaperlessBilling
TechSupport
Dependents
DeviceProtection
MultipleLines
MonthlyCharges
OnlineBackup
Dependents
TechSupport
StreamingTV
StreamingMovies
Senio

In [12]:
import json

with open(r'Generated_files/telco.json', 'w') as f:
    json.dump(tree, f)   

In [13]:
df_train = total_accuracy(train_df, features, tree)
print(df_train)

0 th
500 th
1000 th
1500 th
2000 th
2500 th
3000 th
3500 th
4000 th
4500 th
5000 th
5500 th
   Accuracy  Sensitivity  Specificity  Precision        FDR   F1_Score
0     91.68    84.013378    94.455206  84.579125  15.420875  84.295302


In [14]:
df_test = total_accuracy(test_df, features, tree)
print(df_test)

0 th
500 th
1000 th
    Accuracy  Sensitivity  Specificity  Precision        FDR   F1_Score
0  73.879004    53.351206     81.29845  50.765306  49.234694  52.026144


# PDF Generator

In [15]:
# https://towardsdatascience.com/creating-pdf-reports-with-python-pdfkit-and-jinja2-templates-64a89158fa2d
# install this- https://wkhtmltopdf.org/downloads.html
html_file_name='report_telco.html'
filenames = ['telco test' , 'telco train']
df_s = [df_test, df_train]
data_frames = []
for iter in range(len(filenames)):
#     df = {}
#     df['Accuracy:'] = [100.00]
#     df['True Positive:'] = [99.8]
#     df['True Negative:'] = [10.2]
#     df = pd.DataFrame(df)
#     df.index.name = 'year'
    data_frames.append({'df':df_s[iter],
        'filename':filenames[iter]})
    
import jinja2

templateLoader = jinja2.FileSystemLoader(searchpath="./")
templateEnv = jinja2.Environment(loader=templateLoader)
TEMPLATE_FILE = "pdf_accuracy.html"
template = templateEnv.get_template(TEMPLATE_FILE)



outputText = template.render(data_frames=data_frames, filename='report')
html_file = open(html_file_name, 'w')
html_file.write(outputText)
html_file.close()
    


# Adaboost

In [16]:
def total_accuracy_adaboost(test_df, features, tree, weights):
    error = 0.0 
    accuracy_map = np.ones((len(test_df)), dtype=int)
    predict_map = np.ones((len(test_df)), dtype=int)
    total_yes, total_no = 0, 0
    for i in range(len(test_df)):
        if i % 1500 == 0:
            print(i)
            
        ### <> start
        predictedlabel, is_correct = find_accuracy(test_df.iloc[i], features, tree, test_df.head()) ## <>
        if predictedlabel == 'Yes':
            predict_map[i] = 1
        else:
            predict_map[i] = -1
        ### <> end
        
        if is_correct:
            total_yes += 1
            accuracy_map[i] = 1
        else:
            total_no += 1
            error = error + weights[i]
            accuracy_map[i] = -1

    return 1-error, accuracy_map, predict_map ## <>

In [17]:
# https://thispointer.com/pandas-how-to-create-an-empty-dataframe-and-append-rows-columns-to-it-in-python/


def resample(train_df, weighted_probabilities):
    
    # https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.random.choice.html
    value = np.random.choice(len(train_df), len(train_df), p=weighted_probabilities)
    
    resampled_df = pd.DataFrame(columns=train_df.columns)
    
    for iter in range(1000):#(len(train_df)):
        sampled_index = int(weighted_probabilities[iter])
        resampled_df = resampled_df.append(train_df.iloc[sampled_index], ignore_index=True)
        
    return resampled_df

def resample_pd(train_df, weighted_probabilities):
    #https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sample.html
#     resampled_df = train_df.sample(n=len(train_df),replace=True, random_state=1, weights=weighted_probabilities)
    resampled_df = train_df.sample(n=len(train_df),replace=True, weights=weighted_probabilities)
#     print(len(resampled_df))
    resampled_df = resampled_df.reset_index(drop=True)
#     print(len(resampled_df))
    return resampled_df

 
# test_df, train_df, df, features, classes = datafetcher('telco.csv')
# weighted_probabilities = np.ones(len(train_df))
# weighted_probabilities = weighted_probabilities/weighted_probabilities.sum()
# resampled_df = resample(train_df, weighted_probabilities)

In [18]:
# tree = makeTree(resampled_df, features, classes, 0, 1)
# accuracy, accuracy_map = total_accuracy_adaboost(resampled_df, features, tree)
# print(accuracy)
# resampled_df.head()

In [19]:
def AdaBoost(train_df, features, boost_iter):
    
    sample_size = len(train_df)
    #     https://docs.scipy.org/doc/numpy/reference/generated/numpy.zeros.html
    decision_stumps = np.zeros((boost_iter, sample_size), dtype=int)
    all_trees = []    
    weighted_hypothesis = np.zeros(boost_iter)
    weighted_probabilities = np.ones(len(train_df))
    weighted_probabilities = weighted_probabilities/weighted_probabilities.sum()
    # normalization - https://stackoverflow.com/a/43644348
    
    iter = 0
    while iter < (boost_iter):
#         print(weighted_probabilities)
        resampled_df = resample_pd(train_df, weighted_probabilities)
#         print(resampled_df.head())
        tree = makeTree(resampled_df, features, classes, 0, 1)
#         pprint.pprint(tree)
        accuracy, accuracy_map, _ = total_accuracy_adaboost(resampled_df, features, tree, weighted_probabilities) ## <>
        print('accuracy',accuracy)
        if accuracy > 0.49:
            decision_stumps[iter] = accuracy_map
            # print(decision_stumps[iter], accuracy_map)
            all_trees.append(tree)
            # weight updation
            for dpnt in range(len(resampled_df)):
                if accuracy_map[dpnt] == 1:
                    weighted_probabilities[dpnt] = weighted_probabilities[dpnt] * ((1-accuracy)/(accuracy+eps))
                    if weighted_probabilities[dpnt] == 0:
                        weighted_probabilities[dpnt] = eps
            weighted_probabilities = weighted_probabilities/weighted_probabilities.sum()
            print(weighted_probabilities.sum())
            # weight updation
            
            weighted_hypothesis[iter] = log(accuracy/(1-accuracy+eps))
            
            iter += 1

    return decision_stumps, weighted_hypothesis, all_trees  


In [20]:
def getWeightedAccuracy(df, features, weighted_hypothesis, all_trees):
    print(len(weighted_hypothesis), len(all_trees))
    
    classes = df[df.keys()[-1]].to_numpy() == 'Yes' ## <>
#     classes = (classes == 'Yes').to_numpy() ## <>
    
    sample_size = len(df)
    boost_iter = len(all_trees)
    #     https://docs.scipy.org/doc/numpy/reference/generated/numpy.zeros.html
    decision_stumps = np.zeros((sample_size), dtype=float)
    majority_pred = np.zeros((sample_size), dtype=float) ## <>

    dummy_hypo = np.ones((sample_size), dtype=float)
    
    dummy_hypo = dummy_hypo/dummy_hypo.sum()
    
    for iter in range(boost_iter):
        _, accuracy_map, predict_map = total_accuracy_adaboost(df, features, all_trees[iter], dummy_hypo) ## <>
        decision_stumps += accuracy_map * weighted_hypothesis[iter]
        majority_pred += predict_map * weighted_hypothesis[iter]

    ## <> start
    majority_pred = majority_pred >= 0 
    adaboost_acc = 100 * np.sum(majority_pred == classes, axis=0) / sample_size
    
    print(adaboost_acc)
    
    print(100*(np.sum(np.array(decision_stumps) >= 0, axis=0))/sample_size)
    
    return adaboost_acc, (100*(np.sum(np.array(decision_stumps) >= 0, axis=0))/sample_size), majority_pred
    ## <> end
    #print(decision_stumps)
        

In [21]:
def RunAdaBoost(train_df, features, boost_iter):
    
#     print(decision_stumps)
    decision_stumps, weighted_hypothesis, all_trees = AdaBoost(train_df, features, boost_iter)
    return decision_stumps, weighted_hypothesis, all_trees

filename= 'telco'   
train_df, features, classes = getdataframe(filename+'_train.csv')
test_df, _, _ = getdataframe(filename+'_test.csv')   

df_test, df_train = {}, {}

for i in range(5,21,5):
    decision_stumps, weighted_hypothesis, all_trees = RunAdaBoost(train_df, features, i)
    _, test_accuracy, majority_pred = getWeightedAccuracy(test_df, features, weighted_hypothesis, all_trees)
    
    df_test['accuracy for '+ str(i) + ' stumps'] = [test_accuracy]
    
    _, train_accuracy, majority_pred = getWeightedAccuracy(train_df, features, weighted_hypothesis, all_trees)
    df_train['accuracy for '+ str(i) + ' stumps'] = [train_accuracy]
    
    print(test_accuracy, train_accuracy)
    
df_test = pd.DataFrame(df_test)
df_train = pd.DataFrame(df_train)

Contract
0
1500
3000
4500
accuracy 0.7192888888888904
0.9999999999999999
Contract
0
1500
3000
4500
accuracy 0.7317705162011157
0.9999999999999999
Contract
0
1500
3000
4500
accuracy 0.7472683984421871
0.9999999999999999
Contract
0
1500
3000
4500
accuracy 0.7485775291869732
0.9999999999999999
Contract
0
1500
3000
4500
accuracy 0.7277694384272855
1.0
5 5
0
0
0
0
0
73.45195729537366
73.45195729537366
5 5
0
1500
3000
4500
0
1500
3000
4500
0
1500
3000
4500
0
1500
3000
4500
0
1500
3000
4500
73.42222222222222
73.42222222222222
73.45195729537366 73.42222222222222
Contract
0
1500
3000
4500
accuracy 0.7338666666666681
1.0
Contract
0
1500
3000
4500
accuracy 0.7318746019170153
0.9999999999999999
Contract
0
1500
3000
4500
accuracy 0.7300964162985859
1.0000000000000002
Contract
0
1500
3000
4500
accuracy 0.7456739270766537
1.0
Contract
0
1500
3000
4500
accuracy 0.7396456723286364
1.0000000000000002
Contract
0
1500
3000
4500
accuracy 0.7697601687796752
1.0
Contract
0
1500
3000
4500
accuracy 0.748983698

In [27]:
# df_train

df_test

Unnamed: 0,accuracy for 5 stumps,accuracy for 10 stumps,accuracy for 15 stumps,accuracy for 20 stumps
0,73.451957,73.451957,73.451957,73.451957


# PDF Generator

In [23]:
# https://towardsdatascience.com/creating-pdf-reports-with-python-pdfkit-and-jinja2-templates-64a89158fa2d
# install this- https://wkhtmltopdf.org/downloads.html
html_file_name='report_telco_adaboost.html'
filenames = ['telco test adaboost' , 'telco train adaboost']
df_s = [df_test, df_train]
data_frames = []
for iter in range(len(filenames)):
#     df = {}
#     df['Accuracy:'] = [100.00]
#     df['True Positive:'] = [99.8]
#     df['True Negative:'] = [10.2]
#     df = pd.DataFrame(df)
#     df.index.name = 'year'
    data_frames.append({'df':df_s[iter],
        'filename':filenames[iter]})
    
import jinja2

templateLoader = jinja2.FileSystemLoader(searchpath="./")
templateEnv = jinja2.Environment(loader=templateLoader)
TEMPLATE_FILE = "pdf_accuracy.html"
template = templateEnv.get_template(TEMPLATE_FILE)

# for d in data_frames:
#     outputText = template.render(df=d['df'], filename=d['filename'])
#     html_file = open(d['filename'] + '.html', 'w')
#     html_file.write(outputText)
#     html_file.close()
    
# import pdfkit
# for i in filenames:
#     pdfkit.from_file(str(i) + '.html', str(i) + '.pdf')    

outputText = template.render(data_frames=data_frames, filename='report')
html_file = open(html_file_name, 'w')
html_file.write(outputText)
html_file.close()
    


In [24]:
import glob
html_files = glob.glob('report_*.html')
html_files = list(html_files)
print(html_files)

['report_credit.html', 'report_telco.html', 'report_telco_adaboost.html']


In [25]:
import pdfkit
pdfkit.from_file(html_files, '1505052.pdf')    
#pdfkit.from_file(['file1.html', 'file2.html'], 'out.pdf')

Loading pages (1/6)
Printing pages (6/6)


True