## Task 1

In [1135]:
import pandas as pd
import numpy as np

In [1136]:
all_cols = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]
cont_cols = ["age", "fnlwgt", "education-num", "capital-gain", "capital-loss", "hours-per-week"]
cat_cols = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]

In [1137]:
random_states = [80, 225, 2067, 4133, 6741, 9931, 58146, 79327, 3114652, 8538320]
random_state = random_states[9]

### The Dataset

The dataset provided had 32561 examples, with each example containing 14 features and a label indicating whether the annual income was $\le$ 50K. Out of the 14 features, 6 features had continuous values (namely, "age", "fnlwgt", "education-num", "capital-gain", "capital-loss" and "hours-per-week") and the remaining 8 features had discrete values (namely, "workclass", "education", "marital-status", "occupation", "relationship", "race", "sex" and "native-country"). There were 24720 examples with income $\le$ 50K and 7841 examples with income $>$ 50K.

Here are some of the examples from the dataset:

In [1138]:
df = pd.read_csv("adult.data", index_col=False, names=all_cols)
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [1139]:
df['income'].value_counts()

 <=50K    24720
 >50K      7841
Name: income, dtype: int64

#### Fill Missing Values

There were 2399 examples which had one or more missing values. We filled the missing values by going over each column and replacing the missing values with the most commonly occurring non-missing value in case of discrete features and with the mean of all the non-missing values in case of continuous features.

In [1140]:
df[df.isin([" ?"]).any(axis=1)]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
14,40,Private,121772,Assoc-voc,11,Married-civ-spouse,Craft-repair,Husband,Asian-Pac-Islander,Male,0,0,40,?,>50K
27,54,?,180211,Some-college,10,Married-civ-spouse,?,Husband,Asian-Pac-Islander,Male,0,0,60,South,>50K
38,31,Private,84154,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,38,?,>50K
51,18,Private,226956,HS-grad,9,Never-married,Other-service,Own-child,White,Female,0,0,30,?,<=50K
61,32,?,293936,7th-8th,4,Married-spouse-absent,?,Not-in-family,White,Male,0,0,40,?,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32530,35,?,320084,Bachelors,13,Married-civ-spouse,?,Wife,White,Female,0,0,55,United-States,>50K
32531,30,?,33811,Bachelors,13,Never-married,?,Not-in-family,Asian-Pac-Islander,Female,0,0,99,United-States,<=50K
32539,71,?,287372,Doctorate,16,Married-civ-spouse,?,Husband,White,Male,0,0,10,United-States,>50K
32541,41,?,202822,HS-grad,9,Separated,?,Not-in-family,Black,Female,0,0,32,United-States,<=50K


In [1141]:
df = df.replace(" ?", np.nan)
df[df.isin([" ?"]).any(axis=1)]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income


In [1142]:
df[df.isnull().any(axis=1)]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
14,40,Private,121772,Assoc-voc,11,Married-civ-spouse,Craft-repair,Husband,Asian-Pac-Islander,Male,0,0,40,,>50K
27,54,,180211,Some-college,10,Married-civ-spouse,,Husband,Asian-Pac-Islander,Male,0,0,60,South,>50K
38,31,Private,84154,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,38,,>50K
51,18,Private,226956,HS-grad,9,Never-married,Other-service,Own-child,White,Female,0,0,30,,<=50K
61,32,,293936,7th-8th,4,Married-spouse-absent,,Not-in-family,White,Male,0,0,40,,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32530,35,,320084,Bachelors,13,Married-civ-spouse,,Wife,White,Female,0,0,55,United-States,>50K
32531,30,,33811,Bachelors,13,Never-married,,Not-in-family,Asian-Pac-Islander,Female,0,0,99,United-States,<=50K
32539,71,,287372,Doctorate,16,Married-civ-spouse,,Husband,White,Male,0,0,10,United-States,>50K
32541,41,,202822,HS-grad,9,Separated,,Not-in-family,Black,Female,0,0,32,United-States,<=50K


In [1143]:
# cont_cols = ["age", "fnlwgt", "education-num", "capital-gain", "capital-loss", "hours-per-week"]
for column in cont_cols:
    val = df[column].mean(skipna=True)
    df[column] = df[column].replace(np.nan, val)

In [1144]:
# cat_cols = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]
for column in cat_cols:
    df[column] = df[column].replace(np.nan, drop_df[column].mode()[0])

In [1145]:
df[df.isnull().any(axis=1)]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income


In [1146]:
# df.to_csv("nb_filled.csv", index=None)

#### Create Training and Testing Sets

We split the dataset and used 67% of the dataset for training and 33% for testing. Our testing split contained 21816 examples and training split contained 10745 examples.

In [1147]:
def create_split(df, rs):
    training_split = df.sample(frac = 0.67, random_state=rs)
    testing_split = df.drop(training_split.index)

    return (training_split, testing_split)

In [1148]:
df = pd.read_csv('nb_filled.csv')
train, test = create_split(df, random_state)

Here are some examples from the training split:

In [1149]:
train

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
23092,34,Private,176711,HS-grad,9,Divorced,Other-service,Own-child,White,Male,0,0,40,United-States,<=50K
8428,49,Private,166857,9th,5,Divorced,Handlers-cleaners,Not-in-family,White,Female,0,0,40,United-States,<=50K
14308,45,Private,200559,Assoc-voc,11,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,70,United-States,<=50K
15619,48,Private,182566,Bachelors,13,Married-civ-spouse,Sales,Husband,Black,Male,0,0,40,United-States,>50K
25338,56,Private,147202,Some-college,10,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,45,Germany,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18203,58,Self-emp-not-inc,165315,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,35,United-States,>50K
3930,20,Private,400443,HS-grad,9,Never-married,Sales,Own-child,White,Female,0,0,20,United-States,<=50K
470,60,Private,201965,Some-college,10,Never-married,Prof-specialty,Unmarried,White,Male,0,0,40,United-States,>50K
15562,33,Local-gov,281784,Bachelors,13,Never-married,Tech-support,Not-in-family,Black,Male,0,1564,52,United-States,>50K


Here are some examples from the testing split:

In [1150]:
test

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
11,30,State-gov,141297,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K
14,40,Private,121772,Assoc-voc,11,Married-civ-spouse,Craft-repair,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32541,41,Private,202822,HS-grad,9,Separated,Prof-specialty,Not-in-family,Black,Female,0,0,32,United-States,<=50K
32544,31,Private,199655,Masters,14,Divorced,Other-service,Not-in-family,Other,Female,0,0,30,United-States,<=50K
32545,39,Local-gov,111499,Assoc-acdm,12,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,20,United-States,>50K
32550,43,Self-emp-not-inc,27242,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,50,United-States,<=50K


In [1151]:
bin_dict = {}
for col in cont_cols:
    result, bins = pd.cut(df[col], bins=10, retbins=True)
    bin_dict[col] = bins
bin_dict

{'age': array([16.927, 24.3  , 31.6  , 38.9  , 46.2  , 53.5  , 60.8  , 68.1  ,
        75.4  , 82.7  , 90.   ]),
 'fnlwgt': array([  10812.58,  159527.  ,  306769.  ,  454011.  ,  601253.  ,
         748495.  ,  895737.  , 1042979.  , 1190221.  , 1337463.  ,
        1484705.  ]),
 'education-num': array([ 0.985,  2.5  ,  4.   ,  5.5  ,  7.   ,  8.5  , 10.   , 11.5  ,
        13.   , 14.5  , 16.   ]),
 'capital-gain': array([  -99.999,  9999.9  , 19999.8  , 29999.7  , 39999.6  , 49999.5  ,
        59999.4  , 69999.3  , 79999.2  , 89999.1  , 99999.   ]),
 'capital-loss': array([  -4.356,  435.6  ,  871.2  , 1306.8  , 1742.4  , 2178.   ,
        2613.6  , 3049.2  , 3484.8  , 3920.4  , 4356.   ]),
 'hours-per-week': array([ 0.902, 10.8  , 20.6  , 30.4  , 40.2  , 50.   , 59.8  , 69.6  ,
        79.4  , 89.2  , 99.   ])}

In [1152]:
cat_dict = {}
for col in cat_cols:
    cat_dict[col] = df[col].unique()
cat_dict

{'workclass': array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
        ' Local-gov', ' Self-emp-inc', ' Without-pay', ' Never-worked'],
       dtype=object),
 'education': array([' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',
        ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th',
        ' Doctorate', ' Prof-school', ' 5th-6th', ' 10th', ' 1st-4th',
        ' Preschool', ' 12th'], dtype=object),
 'marital-status': array([' Never-married', ' Married-civ-spouse', ' Divorced',
        ' Married-spouse-absent', ' Separated', ' Married-AF-spouse',
        ' Widowed'], dtype=object),
 'occupation': array([' Adm-clerical', ' Exec-managerial', ' Handlers-cleaners',
        ' Prof-specialty', ' Other-service', ' Sales', ' Craft-repair',
        ' Transport-moving', ' Farming-fishing', ' Machine-op-inspct',
        ' Tech-support', ' Protective-serv', ' Armed-Forces',
        ' Priv-house-serv'], dtype=object),
 'relationship': array([' Not-in-family', ' H

## Task 2

### Calculating Prior Probability

We calculated prior probability of each of two classes using maximum likelihood estimator.

In [1153]:
def calculate_prior(df):
    return df['income'].value_counts(normalize=True).to_dict()

In [1154]:
prior = calculate_prior(train)
prior

{' <=50K': 0.7582508250825083, ' >50K': 0.24174917491749176}

### Calculating Conditional Probability

We calculated conditional probabilities of every feature assuming that features are independent of each other given the income class. We binned the continuous features into 10 bins of equal size for calculating likelihood.

In [1155]:
def calculate_conditional(df, bin_dict, cat_dict, cat_cols, cont_cols, smooth=False, alpha=1):
    grouped = df.groupby(df.income)
    df1 = grouped.get_group(" <=50K")
    df2 = grouped.get_group(" >50K")
    cond1, cond2 = dict(), dict()
    if smooth == False:
        for col in cat_cols:
            cond1[col] = dict.fromkeys(cat_dict[col], 0)
            cond2[col] = dict.fromkeys(cat_dict[col], 0)
            cond1[col].update(df1[col].value_counts(normalize=True).to_dict())
            cond2[col].update(df2[col].value_counts(normalize=True).to_dict())
        for col in cont_cols:
            # cut = pd.cut(df1[col], bins=bin_dict[col], labels=bin_dict[col][:-1])
            # cond1[col] = cut.value_counts(normalize=True).to_dict()
            # cut = pd.cut(df2[col], bins=bin_dict[col], labels=bin_dict[col][:-1])
            # cond2[col] = cut.value_counts(normalize=True).to_dict()
            cond1[col] = df1[col].value_counts(normalize=True, bins=bin_dict[col]).to_dict()
            cond2[col] = df2[col].value_counts(normalize=True, bins=bin_dict[col]).to_dict()
    else:
        for col in cat_cols:
            k = len(cat_dict[col])          # k = (no. of categories)
            cond1[col] = dict.fromkeys(cat_dict[col], 0)
            cond2[col] = dict.fromkeys(cat_dict[col], 0)
            cond1[col].update(df1[col].value_counts().to_dict())
            cond2[col].update(df2[col].value_counts().to_dict())
            N = sum(cond1[col].values())
            cond1[col] = {cat:((cnt + alpha) / (N + k * alpha)) for cat, cnt in cond1[col].items()}
            N = sum(cond2[col].values())
            cond2[col] = {cat:((cnt + alpha) / (N + k * alpha)) for cat, cnt in cond2[col].items()}
        for col in cont_cols:
            k = len(bin_dict[col]) - 1      # k = (no. of bins)
            cond1[col] = df1[col].value_counts(bins=bin_dict[col]).to_dict()
            cond2[col] = df2[col].value_counts(bins=bin_dict[col]).to_dict()
            N = sum(cond1[col].values())
            cond1[col] = {cat:((cnt + alpha) / (N + k * alpha)) for cat, cnt in cond1[col].items()}
            N = sum(cond2[col].values())
            cond2[col] = {cat:((cnt + alpha) / (N + k * alpha)) for cat, cnt in cond2[col].items()}
    return cond1, cond2

In [1156]:
cond1, cond2 = calculate_conditional(train, bin_dict, cat_dict, cat_cols, cont_cols)

In [1157]:
# import json

# with open('cond1.json', 'w') as fp:
#     json.dump(cond1, fp)
# with open('cond2.json', 'w') as fp:
#     json.dump(cond2, fp)

In [1158]:
import pprint
# pprint.pprint(cond1, indent=4)
# pprint.pprint(cond2, indent=4)

### Predicting Class

We made predictions for the testing split using prior and likelihood that we had calculated earlier. We ignored the denominator since it is common to both income classes and directly compared the numerators to make our prediction.

In [1159]:
def predict_class(row, prior, cond1, cond2):
    num1 = prior[' <=50K']
    num2 = prior[' >50K']
    for col in cat_cols:
        num1 *= cond1[col][row[col]]
        num2 *= cond2[col][row[col]]
    for col in cont_cols:
        for interval, val in cond1[col].items():
            if row[col] in interval:
                num1 *= cond1[col][interval]
                num2 *= cond2[col][interval]
                break
    if num1 >= num2:
        return ' <=50K'
    return ' >50K'

In [1160]:
test['prediction'] = test.apply(lambda row: predict_class(row, prior, cond1, cond2), axis=1)
test

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,prediction
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K,>50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K,>50K
11,30,State-gov,141297,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K,>50K
14,40,Private,121772,Assoc-voc,11,Married-civ-spouse,Craft-repair,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>50K,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32541,41,Private,202822,HS-grad,9,Separated,Prof-specialty,Not-in-family,Black,Female,0,0,32,United-States,<=50K,<=50K
32544,31,Private,199655,Masters,14,Divorced,Other-service,Not-in-family,Other,Female,0,0,30,United-States,<=50K,<=50K
32545,39,Local-gov,111499,Assoc-acdm,12,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,20,United-States,>50K,<=50K
32550,43,Self-emp-not-inc,27242,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,50,United-States,<=50K,>50K


## Task 3

### Evaluation

In [1161]:
# res_df = pd.DataFrame(columns=['Random State', 'Alpha(Smoothing)', 'TP', 'FP', 'TN', 'FN', 'Accuracy', 'Precision', 'Recall', 'F1-Score'])
# res_df.to_csv("results.csv", index=None)
res_df = pd.read_csv("results.csv")
res_df

Unnamed: 0,Random State,Alpha(Smoothing),TP,FP,TN,FN,Accuracy,Precision,Recall,F1-Score
0,80.0,0.0,6873.0,577.0,1975.0,1320.0,0.693346,0.92255,0.838887,0.878732
1,80.0,1.0,6873.0,587.0,1965.0,1320.0,0.694276,0.921314,0.838887,0.87817
2,80.0,10.0,6918.0,630.0,1922.0,1275.0,0.702466,0.916534,0.844379,0.878978
3,80.0,50.0,7091.0,779.0,1773.0,1102.0,0.732434,0.901017,0.865495,0.882899
4,80.0,100.0,7294.0,943.0,1609.0,899.0,0.766589,0.885517,0.890272,0.887888
5,225.0,0.0,6786.0,588.0,2022.0,1349.0,0.686273,0.92026,0.834173,0.875105
6,225.0,1.0,6786.0,592.0,2018.0,1349.0,0.686645,0.919761,0.834173,0.874879
7,225.0,10.0,6830.0,616.0,1994.0,1305.0,0.692973,0.917271,0.839582,0.876709
8,225.0,50.0,7024.0,760.0,1850.0,1111.0,0.72443,0.902364,0.86343,0.882467
9,225.0,100.0,7227.0,951.0,1659.0,908.0,0.761098,0.883712,0.888384,0.886042


We evaluated our predictions by calculating the accuracy, precision, recall and F1-score. We calculated them using the following formulae:

$$ Accuracy = \frac{TP + TN}{TP + FP + TN + FN} $$
$$ Precision = \frac{TP}{TP + FP} $$
$$ Recall = \frac{TP}{TP + FN} $$
$$ F1-Score = \frac{2}{\frac{1}{Precision} + \frac{1}{Recall}} $$

In [1162]:
def evaluate(df, smooth=False, alpha=0):
    grouped = df.groupby(df.income)
    df1 = grouped.get_group(" <=50K")
    df2 = grouped.get_group(" >50K")
    if smooth:
        dict1 = df1['prediction_smoothing'].value_counts().to_dict()
        dict2 = df2['prediction_smoothing'].value_counts().to_dict()
    else:
        dict1 = df1['prediction'].value_counts().to_dict()
        dict2 = df2['prediction'].value_counts().to_dict()
    tp = dict1[" <=50K"]
    fp = dict2[" <=50K"]
    tn = dict2[" >50K"]
    fn = dict1[" >50K"]
    print(f"TP: {tp}\t FP: {fp}\t TN: {tn}\t FN: {fn}")
    acc = (tp + fp) / (tp + fp + tn + fn)
    prec = tp / (tp + fp)
    rec = tp / (tp + fn)
    f1 = 2 / (1 / prec + 1 / rec)
    print(f"Accuracy: {acc}")
    print(f"Precision: {prec}")
    print(f"Recall: {rec}")
    print(f"F1-Score: {f1}")
    entry = [random_state, alpha, tp, fp, tn, fn, acc, prec, rec, f1]
    res_df.loc[len(res_df)] = entry

In [1163]:
evaluate(test[['income', 'prediction']])

TP: 6833	 FP: 582	 TN: 1985	 FN: 1345
Accuracy: 0.690088413215449
Precision: 0.9215104517869184
Recall: 0.8355343604793348
F1-Score: 0.8764189059193227


### Smoothing

We applied Laplacian smoothing with $\alpha$ values as 1, 10, 50 and 100. We used the following formula for smoothing:

$$ P(\textnormal{education=`11th' | income=`> 50K'}) = \frac{\textnormal{No. of examples with education=`11th' and income=`> 50K'} + \alpha}{\textnormal{No. of examples with income=`> 50K'} + K \alpha} $$

where $K$ is the number of discrete values that education can take.

In [1164]:
cond1_smooth, cond2_smooth = calculate_conditional(train, bin_dict, cat_dict, cat_cols, cont_cols, smooth=True)

In [1165]:
# pprint.pprint(cond1_smooth, indent=4)
# pprint.pprint(cond2_smooth, indent=4)

In [1166]:
test['prediction_smoothing'] = test.apply(lambda row: predict_class(row, prior, cond1_smooth, cond2_smooth), axis=1)
test

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,prediction,prediction_smoothing
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,<=50K,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K,>50K,>50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K,>50K,>50K
11,30,State-gov,141297,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K,>50K,>50K
14,40,Private,121772,Assoc-voc,11,Married-civ-spouse,Craft-repair,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>50K,>50K,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32541,41,Private,202822,HS-grad,9,Separated,Prof-specialty,Not-in-family,Black,Female,0,0,32,United-States,<=50K,<=50K,<=50K
32544,31,Private,199655,Masters,14,Divorced,Other-service,Not-in-family,Other,Female,0,0,30,United-States,<=50K,<=50K,<=50K
32545,39,Local-gov,111499,Assoc-acdm,12,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,20,United-States,>50K,<=50K,<=50K
32550,43,Self-emp-not-inc,27242,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,50,United-States,<=50K,>50K,>50K


In [1167]:
for alpha in [1, 10, 50, 100]:
    print(f"alpha={alpha}")
    cond1_smooth, cond2_smooth = calculate_conditional(train, bin_dict, cat_dict, cat_cols, cont_cols, smooth=True, alpha=alpha)
    test['prediction_smoothing'] = test.apply(lambda row: predict_class(row, prior, cond1_smooth, cond2_smooth), axis=1)    
    evaluate(test[['income', 'prediction_smoothing']], smooth=True, alpha=alpha)
    print()

alpha=1
TP: 6835	 FP: 587	 TN: 1980	 FN: 1343
Accuracy: 0.6907398790134947
Precision: 0.9209108057127459
Recall: 0.8357789190511128
F1-Score: 0.8762820512820513

alpha=10
TP: 6853	 FP: 604	 TN: 1963	 FN: 1325
Accuracy: 0.6939972080037227
Precision: 0.9190022797371598
Recall: 0.8379799461971142
F1-Score: 0.8766229613047649

alpha=50
TP: 7039	 FP: 753	 TN: 1814	 FN: 1139
Accuracy: 0.7251744997673336
Precision: 0.9033624229979466
Recall: 0.8607238933724627
F1-Score: 0.8815278647463995

alpha=100
TP: 7231	 FP: 930	 TN: 1637	 FN: 947
Accuracy: 0.7595160539785947
Precision: 0.8860433770371278
Recall: 0.884201516263145
F1-Score: 0.8851214884631863



In [1168]:
res_df.to_csv("results.csv", index=None)
res_df

Unnamed: 0,Random State,Alpha(Smoothing),TP,FP,TN,FN,Accuracy,Precision,Recall,F1-Score
0,80.0,0.0,6873.0,577.0,1975.0,1320.0,0.693346,0.92255,0.838887,0.878732
1,80.0,1.0,6873.0,587.0,1965.0,1320.0,0.694276,0.921314,0.838887,0.87817
2,80.0,10.0,6918.0,630.0,1922.0,1275.0,0.702466,0.916534,0.844379,0.878978
3,80.0,50.0,7091.0,779.0,1773.0,1102.0,0.732434,0.901017,0.865495,0.882899
4,80.0,100.0,7294.0,943.0,1609.0,899.0,0.766589,0.885517,0.890272,0.887888
5,225.0,0.0,6786.0,588.0,2022.0,1349.0,0.686273,0.92026,0.834173,0.875105
6,225.0,1.0,6786.0,592.0,2018.0,1349.0,0.686645,0.919761,0.834173,0.874879
7,225.0,10.0,6830.0,616.0,1994.0,1305.0,0.692973,0.917271,0.839582,0.876709
8,225.0,50.0,7024.0,760.0,1850.0,1111.0,0.72443,0.902364,0.86343,0.882467
9,225.0,100.0,7227.0,951.0,1659.0,908.0,0.761098,0.883712,0.888384,0.886042


In [1169]:
res_df.groupby('Alpha(Smoothing)').mean()

Unnamed: 0_level_0,Random State,TP,FP,TN,FN,Accuracy,Precision,Recall,F1-Score
Alpha(Smoothing),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0.0,1181362.2,6821.0,587.8,1992.0,1344.2,0.689511,0.92066,0.835379,0.875944
1.0,1181362.2,6821.0,591.2,1988.6,1344.2,0.689828,0.920238,0.835378,0.875753
10.0,1181362.2,6853.1,621.8,1958.0,1312.1,0.695663,0.916818,0.83931,0.876347
50.0,1181362.2,7036.6,772.9,1806.9,1128.6,0.726803,0.901034,0.861786,0.880967
100.0,1181362.2,7243.1,948.4,1631.4,922.1,0.762355,0.884225,0.887078,0.885641
