In [96]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
import csv
import pandas as pd
import json

# Load dataset
data = pd.read_csv('nb_filled.csv')

In [97]:
all_cols = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]
cont_cols = ["age", "fnlwgt", "education-num", "capital-gain", "capital-loss", "hours-per-week"]
cat_cols = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]

In [98]:
bin_dict = {}
for col in cont_cols:
    result, bins = pd.cut(data[col], bins=10, retbins=True)
    bin_dict[col] = bins
bin_dict

{'age': array([16.927, 24.3  , 31.6  , 38.9  , 46.2  , 53.5  , 60.8  , 68.1  ,
        75.4  , 82.7  , 90.   ]),
 'fnlwgt': array([  10812.58,  159527.  ,  306769.  ,  454011.  ,  601253.  ,
         748495.  ,  895737.  , 1042979.  , 1190221.  , 1337463.  ,
        1484705.  ]),
 'education-num': array([ 0.985,  2.5  ,  4.   ,  5.5  ,  7.   ,  8.5  , 10.   , 11.5  ,
        13.   , 14.5  , 16.   ]),
 'capital-gain': array([  -99.999,  9999.9  , 19999.8  , 29999.7  , 39999.6  , 49999.5  ,
        59999.4  , 69999.3  , 79999.2  , 89999.1  , 99999.   ]),
 'capital-loss': array([  -4.356,  435.6  ,  871.2  , 1306.8  , 1742.4  , 2178.   ,
        2613.6  , 3049.2  , 3484.8  , 3920.4  , 4356.   ]),
 'hours-per-week': array([ 0.902, 10.8  , 20.6  , 30.4  , 40.2  , 50.   , 59.8  , 69.6  ,
        79.4  , 89.2  , 99.   ])}

In [99]:
cat_dict = {}
for col in cat_cols:
    cat_dict[col] = data[col].unique()
cat_dict

{'workclass': array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
        ' Local-gov', ' Self-emp-inc', ' Without-pay', ' Never-worked'],
       dtype=object),
 'education': array([' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',
        ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th',
        ' Doctorate', ' Prof-school', ' 5th-6th', ' 10th', ' 1st-4th',
        ' Preschool', ' 12th'], dtype=object),
 'marital-status': array([' Never-married', ' Married-civ-spouse', ' Divorced',
        ' Married-spouse-absent', ' Separated', ' Married-AF-spouse',
        ' Widowed'], dtype=object),
 'occupation': array([' Adm-clerical', ' Exec-managerial', ' Handlers-cleaners',
        ' Prof-specialty', ' Other-service', ' Sales', ' Craft-repair',
        ' Transport-moving', ' Farming-fishing', ' Machine-op-inspct',
        ' Tech-support', ' Protective-serv', ' Armed-Forces',
        ' Priv-house-serv'], dtype=object),
 'relationship': array([' Not-in-family', ' H

In [100]:
grouped = data.groupby(data.income)
df1 = grouped.get_group(" <=50K")
df2 = grouped.get_group(" >50K")
cond1, cond2 = dict(), dict()

In [101]:
with open('cond1.json') as f:
    cond1 = json.load(f)

In [102]:
with open('cond2.json') as f:
    cond2 = json.load(f)

In [103]:
for col in cat_cols:
    df1[col] = df1[col].replace(cond1[col])
    df2[col] = df2[col].replace(cond2[col])

for col in cont_cols:
    df1[col] = df1[col].replace(cond1[col])
    df2[col] = df2[col].replace(cond2[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1[col] = df1[col].replace(cond1[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].replace(cond2[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1[col] = df1[col].replace(cond1[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row

In [104]:
combined_df = pd.concat([df1, df2], axis=0)

In [105]:
combined_df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,0.036565,77516,0.127403,13,0.413134,0.132618,0.302711,0.839670,0.607786,2174,0,40,0.906494,<=50K
1,50,0.074162,83311,0.127403,13,0.331817,0.087320,0.291128,0.839670,0.607786,0,0,13,0.906494,<=50K
2,38,0.786247,215646,0.358438,9,0.163362,0.052695,0.302711,0.839670,0.607786,0,0,40,0.906494,<=50K
3,53,0.786247,234721,0.045843,7,0.331817,0.052695,0.291128,0.110424,0.607786,0,0,40,0.906494,<=50K
4,28,0.786247,338409,0.127403,13,0.331817,0.160269,0.033352,0.110424,0.392214,0,0,40,0.002668,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32539,71,0.656526,287372,0.037746,16,0.852582,0.256526,0.751362,0.905915,0.845258,0,0,10,0.933897,>50K
32545,39,0.077371,111499,0.032864,12,0.852582,0.065540,0.097653,0.905915,0.154742,0,0,20,0.933897,>50K
32554,53,0.656526,321865,0.119812,14,0.852582,0.253709,0.751362,0.905915,0.845258,0,0,40,0.933897,>50K
32557,40,0.656526,154374,0.212770,9,0.852582,0.031925,0.751362,0.905915,0.845258,0,0,40,0.933897,>50K


In [106]:
def LREval(rs):
    # Separate features and target variable
    X = combined_df.drop('income', axis=1)
    y = combined_df['income']

    # train , test = create_split(combined_df,rs)
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rs,test_size=0.33)

    # Initialize logistic regression model
    model = LogisticRegression()

    # Fit the model to the training data
    model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Generate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    return accuracy,cm

In [107]:
res = [['Random State','TN','FP','FN','TP','Accuracy','Precision','Recall','F1-Score']]

In [108]:
for random_state in [80, 225, 2067, 4133, 6741, 9931, 58146, 79327, 3114652, 8538320]:
    curr_res = []
    acc , confmat = LREval(random_state)
    print(f"Accuracy: {acc}")
    print(f"TP: {confmat[0][0]}\t FN: {confmat[0][1]}\t FP: {confmat[1][0]}\t TN: {confmat[1][1]}")
    acc = (confmat[1][1] + confmat[0][0]) / (confmat[1][1] + confmat[0][1] + confmat[0][0] + confmat[1][0])
    prec = confmat[0][0] / (confmat[1][0] + confmat[0][0])
    rec = confmat[0][0] / (confmat[0][1] + confmat[0][0])
    f1 = 2 / (1 / prec + 1 / rec)
    print(f"Accuracy: {acc}")
    print(f"Precision: {prec}")
    print(f"Recall: {rec}")
    print(f"F1-Score: {f1}")
    curr_res.append(random_state)
    curr_res.append(confmat[0][0])
    curr_res.append(confmat[0][1])
    curr_res.append(confmat[1][0])
    curr_res.append(confmat[1][1])
    curr_res.append(acc)
    curr_res.append(prec)
    curr_res.append(rec)
    curr_res.append(f1)
    res.append(curr_res)
    print("\n\n")
    

Accuracy: 0.7950865438302624
TP: 7874	 FN: 274	 FP: 1928	 TN: 670
Accuracy: 0.7950865438302624
Precision: 0.8033054478677821
Recall: 0.966372115856652
F1-Score: 0.877325905292479



Accuracy: 0.7967615857063093
TP: 7870	 FN: 260	 FP: 1924	 TN: 692
Accuracy: 0.7967615857063093
Precision: 0.8035531958341842
Recall: 0.968019680196802
F1-Score: 0.8781521981700513



Accuracy: 0.798250511818351
TP: 7935	 FN: 264	 FP: 1904	 TN: 643
Accuracy: 0.798250511818351
Precision: 0.8064843988210184
Recall: 0.9678009513355287
F1-Score: 0.8798092914957313



Accuracy: 0.8025311743904708
TP: 7922	 FN: 298	 FP: 1824	 TN: 702
Accuracy: 0.8025311743904708
Precision: 0.8128462959162733
Recall: 0.9637469586374696
F1-Score: 0.8818880106868531



Accuracy: 0.7995533221663875
TP: 7890	 FN: 266	 FP: 1888	 TN: 702
Accuracy: 0.7995533221663875
Precision: 0.8069134792391082
Recall: 0.9673859735164296
F1-Score: 0.8798929407828705



Accuracy: 0.7980643960543458
TP: 7898	 FN: 252	 FP: 1918	 TN: 678
Accuracy: 0.7980643

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.8793039270426205
TP: 7779	 FN: 379	 FP: 918	 TN: 1670
Accuracy: 0.8793039270426205
Precision: 0.8944463608140738
Recall: 0.953542534935033
F1-Score: 0.9230495401957876



Accuracy: 0.7972268751163224
TP: 7919	 FN: 282	 FP: 1897	 TN: 648
Accuracy: 0.7972268751163224
Precision: 0.8067440912795436
Recall: 0.9656139495183514
F1-Score: 0.879058666814675



Accuracy: 0.7943420807742416
TP: 7821	 FN: 267	 FP: 1943	 TN: 715
Accuracy: 0.7943420807742416
Precision: 0.8010036870135191
Recall: 0.9669881305637982
F1-Score: 0.8762043468518934



Accuracy: 0.7958310068862833
TP: 7858	 FN: 264	 FP: 1930	 TN: 694
Accuracy: 0.7958310068862833
Precision: 0.8028197793216183
Recall: 0.9674956907165723
F1-Score: 0.8774986041317698





In [109]:
# open a CSV file for writing
with open('LRResults.csv', 'w', newline='') as file:
    # create a CSV writer object
    writer = csv.writer(file)
    
    # write the data to the file
    for row in res:
        writer.writerow(row)