In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
import csv
import pandas as pd
import json

# Load dataset
data = pd.read_csv('nb_filled.csv')

In [2]:
all_cols = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]
cont_cols = ["age", "fnlwgt", "education-num", "capital-gain", "capital-loss", "hours-per-week"]
cat_cols = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]

In [3]:
bin_dict = {}
for col in cont_cols:
    result, bins = pd.cut(data[col], bins=10, retbins=True)
    bin_dict[col] = bins
bin_dict

{'age': array([16.927, 24.3  , 31.6  , 38.9  , 46.2  , 53.5  , 60.8  , 68.1  ,
        75.4  , 82.7  , 90.   ]),
 'fnlwgt': array([  10812.58,  159527.  ,  306769.  ,  454011.  ,  601253.  ,
         748495.  ,  895737.  , 1042979.  , 1190221.  , 1337463.  ,
        1484705.  ]),
 'education-num': array([ 0.985,  2.5  ,  4.   ,  5.5  ,  7.   ,  8.5  , 10.   , 11.5  ,
        13.   , 14.5  , 16.   ]),
 'capital-gain': array([  -99.999,  9999.9  , 19999.8  , 29999.7  , 39999.6  , 49999.5  ,
        59999.4  , 69999.3  , 79999.2  , 89999.1  , 99999.   ]),
 'capital-loss': array([  -4.356,  435.6  ,  871.2  , 1306.8  , 1742.4  , 2178.   ,
        2613.6  , 3049.2  , 3484.8  , 3920.4  , 4356.   ]),
 'hours-per-week': array([ 0.902, 10.8  , 20.6  , 30.4  , 40.2  , 50.   , 59.8  , 69.6  ,
        79.4  , 89.2  , 99.   ])}

In [4]:
cat_dict = {}
for col in cat_cols:
    cat_dict[col] = data[col].unique()
cat_dict

{'workclass': array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
        ' Local-gov', ' Self-emp-inc', ' Without-pay', ' Never-worked'],
       dtype=object),
 'education': array([' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',
        ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th',
        ' Doctorate', ' Prof-school', ' 5th-6th', ' 10th', ' 1st-4th',
        ' Preschool', ' 12th'], dtype=object),
 'marital-status': array([' Never-married', ' Married-civ-spouse', ' Divorced',
        ' Married-spouse-absent', ' Separated', ' Married-AF-spouse',
        ' Widowed'], dtype=object),
 'occupation': array([' Adm-clerical', ' Exec-managerial', ' Handlers-cleaners',
        ' Prof-specialty', ' Other-service', ' Sales', ' Craft-repair',
        ' Transport-moving', ' Farming-fishing', ' Machine-op-inspct',
        ' Tech-support', ' Protective-serv', ' Armed-Forces',
        ' Priv-house-serv'], dtype=object),
 'relationship': array([' Not-in-family', ' H

In [5]:
grouped = data.groupby(data.income)
df1 = grouped.get_group(" <=50K")
df2 = grouped.get_group(" >50K")
cond1, cond2 = dict(), dict()

In [6]:
with open('cond1.json') as f:
    cond1 = json.load(f)

In [7]:
with open('cond2.json') as f:
    cond2 = json.load(f)

In [8]:
for col in cat_cols:
    df1[col] = df1[col].replace(cond1[col])
    df2[col] = df2[col].replace(cond2[col])

for col in cont_cols:
    df1[col] = df1[col].replace(cond1[col])
    df2[col] = df2[col].replace(cond2[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1[col] = df1[col].replace(cond1[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].replace(cond2[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1[col] = df1[col].replace(cond1[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row

In [9]:
combined_df = pd.concat([df1, df2], axis=0)

In [10]:
combined_df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,0.036565,77516,0.127403,13,0.413134,0.132618,0.302711,0.839670,0.607786,2174,0,40,0.906494,<=50K
1,50,0.074162,83311,0.127403,13,0.331817,0.087320,0.291128,0.839670,0.607786,0,0,13,0.906494,<=50K
2,38,0.786247,215646,0.358438,9,0.163362,0.052695,0.302711,0.839670,0.607786,0,0,40,0.906494,<=50K
3,53,0.786247,234721,0.045843,7,0.331817,0.052695,0.291128,0.110424,0.607786,0,0,40,0.906494,<=50K
4,28,0.786247,338409,0.127403,13,0.331817,0.160269,0.033352,0.110424,0.392214,0,0,40,0.002668,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32539,71,0.656526,287372,0.037746,16,0.852582,0.256526,0.751362,0.905915,0.845258,0,0,10,0.933897,>50K
32545,39,0.077371,111499,0.032864,12,0.852582,0.065540,0.097653,0.905915,0.154742,0,0,20,0.933897,>50K
32554,53,0.656526,321865,0.119812,14,0.852582,0.253709,0.751362,0.905915,0.845258,0,0,40,0.933897,>50K
32557,40,0.656526,154374,0.212770,9,0.852582,0.031925,0.751362,0.905915,0.845258,0,0,40,0.933897,>50K


In [11]:
def create_split(df, rs):
    training_split = df.sample(frac = 0.67, random_state=rs)
    testing_split = df.drop(training_split.index)

    return (training_split, testing_split)

In [12]:
def LREval(rs):
    # Separate features and target variable
    X = combined_df.drop('income', axis=1)
    y = combined_df['income']

    train , test = create_split(combined_df,rs)
    # Split data into training and testing sets
    # X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rs,test_size=0.33)

    X_train = train.drop('income', axis=1)
    y_train = train['income']

    X_test = test.drop('income', axis=1)
    y_test = test['income']

    # Initialize logistic regression model
    model = LogisticRegression()

    # Fit the model to the training data
    model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Generate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    return accuracy,cm

In [13]:
res = [['Random State','TN','FP','FN','TP','Accuracy','Precision','Recall','F1-Score']]

In [14]:
for random_state in [80, 225, 2067, 4133, 6741, 9931, 58146, 79327, 3114652, 8538320]:
    curr_res = []
    acc , confmat = LREval(random_state)
    print(f"Accuracy: {acc}")
    print(f"TP: {confmat[0][0]}\t FN: {confmat[0][1]}\t FP: {confmat[1][0]}\t TN: {confmat[1][1]}")
    acc = (confmat[1][1] + confmat[0][0]) / (confmat[1][1] + confmat[0][1] + confmat[0][0] + confmat[1][0])
    prec = confmat[0][0] / (confmat[1][0] + confmat[0][0])
    rec = confmat[0][0] / (confmat[0][1] + confmat[0][0])
    f1 = 2 / (1 / prec + 1 / rec)
    print(f"Accuracy: {acc}")
    print(f"Precision: {prec}")
    print(f"Recall: {rec}")
    print(f"F1-Score: {f1}")
    curr_res.append(random_state)
    curr_res.append(confmat[0][0])
    curr_res.append(confmat[0][1])
    curr_res.append(confmat[1][0])
    curr_res.append(confmat[1][1])
    curr_res.append(acc)
    curr_res.append(prec)
    curr_res.append(rec)
    curr_res.append(f1)
    res.append(curr_res)
    print("\n\n")
    

Accuracy: 0.8006514657980456
TP: 7896	 FN: 281	 FP: 1861	 TN: 707
Accuracy: 0.8006514657980456
Precision: 0.8092651429742749
Recall: 0.9656353185764951
F1-Score: 0.8805620608899298



Accuracy: 0.8003722661703118
TP: 7941	 FN: 259	 FP: 1886	 TN: 659
Accuracy: 0.8003722661703118
Precision: 0.8080797801974153
Recall: 0.9684146341463414
F1-Score: 0.8810118156099185



Accuracy: 0.7960912052117264
TP: 7843	 FN: 276	 FP: 1915	 TN: 711
Accuracy: 0.7960912052117264
Precision: 0.803750768600123
Recall: 0.9660056657223796
F1-Score: 0.8774402864015214



Accuracy: 0.7921824104234527
TP: 7836	 FN: 245	 FP: 1988	 TN: 676
Accuracy: 0.7921824104234527
Precision: 0.7976384364820847
Recall: 0.9696819700532112
F1-Score: 0.8752862328958391



Accuracy: 0.795067473243369
TP: 7870	 FN: 281	 FP: 1921	 TN: 673
Accuracy: 0.795067473243369
Precision: 0.8037994076192422
Recall: 0.9655257023678077
F1-Score: 0.8772712072232751



Accuracy: 0.7942298743601676
TP: 7843	 FN: 293	 FP: 1918	 TN: 691
Accuracy: 0.79422

In [15]:
# open a CSV file for writing
with open('LRResults.csv', 'w', newline='') as file:
    # create a CSV writer object
    writer = csv.writer(file)
    
    # write the data to the file
    for row in res:
        writer.writerow(row)