In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.utils import plot_model
from keras.losses import categorical_crossentropy
from keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report

from google.colab import files

import warnings
# NOTE(review): no category/module filter is given, so this hides every
# warning for the rest of the session (deprecation notices included).
warnings.filterwarnings('ignore')

In [None]:
# Read the census CSV into a DataFrame and preview the first rows.
census_csv_path = '/content/census_data_What is Tensorflow.csv'
census = pd.read_csv(filepath_or_buffer=census_csv_path)
census.head()

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [None]:
# Inspect the distinct raw labels stored in the target column.
census['income_bracket'].unique()

array([' <=50K', ' >50K'], dtype=object)

In [None]:
# Map the raw income labels (the CSV values carry a leading space) to 0/1.
# replace() leaves already-mapped values untouched, so re-running this cell
# is a no-op. The explicit astype pins the target to an integer dtype instead
# of relying on replace() implicitly downcasting its object-dtype result
# (deprecated in pandas 2.2), and raises if any label was left unmapped.
income_label_map = {' <=50K': 0, ' >50K': 1}
census['income_bracket'] = (
    census['income_bracket'].replace(income_label_map).astype('int64')
)
census['income_bracket'].unique()

array([0, 1])

In [None]:
# Preview the frame again to confirm income_bracket now holds 0/1.
census.head()

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [None]:
# List the column names of the loaded frame.
census.columns

Index(['age', 'workclass', 'education', 'education_num', 'marital_status',
       'occupation', 'relationship', 'race', 'gender', 'capital_gain',
       'capital_loss', 'hours_per_week', 'native_country', 'income_bracket'],
      dtype='object')

In [None]:
# Summarise dtypes and non-null counts for every column.
census.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   education       32561 non-null  object
 3   education_num   32561 non-null  int64 
 4   marital_status  32561 non-null  object
 5   occupation      32561 non-null  object
 6   relationship    32561 non-null  object
 7   race            32561 non-null  object
 8   gender          32561 non-null  object
 9   capital_gain    32561 non-null  int64 
 10  capital_loss    32561 non-null  int64 
 11  hours_per_week  32561 non-null  int64 
 12  native_country  32561 non-null  object
 13  income_bracket  32561 non-null  int64 
dtypes: int64(6), object(8)
memory usage: 3.5+ MB


In [None]:
# Names of the 13 predictor columns (every column except income_bracket).
feat_cols = [
    'age',
    'workclass',
    'education',
    'education_num',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'gender',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
    'native_country',
]

In [None]:
# Integer-encode every object-dtype column in place, keeping each fitted
# encoder (keyed by column name) so the codes can be mapped back later.
label_encoders = {}
object_columns = census.select_dtypes(include=['object']).columns
for col_name in object_columns:
    encoder = LabelEncoder()
    label_encoders[col_name] = encoder
    census[col_name] = encoder.fit_transform(census[col_name])

In [None]:
# Preview the frame after the object columns were label-encoded.
census.head()

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,7,9,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,9,13,2,4,0,4,1,0,0,13,39,0
2,38,4,11,9,0,6,1,4,1,0,0,40,39,0
3,53,4,1,7,2,6,0,2,1,0,0,40,39,0
4,28,4,9,13,2,10,5,2,0,0,0,40,5,0


In [None]:
# Separate the target column from the predictors.
target_column = 'income_bracket'
y = census[target_column]
X = census.drop(columns=[target_column])

In [None]:
# Keep independent copies of the full feature matrix and target.
X_copy, y_copy = X.copy(), y.copy()

In [None]:
# Shapes of the full-dataset copies of the features and the target.
X_copy.shape, y_copy.shape

((32561, 13), (32561,))

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# partition repeatable between runs.
split_parts = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Check the shapes of the train and test partitions.
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((22792, 13), (9769, 13), (22792,), (9769,))

In [None]:
# Standardize every feature column (the label-encoded categoricals too).
# The scaler is fitted on the training split only and then reused unchanged
# for the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Model creation using Keras

In [None]:
# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# batch shuffling are repeatable across Restart & Run All.
tf.keras.utils.set_random_seed(101)

# Define the model: a small fully connected binary classifier whose single
# sigmoid unit outputs the probability of the positive class (label 1).
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# epochs=2000 is only an upper bound: stop as soon as the validation loss has
# not improved for 10 consecutive epochs and roll back to the best weights,
# instead of training (and overfitting) for the full 2000 epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

# Train the model; 20% of the training split is held back for validation
model.fit(X_train_scaled, y_train, epochs=2000, batch_size=128,
          validation_split=0.2, callbacks=[early_stopping])

# Evaluate the model on the held-out test split
test_loss, test_acc = model.evaluate(X_test_scaled, y_test)
print(f'Test accuracy: {test_acc}')


Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
Epoch 63/2000
Epoch 64/2000
Epoch 65/2000
Epoch 66/2000
Epoch 67/2000
Epoch 68/2000
Epoch 69/2000
Epoch 70/2000
Epoch 71/2000
Epoch 72/2000
E

## Prediction


In [None]:
# Predict on the held-out test split and threshold the sigmoid outputs at
# 0.5 to obtain hard 0/1 labels.
predictions = model.predict(X_test_scaled)
is_positive = predictions > 0.5
predicted_labels = is_positive.astype(int)

# Per-class precision / recall / F1 on the test split
report = classification_report(y_true=y_test, y_pred=predicted_labels)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.92      0.90      7436
           1       0.70      0.63      0.67      2333

    accuracy                           0.85      9769
   macro avg       0.80      0.78      0.78      9769
weighted avg       0.84      0.85      0.85      9769



In [None]:
# Scale the full feature set with the scaler that was fitted on the training split.
X_copy_scaled = scaler.transform(X_copy)

In [None]:
# Predict on the ENTIRE dataset (training rows included), not just the test
# split, so the metrics below are optimistic compared with the test-only
# report.
pred = model.predict(X_copy_scaled)
pred_labels = (pred > 0.5).flatten().astype(int)
pred_df = pd.Series(pred_labels, name='Prediction').to_frame()

# Per-class precision / recall / F1 over all rows
Xreport = classification_report(y_true=y_copy, y_pred=pred_labels)
print("Classification Report:\n", Xreport)

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.93      0.92     24720
           1       0.77      0.68      0.72      7841

    accuracy                           0.87     32561
   macro avg       0.84      0.81      0.82     32561
weighted avg       0.87      0.87      0.87     32561



In [None]:
# Attach the predicted label to each census row by matching row index,
# then show a random sample of 10 rows.
merged_df = census.merge(pred_df, left_index=True, right_index=True)
merged_df.sample(n=10)

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket,Prediction
15042,30,4,11,9,4,1,4,4,1,0,0,40,39,0,0
24519,37,4,8,11,0,10,1,4,0,0,0,40,39,0,0
20717,24,2,9,13,4,10,1,4,1,0,0,35,39,0,0
20974,25,4,11,9,4,7,3,4,1,0,0,40,39,0,0
5863,64,4,12,14,4,10,1,4,0,0,0,35,39,0,0
18269,27,4,15,10,4,1,1,4,0,0,0,15,39,0,0
31002,19,4,15,10,4,12,3,1,1,0,0,35,25,0,0
28363,52,5,15,10,2,12,0,4,1,0,0,90,39,1,1
31752,37,4,15,10,2,12,5,4,0,0,0,40,39,1,1
8992,24,4,15,10,4,1,3,1,1,0,0,20,39,0,0


In [None]:
# Class frequencies of the true labels and of the model's predictions
true_counts = merged_df['income_bracket'].value_counts()
predicted_counts = merged_df['Prediction'].value_counts()

# Signed gap per class (true minus predicted), computed once and reused
count_gap = true_counts - predicted_counts

# Side-by-side table: counts, absolute gap, and gap as a percentage of all
# predicted rows
comparison_df = pd.DataFrame({
    'True Income Bracket': true_counts,
    'Predicted Label': predicted_counts,
})
comparison_df['difference'] = count_gap
comparison_df['diff_percentage'] = (count_gap / len(pred_df) * 100).round(2)

# Print comparison DataFrame
print(comparison_df)


   True Income Bracket  Predicted Label  difference  diff_percentage
0                24720            25579        -859            -2.64
1                 7841             6982         859             2.64


## Saving the Model

In [None]:
# Write the trained model to census.h5 in the current working directory.
model.save("census.h5")

In [None]:
# Load the model
loaded_model = tf.keras.models.load_model("census.h5")

# Recompile with the same binary loss the model was trained with. The network
# ends in a single sigmoid unit, so categorical cross-entropy is the wrong
# loss here: it normalises each 1-wide prediction to 1.0 and therefore always
# reports a meaningless loss of 0.0.
loaded_model.compile(optimizer='adam',
                     loss='binary_crossentropy',
                     metrics=['accuracy'])

# Evaluate the reloaded model on the full (scaled) dataset; returns
# [loss, accuracy]
loaded_model.evaluate(X_copy_scaled , y_copy)



[0.0, 0.8739289045333862]

In [None]:
# Download the saved model file via the google.colab files helper.
files.download("census.h5")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>