In [None]:
# !pip install sklearn xgboost

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings. filterwarnings('ignore')

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'class']
df = pd.read_csv('/content/drive/MyDrive/Machine Learning Course/Dataset.data', sep=" ", header=None, names=columns)

downloaded from [UCI](https://www.cs.toronto.edu/~delve/data/adult/desc.html)

In [3]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


Used some code snipets from [kaggle](https://www.kaggle.com/code/amirhosseinzinati/adult-income-k-nearest-neighbors-knn)

## Preprocess

In [4]:
def convert_marital_status(status):
    if status in ['Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse']:
        return 'married'
    elif status in ['Never-married', 'Separated', 'Widowed']:
        return 'single'
    else:
        return 'divorced'

df['marital-status'] = df['marital-status'].apply(convert_marital_status)

df['native-country'] = df['native-country'].replace('Outlying-US(Guam-USVI-etc)' , 'US Minor Islands')

df = df.drop(['capital-gain', 'capital-loss', 'fnlwgt'], axis=1) # kept this because I don't want to ask the app user about these as most people don't know them

income_mapping = {'<=50K': 0, '>50K': 1}
df['class'] = df['class'].map(income_mapping)

In [5]:
def fill_missing_categorical(df, column):
    df[column] = df[column].replace('?', np.nan)

    if df[column].notna().all():
        return df

    known = df[df[column].notna()]
    unknown = df[df[column].isna()]

    le = LabelEncoder()
    known[column] = le.fit_transform(known[column])
    X_known = known.drop(column, axis=1)
    y_known = known[column]

    categorical_cols = X_known.select_dtypes(include=['object']).columns

    le_cat = LabelEncoder()
    X_known[categorical_cols] = X_known[categorical_cols].apply(lambda col: le_cat.fit_transform(col.astype(str)))

    # the writer used a classifier to fill/predict the empty parts
    clf = RandomForestClassifier()
    clf.fit(X_known, y_known)

    X_unknown = unknown.drop(column, axis=1)

    X_unknown[categorical_cols] = X_unknown[categorical_cols].apply(lambda col: le_cat.fit_transform(col.astype(str)))

    unknown[column] = clf.predict(X_unknown)

    df = pd.concat([known, unknown], axis=0)

    df[column] = le.inverse_transform(df[column])

    return df

In [6]:
df = fill_missing_categorical(df, 'native-country')
df = fill_missing_categorical(df, 'occupation')
df = fill_missing_categorical(df, 'workclass')

In [7]:
df.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,class
0,25,Private,11th,7,single,Machine-op-inspct,Own-child,Black,Male,40,United-States,0
1,38,Private,HS-grad,9,married,Farming-fishing,Husband,White,Male,50,United-States,0
2,28,Local-gov,Assoc-acdm,12,married,Protective-serv,Husband,White,Male,40,United-States,1
3,44,Private,Some-college,10,married,Machine-op-inspct,Husband,Black,Male,40,United-States,1
5,34,Private,10th,6,single,Other-service,Not-in-family,White,Male,30,United-States,0


In [8]:
# we drop the "education-num" and use "education" with an encoder instead
df = df.drop('education-num', axis=1)

In [9]:
for column in df.columns:
    unique_values = df[column].unique()
    print(f"Column '{column}': {unique_values}")

Column 'age': [25 38 28 44 34 63 24 55 65 36 26 48 43 20 37 45 22 23 54 32 46 56 17 29
 39 52 18 21 42 33 30 47 41 19 69 50 31 59 49 58 40 27 57 61 51 73 53 80
 62 35 72 64 68 66 60 67 71 70 90 77 81 74 78 82 75 85 76 89 83 79 88 87
 84 86]
Column 'workclass': ['Private' 'Local-gov' 'Self-emp-not-inc' 'Federal-gov' 'State-gov'
 'Self-emp-inc' 'Without-pay' 'Never-worked']
Column 'education': ['11th' 'HS-grad' 'Assoc-acdm' 'Some-college' '10th' 'Prof-school'
 '7th-8th' 'Bachelors' 'Masters' '5th-6th' 'Assoc-voc' '9th' 'Doctorate'
 '12th' '1st-4th' 'Preschool']
Column 'marital-status': ['single' 'married' 'divorced']
Column 'occupation': ['Machine-op-inspct' 'Farming-fishing' 'Protective-serv' 'Other-service'
 'Prof-specialty' 'Craft-repair' 'Adm-clerical' 'Exec-managerial'
 'Tech-support' 'Sales' 'Priv-house-serv' 'Transport-moving'
 'Handlers-cleaners' 'Armed-Forces']
Column 'relationship': ['Own-child' 'Husband' 'Not-in-family' 'Unmarried' 'Wife' 'Other-relative']
Column 'race': ['Bla

## Fit the Model

In [10]:
X = df.drop('class', axis=1)
y = df['class']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.10, random_state=1234)

In [12]:
categorical_cols = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
noncategorical_cols = ['age', 'hours-per-week']

In [13]:
# Create and fit encoder
encoder = OneHotEncoder(sparse=False)
X_train_encoded = np.hstack((X_train[noncategorical_cols].values, encoder.fit_transform(X_train[categorical_cols])))
X_test_encoded = np.hstack((X_test[noncategorical_cols].values, encoder.transform(X_test[categorical_cols])))

# Create and fit scaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

In [15]:
print(X_train_scaled.shape)
print(X_test_scaled.shape)

(43957, 97)
(4885, 97)


In [16]:
model = XGBClassifier()
model.fit(X_train_scaled, y_train)

In [17]:
y_hat = model.predict(X_test_scaled)

In [18]:
y_hat.shape

(4885,)

In [19]:
y_test

Unnamed: 0,class
42007,0
42730,0
9600,1
18825,0
31961,0
...,...
8995,0
12046,0
47036,0
21806,0


In [20]:
precision_score(y_test, y_hat)

0.7106382978723405

## Export model, encoder and scaler

In [21]:
import joblib

In [22]:
joblib.dump((model, scaler, encoder), '/content/drive/MyDrive/Machine Learning Course/model_scaler_encoder.joblib')

['/content/drive/MyDrive/Practical Machine Learning Course/model_scaler_encoder.joblib']

## Test Import

In [23]:
model, scaler, encoder = joblib.load('/content/drive/MyDrive/Machine Learning Course/model_scaler_encoder.joblib')

In [24]:
X_test.columns

Index(['age', 'workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'hours-per-week', 'native-country'],
      dtype='object')

In [25]:
categorical_cols = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
noncategorical_cols = ['age', 'hours-per-week']

In [30]:
X_test_encoded = encoder.transform(X_test[categorical_cols])
X_test_scaled = scaler.transform(np.hstack((X_test[noncategorical_cols], X_test_encoded)))

In [31]:
yhat = model.predict(X_test_scaled)
yhat

array([0, 0, 1, ..., 0, 0, 0])

In [32]:
yhat.shape

(4885,)

In [33]:
!pip freeze

absl-py==1.4.0
accelerate==0.32.1
aiohappyeyeballs==2.4.0
aiohttp==3.10.5
aiosignal==1.3.1
alabaster==0.7.16
albucore==0.0.13
albumentations==1.4.14
altair==4.2.2
annotated-types==0.7.0
anyio==3.7.1
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
array_record==0.5.1
arviz==0.18.0
asn1crypto==1.5.1
astropy==6.1.2
astropy-iers-data==0.2024.8.19.0.32.16
astunparse==1.6.3
async-timeout==4.0.3
atpublic==4.1.0
attrs==24.2.0
audioread==3.0.1
autograd==1.6.2
babel==2.16.0
backcall==0.2.0
beautifulsoup4==4.12.3
bidict==0.23.1
bigframes==1.15.0
bleach==6.1.0
blinker==1.4
blis==0.7.11
blosc2==2.0.0
bokeh==3.4.3
bqplot==0.12.43
branca==0.7.2
build==1.2.1
CacheControl==0.14.0
cachetools==5.5.0
catalogue==2.0.10
certifi==2024.7.4
cffi==1.17.0
chardet==5.2.0
charset-normalizer==3.3.2
chex==0.1.86
clarabel==0.9.0
click==8.1.7
click-plugins==1.1.1
cligj==0.7.2
cloudpathlib==0.18.1
cloudpickle==2.2.1
cmake==3.30.2
cmdstanpy==1.2.4
colorcet==3.1.0
colorlover==0.3.0
colour==0.1.5
community==1.0.0b1
confe