# <center> Adult Dataset-ML Part </center>

### INDEX
<ul>
 <li><a href="#ML">Machine Learning </a></li>
</ul>

<a id="ML"> </a>
## Machine Learning

In [1]:
# importing needed libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import confusion_matrix 
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [2]:
df=pd.read_csv('../data/processed/Adult_Salries_processed.csv')
df

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Asian-Pac-Islander,race_Black,race_Other,race_White,gender_Male,native-country_United-States,income_>50K
0,25,226802,7,0,0,40,0,1,0,0,...,1,0,0,0,1,0,0,1,1,0
1,38,89814,9,0,0,50,0,1,0,0,...,0,0,0,0,0,0,1,1,1,0
2,28,336951,12,0,0,40,1,0,0,0,...,0,0,0,0,0,0,1,1,1,1
3,44,160323,10,7688,0,40,0,1,0,0,...,0,0,0,0,1,0,0,1,1,1
4,34,198693,6,0,0,30,0,1,0,0,...,0,0,0,0,0,0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44569,27,257302,12,0,0,38,0,1,0,0,...,0,0,1,0,0,0,1,0,1,0
44570,40,154374,9,0,0,40,0,1,0,0,...,0,0,0,0,0,0,1,1,1,1
44571,58,151910,9,0,0,40,0,1,0,0,...,0,1,0,0,0,0,1,0,1,0
44572,22,201490,9,0,0,20,0,1,0,0,...,1,0,0,0,0,0,1,1,1,0


In [3]:
x=df.drop('income_>50K',axis=1)
y=df['income_>50K']

In [4]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

In [5]:
y_train.value_counts()

0    26794
1     8865
Name: income_>50K, dtype: int64

### so we should apply oversampling to improve model learning

In [6]:
# Perform oversampling

smote = SMOTE()
x_over_resampled, y_over_resampled = smote.fit_resample(x, y)

In [7]:
x_train_over_samp,x_test_over_samp,y_train_over_samp,y_test_over_samp=train_test_split(x_over_resampled,y_over_resampled,test_size=0.2)

In [8]:
y_train_over_samp.value_counts()

0    26805
1    26798
Name: income_>50K, dtype: int64

### Applying OverSampling

In [9]:
scaler=StandardScaler()

In [10]:
x_train_over_samp=scaler.fit_transform(x_train_over_samp)
x_test_over_samp=scaler.transform(x_test_over_samp)

In [11]:
models={
    'lr':LogisticRegression(),
    'KNN':KNeighborsClassifier(n_neighbors=3),
    'NB':GaussianNB(),
    'dt':DecisionTreeClassifier(),
    'RF':RandomForestClassifier(n_estimators=13),
    'xgb':XGBClassifier(n_estimators=10)
}

In [12]:
#over_sampling
for name,model in models.items():
    print(name)
    model.fit(x_train_over_samp,y_train_over_samp)
    y_pred=model.predict(x_train_over_samp)
    print('train_conf',confusion_matrix(y_train_over_samp,y_pred))
    print('train_conf',confusion_matrix(y_test_over_samp,model.predict(x_test_over_samp)))
    print('acc_train',accuracy_score(y_train_over_samp,y_pred))
    print('acc_test',accuracy_score(y_test_over_samp,model.predict(x_test_over_samp)))
    print('-'*30)

lr
train_conf [[24341  2464]
 [ 3326 23472]]
train_conf [[6080  617]
 [ 817 5887]]
acc_train 0.8919836576311028
acc_test 0.8929930602193866
------------------------------
KNN
train_conf [[25150  1655]
 [ 2341 24457]]
train_conf [[5884  813]
 [ 937 5767]]
acc_train 0.9254519336604294
acc_test 0.8694127303932543
------------------------------
NB
train_conf [[19340  7465]
 [ 1678 25120]]
train_conf [[4806 1891]
 [ 426 6278]]
acc_train 0.8294311885528795
acc_test 0.8271024550406686
------------------------------
dt
train_conf [[26805     0]
 [    4 26794]]
train_conf [[5718  979]
 [ 873 5831]]
acc_train 0.9999253773109714
acc_test 0.8618013581076039
------------------------------
RF
train_conf [[26718    87]
 [  143 26655]]
train_conf [[6023  674]
 [ 829 5875]]
acc_train 0.9957091953808556
acc_test 0.8878441907320349
------------------------------
xgb
train_conf [[23743  3062]
 [ 2868 23930]]
train_conf [[5901  796]
 [ 740 5964]]
acc_train 0.8893718635151018
acc_test 0.8853816879337363
---

In [13]:
from sklearn.pipeline import Pipeline

# Create a pipeline with custom step names
pipe = Pipeline([('scaler--', StandardScaler()), ('model--', LogisticRegression())])


In [14]:
pipe.fit(x_train_over_samp, y_train_over_samp)

In [15]:
y_pred = pipe.predict(x_test_over_samp)

In [16]:
accuracy = accuracy_score(y_test_over_samp, y_pred)
accuracy

0.8929930602193866

In [17]:
import joblib 
joblib.dump(pipe,'pipeline--.h5')

['pipeline--.h5']