In [1]:
#importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.figure_factory as ff
import plotly.express as px

#To ignore warning messages
import warnings
warnings.filterwarnings('ignore')

In [2]:
data=pd.read_csv(r"C:\Users\amani\Downloads\salarydata.csv") # reading the dataset
data.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
data.shape

(32561, 14)

In [4]:
data.isin(['?']).sum() # dataset contains meaningless values of '?' in certain columns

age                  0
workclass         1836
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
salary               0
dtype: int64

In [5]:
data = data.replace(to_replace='?', value=np.nan) # replacing fields having '?' with null values

In [6]:
# Making sure that the data does not contain unecessary spaces.
data=data.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

In [7]:
# converting the target column into numerical classes (values of either 0 or 1).
data['salary'] = data['salary'].replace({'<=50K':0,'>50K':1})

In [8]:
uniq= pd.DataFrame(data.nunique(),columns=['Unique Values'])
uniq

Unnamed: 0,Unique Values
age,73
workclass,8
education,16
education-num,16
marital-status,7
occupation,14
relationship,6
race,5
sex,2
capital-gain,119


In [9]:
data.isna().sum()

age                  0
workclass         1836
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
salary               0
dtype: int64

In [10]:
Total = data.isnull().sum().sort_values(ascending=False) 
Percent = (data.isnull().sum()*100/data.isnull().count()).sort_values(ascending=False)   
missing_data = pd.concat([Total, Percent], axis = 1, keys = ['Total', 'Percentage of Missing Values'])
missing_data

Unnamed: 0,Total,Percentage of Missing Values
occupation,1843,5.660146
workclass,1836,5.638647
native-country,583,1.790486
age,0,0.0
education,0,0.0
education-num,0,0.0
marital-status,0,0.0
relationship,0,0.0
race,0,0.0
sex,0,0.0


In [11]:
# numeric features
numeric_features = data.select_dtypes(include=np.number)
numeric_features.columns

Index(['age', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week', 'salary'],
      dtype='object')

In [12]:
# categoric features
categoric_features = data.select_dtypes(include='object')
categoric_features.columns

Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country'],
      dtype='object')

In [13]:
# filling missing values
msv_col = ['workclass','occupation','native-country']
for col in msv_col:
    data[col] = data[col].fillna(data[col].mode()[0])

In [14]:
data.isna().sum()

age               0
workclass         0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
salary            0
dtype: int64

In [15]:
# education & education number column are just the same, so dropping education number column.
data.drop(labels='education-num', axis=1, inplace=True)

In [16]:
# Transforming Maritial Status column with value as either married or not married.
data = data.replace({'Married-civ-spouse':'married','Married-AF-spouse':'married','Married-spouse-absent':'married',
                    'Never-married':'not married','Divorced':'not married','Separated':'not married','Widowed':'not married'})

In [17]:
from sklearn.preprocessing import LabelEncoder
column = ['workclass', 'education', 'marital-status', 'occupation','relationship', 'race', 'sex', 'native-country']
le = LabelEncoder()

In [18]:
for col in column:
    data[col] = le.fit_transform(data[col])
data.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,6,9,1,0,1,4,1,2174,0,40,38,0
1,50,5,9,0,3,0,4,1,0,0,13,38,0
2,38,3,11,1,5,1,4,1,0,0,40,38,0
3,53,3,1,0,5,0,2,1,0,0,40,38,0
4,28,3,9,0,9,5,2,0,0,0,40,4,0


In [19]:
data.drop(['capital-gain','capital-loss'],axis=1,inplace=True)
data.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary
0,39,6,9,1,0,1,4,1,40,38,0
1,50,5,9,0,3,0,4,1,13,38,0
2,38,3,11,1,5,1,4,1,40,38,0
3,53,3,1,0,5,0,2,1,40,38,0
4,28,3,9,0,9,5,2,0,40,4,0


In [20]:
X=data.drop('salary',axis=1)
y=data['salary']

In [21]:
# Feature scaling on training data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X[['age',  'hours-per-week']])
input_scaled = scaler.transform(X[['age',  'hours-per-week']])
scaled_data = pd.DataFrame(input_scaled,columns=['age',  'hours-per-week'])

In [22]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42,test_size=0.2)

In [23]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42,test_size=0.2)

In [24]:
from sklearn.linear_model import LogisticRegression
#Defining Logistic Regression Model & fitting train data
lr=LogisticRegression()
logit_model=lr.fit(X_train,y_train)
#Predicting the result of test data using obtained model
y_pred_logit=logit_model.predict(X_test)

In [25]:
from sklearn.metrics import precision_score,recall_score,accuracy_score,f1_score
precision_score(y_test,y_pred_logit)

0.5795053003533569

In [26]:
accuracy_score(y_test,y_pred_logit)

0.7795178873023184

In [27]:
X=data.drop('salary',axis=1)
y=data['salary']

In [28]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0,test_size=0.2)

In [29]:
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier()
dt_clf = dt_clf.fit(X_train,y_train)
y_pred_dt = dt_clf.predict(X_test)

In [30]:
accuracy_score(y_test,y_pred_dt)

0.7744510978043913

In [31]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
model_rf=rf.fit(X_train,y_train)
y_pred_rf=model_rf.predict(X_test)

In [32]:
accuracy_score(y_test,y_pred_rf)

0.8160601873176724

In [33]:
### importing lazypredict library
import lazypredict
### importing LazyClassifier for classification problem
from lazypredict.Supervised import LazyClassifier
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric = None)
## fitting data in LazyClassifier
models,predictions = clf.fit(X_train, X_test, y_train, y_test)
print(models)

100%|██████████| 29/29 [03:38<00:00,  7.55s/it]

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
LGBMClassifier                     0.84               0.75     0.75      0.83   
QuadraticDiscriminantAnalysis      0.75               0.75     0.75      0.77   
XGBClassifier                      0.83               0.75     0.75      0.83   
AdaBoostClassifier                 0.83               0.74     0.74      0.82   
RandomForestClassifier             0.82               0.73     0.73      0.81   
NearestCentroid                    0.70               0.72     0.72      0.72   
GaussianNB                         0.74               0.72     0.72      0.75   
SVC                                0.80               0.72     0.72      0.80   
ExtraTreesClassifier               0.81               0.72     0.72      0.80   
KNeighborsClassifier               0.80               0.71     0.71      0.80   
BaggingClassifier           




In [34]:
# fit model no training data
from numpy import loadtxt
from xgboost import XGBClassifier
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0, ...)

In [35]:
print(model)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0, ...)


In [36]:
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

In [37]:
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 83.23%


In [38]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
svc=SVC(kernel='rbf')
svc.fit(X_train,y_train)
Y_pred=svc.predict(X_test)
print('Accuracy on training data is:',svc.score(X_train,y_train))
print('Accuracy is:',accuracy_score(y_test,y_pred))
print('Precision is:',precision_score(y_test,y_pred,average='weighted'))
print('Recall is:',recall_score(y_test,y_pred,average='weighted'))
print('f1 score is:',f1_score(y_test,y_pred,average='weighted'))

Accuracy on training data is: 0.7722665847665847
Accuracy is: 0.8323353293413174
Precision is: 0.8261164857643325
Recall is: 0.8323353293413174
f1 score is: 0.8281361684069845


In [39]:
from tabulate import tabulate
table = [['Model', 'Accuracy'], ['Logistic Regression',77.95],['Decision tree',77.64],['Random Forest',81.75],['XGB Classifier',83.99],['SVM',83.98]]
print(tabulate(table,headers='firstrow',tablefmt='fancy_grid'))

╒═════════════════════╤════════════╕
│ Model               │   Accuracy │
╞═════════════════════╪════════════╡
│ Logistic Regression │      77.95 │
├─────────────────────┼────────────┤
│ Decision tree       │      77.64 │
├─────────────────────┼────────────┤
│ Random Forest       │      81.75 │
├─────────────────────┼────────────┤
│ XGB Classifier      │      83.99 │
├─────────────────────┼────────────┤
│ SVM                 │      83.98 │
╘═════════════════════╧════════════╛


In [40]:
from sklearn.model_selection import GridSearchCV
# Creating the hyperparameter grid
param_grid = {'C': [1,10,100,1000]}
# Instantiating logistic regression classifier
logreg = LogisticRegression()
# Instantiating the GridSearchCV object
logreg_cv = GridSearchCV(logreg, param_grid, cv=5)
logreg_cv.fit(X_train, y_train)
logreg_cv.predict(X_test)
# Print the tuned parameters and score
print("Tuned Logistic Regression Parameters: {}".format(logreg_cv.best_params_)) 
print("Best score is {}".format(logreg_cv.best_score_))

Tuned Logistic Regression Parameters: {'C': 1}
Best score is 0.7815183303370181


In [42]:
from sklearn.model_selection import GridSearchCV
# Creating the hyperparameter grid
param_grid = {'C': [1,10,100,1000]}
# Instantiating SVM classifier
svm_clf = SVC()
# Instantiating the GridSearchCV object
svm_cv = GridSearchCV(svm_clf, param_grid, cv = 5)
svm_cv.fit(X_train, y_train)
svm_cv.predict(X_test)
# Print the tuned parameters and score
print("Tuned SVM Parameters: {}".format(logreg_cv.best_params_)) 
print("Best score is {}".format(logreg_cv.best_score_))

Tuned SVM Parameters: {'C': 1}
Best score is 0.7815183303370181
