# Table Of Contents

# Import Libraries

In [177]:
# Basic Libraries
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import os
from dotenv import load_dotenv
import scipy.sparse
# MongoDB
from pymongo import MongoClient

# Modelling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.compose import make_column_transformer

import certifi
ca = certifi.where()


#ignore warnings
import warnings
warnings.filterwarnings('ignore')


# Read Data From MongoDB

In [112]:
load_dotenv()
username = os.getenv('USER_NAME')
password = os.getenv('PASSWORD')

# connect to MongoDB
client = MongoClient('mongodb+srv://'+ username + ':' + password +'@mycluster.yrgvltw.mongodb.net/?retryWrites=true&w=majority', tlsCAFile=ca)

In [113]:
# database 
db = client['mydatabase']

# collection
collection = db['census_income_cleaned']

In [128]:
# read data from collection
collection_data=collection.find()

# convert collection data to DataFrame
df = pd.DataFrame(collection_data)

In [129]:
df.head()

Unnamed: 0,_id,index,age,workclass,fnlwgt,education_num,marital_status,occupation,relationship,race,sex,hours_per_week,salary
0,636e2a9edf2119b62c7140b3,0,39,State-gov,77516,13,Never-married,Adm-clerical,Not-in-family,White,Male,40,<=50K
1,636e2a9edf2119b62c7140b4,1,50,Self-emp-not-inc,83311,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,<=50K
2,636e2a9edf2119b62c7140b5,2,38,Private,215646,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,<=50K
3,636e2a9edf2119b62c7140b6,3,53,Private,234721,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,<=50K
4,636e2a9edf2119b62c7140b7,4,28,Private,338409,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,<=50K


In [130]:
# remove _id,level_0 and index
df = df.iloc[:,3:]

In [131]:
df.head()

Unnamed: 0,workclass,fnlwgt,education_num,marital_status,occupation,relationship,race,sex,hours_per_week,salary
0,State-gov,77516,13,Never-married,Adm-clerical,Not-in-family,White,Male,40,<=50K
1,Self-emp-not-inc,83311,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,<=50K
2,Private,215646,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,<=50K
3,Private,234721,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,<=50K
4,Private,338409,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,<=50K


# Feature Engineering

## Split Features and Target

In [206]:
# split feature and target
X = df.iloc[:,:-1]
y = df['salary']

In [220]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [221]:
print('X_train shape : {}\nX_test shape : {}'.format(X_train.shape,X_test.shape))

X_train shape : (31635, 9)
X_test shape : (13559, 9)


## Feature Encoding

In [222]:
# Encode categoric values
# Scale numeric values
transformer = make_column_transformer(
    (StandardScaler(), [col for col in X_train.columns if df[col].dtype == 'int64']),
    (OneHotEncoder(sparse=False),[col for col in X_train.columns if df[col].dtype == object]),
     remainder='passthrough')


X_train = pd.DataFrame(transformer.fit_transform(X_train))
X_test = pd.DataFrame(transformer.transform(X_test))

In [223]:
logistic_reg=LogisticRegression()
logistic_reg.fit(X_train, y_train)
logistic_reg_pred=logistic_reg.predict(X_test)

In [224]:
confusion_mat=confusion_matrix(y_test, logistic_reg_pred)
confusion_mat

array([[9352,  840],
       [1478, 1889]], dtype=int64)

In [225]:
classification_rep_log_reg=classification_report(y_test, logistic_reg_pred)
print(classification_rep_log_reg)

              precision    recall  f1-score   support

       <=50K       0.86      0.92      0.89     10192
        >50K       0.69      0.56      0.62      3367

    accuracy                           0.83     13559
   macro avg       0.78      0.74      0.75     13559
weighted avg       0.82      0.83      0.82     13559

