# IMPORTING THE LIBRARIES

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# READING THE DATASET:

In [2]:
adult=pd.read_csv('C:\\Users\\Naveen\\Downloads\\adult dataset.csv')
adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


# FINDING THE NULL VALUES

In [3]:
adult.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

# Cleaning Data

In [4]:
adult.replace('?', np.nan, inplace=True)

# FILLING THE NULL VALUES

In [5]:
ad=adult.fillna(method="bfill")

# Applying Label encoder

In [6]:
from sklearn.preprocessing import LabelEncoder 
le=LabelEncoder()

In [7]:
for col in ad.columns:
    if ad[col].dtypes == 'object':
        ad[col] = le.fit_transform(ad[col])

Labelencoder is used to convert the catagorical data into Numeral data.

In [8]:
ad.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,3,77053,11,9,6,3,1,4,0,0,4356,40,38,0
1,82,3,132870,11,9,6,3,1,4,0,0,4356,18,38,0
2,66,3,186061,15,10,6,6,4,2,0,0,4356,40,38,0
3,54,3,140359,5,4,0,6,4,4,0,0,3900,40,38,0
4,41,3,264663,15,10,5,9,3,4,0,0,3900,40,38,0


# DROPING THE UNNECESSARY COLUMNS:

In [9]:
p=ad.drop(['fnlwgt'],axis=1)
p.head()

Unnamed: 0,age,workclass,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,3,11,9,6,3,1,4,0,0,4356,40,38,0
1,82,3,11,9,6,3,1,4,0,0,4356,18,38,0
2,66,3,15,10,6,6,4,2,0,0,4356,40,38,0
3,54,3,5,4,0,6,4,4,0,0,3900,40,38,0
4,41,3,15,10,5,9,3,4,0,0,3900,40,38,0


fnlwgt appears to be a highly dispersive. In fact, it is a weight on the Current Population Survey (CPS) files. We will not incorporate fnlwgt in set of predictor variables, we drop the fnlwgt column

# NORMALIZING THE DATA:

In [10]:
from sklearn.preprocessing import Normalizer
n=Normalizer()

In [11]:
p[['capital.gain','capital.loss']]=n.fit_transform(p[['capital.gain','capital.loss']])

When you normalize a feature, all feature values will be in the range of 0 to 1.

In [12]:
p.head()

Unnamed: 0,age,workclass,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,3,11,9,6,3,1,4,0,0.0,1.0,40,38,0
1,82,3,11,9,6,3,1,4,0,0.0,1.0,18,38,0
2,66,3,15,10,6,6,4,2,0,0.0,1.0,40,38,0
3,54,3,5,4,0,6,4,4,0,0.0,1.0,40,38,0
4,41,3,15,10,5,9,3,4,0,0.0,1.0,40,38,0


# STORING THE VALUES IN X AND Y

In [13]:
x=p.iloc[:,0:12]
y=p.iloc[:,13]

In [14]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.3,random_state=0)


# Here we use the classification Algorithms because our  target variable is in the form of Discrete.

# APPLYING LOGISTIC REGRESSION

In [15]:
from sklearn.linear_model import LogisticRegression
# Logistic Regression
logreg = LogisticRegression(random_state=0)
logreg.fit(xtrain, ytrain)
ypred = logreg.predict(xtest)
score_logreg = logreg.score(xtest,ytest)
print('The accuracy of the Logistic Regression is', score_logreg)



The accuracy of the Logistic Regression is 0.8114443648275156


Logistic regression is used when your dependent variable is in the binary format.

# Confusion Matrix

In [16]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(ytest, ypred)
cm

array([[6927,  483],
       [1359, 1000]], dtype=int64)

A confusion matrix is a summary of prediction results on a classification problem.

A perfect set of predictions is shown as a diagonal line from the top left to the bottom right of the matix

# Classification Report:

In [17]:
from sklearn.metrics import classification_report
print(classification_report(ypred,ytest))

              precision    recall  f1-score   support

           0       0.93      0.84      0.88      8286
           1       0.42      0.67      0.52      1483

   micro avg       0.81      0.81      0.81      9769
   macro avg       0.68      0.76      0.70      9769
weighted avg       0.86      0.81      0.83      9769



# APPLYING DECISIONTREE ALGORITHM

In [18]:
from sklearn.tree import DecisionTreeClassifier
dec=DecisionTreeClassifier(random_state=0)
dec.fit(xtrain,ytrain)
y_pred=dec.predict(xtest)
score=dec.score(xtest,ytest)
score

0.7902548879107381

A decision tree is built on an entire dataset, using all the features/vaiables.
In decision analysis, a decision tree can be used to visually and explicitly represent decisions and decision making. As the name goes, it uses a tree-like model of decisions.

# Confusion Matrix:

In [19]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(ytest, y_pred)
cm

array([[6425,  985],
       [1064, 1295]], dtype=int64)

# Classification Report:

In [20]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,ytest))

              precision    recall  f1-score   support

           0       0.87      0.86      0.86      7489
           1       0.55      0.57      0.56      2280

   micro avg       0.79      0.79      0.79      9769
   macro avg       0.71      0.71      0.71      9769
weighted avg       0.79      0.79      0.79      9769



# APPLYING RANDOMFOREST ALGORITHM

In [21]:
from sklearn.ensemble import RandomForestClassifier
ran=RandomForestClassifier(random_state=0)
ran.fit(xtrain,ytrain)
y_pre=ran.predict(xtest)
score=ran.score(xtest,ytest)
score



0.8207595455010749

Random forest randomly selects observations/rows and specific features /variables to build multiple decision trees from and then averages the result.
 Random forest runtimes are quite fast, and they are able to deal with unbalanced and missing data.

# Confusion Matrix:

In [22]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(ytest, y_pre)
cm

array([[6759,  651],
       [1100, 1259]], dtype=int64)

# Classification Report:

In [23]:
from sklearn.metrics import classification_report
print(classification_report(y_pre,ytest))

              precision    recall  f1-score   support

           0       0.91      0.86      0.89      7859
           1       0.53      0.66      0.59      1910

   micro avg       0.82      0.82      0.82      9769
   macro avg       0.72      0.76      0.74      9769
weighted avg       0.84      0.82      0.83      9769



# KNN:

In [24]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(xtrain, ytrain)
y = knn.predict(xtest)
score_knn = knn.score(xtest,ytest)
print('The accuracy of the KNN Model is',score_knn)

The accuracy of the KNN Model is 0.8036646534957519


 It simply calculates the distance of a new data point to all other training data points. The distance can be of any type e.g Euclidean or Manhattan etc.
    It is used to find out the nearest values.

# Confusion Matrix:

In [25]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(ytest, y)
cm

array([[6579,  831],
       [1087, 1272]], dtype=int64)

# Classification Report:

In [26]:
from sklearn.metrics import classification_report
print(classification_report(y,ytest))

              precision    recall  f1-score   support

           0       0.89      0.86      0.87      7666
           1       0.54      0.60      0.57      2103

   micro avg       0.80      0.80      0.80      9769
   macro avg       0.71      0.73      0.72      9769
weighted avg       0.81      0.80      0.81      9769



# SVM:

In [27]:
from sklearn.svm import SVC
sv=SVC(gamma='auto')
sv.fit(xtrain,ytrain)
yp=sv.predict(xtest)
score=sv.score(xtest,ytest)
score


0.8187122530453476

SVM is a supervised machine learning algorithm which can be used for classification or regression problems. It uses a technique called the kernel trick to transform your data and then based on these transformations it finds an optimal boundary between the possible outputs.

# Confusion Matrix:

In [28]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(ytest, yp)
cm

array([[6908,  502],
       [1269, 1090]], dtype=int64)

# Classification Report:

In [29]:
from sklearn.metrics import classification_report
print(classification_report(yp,ytest))

              precision    recall  f1-score   support

           0       0.93      0.84      0.89      8177
           1       0.46      0.68      0.55      1592

   micro avg       0.82      0.82      0.82      9769
   macro avg       0.70      0.76      0.72      9769
weighted avg       0.86      0.82      0.83      9769



# NAIVE BAYES THEOREM:

In [30]:
from sklearn.naive_bayes import GaussianNB
nv=GaussianNB()
nv.fit(xtrain,ytrain)
ypr=nv.predict(xtest)
score=nv.score(xtest,ytest)
score

0.8078616030299929

Naive Bayes uses a similar method to predict the probability of different class based on various attributes. This algorithm is mostly used in text classification and with problems having multiple classes.

# Confusion Matrix

In [31]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(ytest, ypr)
cm

array([[6498,  912],
       [ 965, 1394]], dtype=int64)

A confusion matrix is a summary of prediction results on a classification problem.

A perfect set of predictions is shown as a diagonal line from the top left to the bottom right of the matix

# Classification Report

In [32]:
from sklearn.metrics import classification_report
print(classification_report(ypr,ytest))

              precision    recall  f1-score   support

           0       0.88      0.87      0.87      7463
           1       0.59      0.60      0.60      2306

   micro avg       0.81      0.81      0.81      9769
   macro avg       0.73      0.74      0.74      9769
weighted avg       0.81      0.81      0.81      9769

