<a href="https://colab.research.google.com/github/omaralsabbah/Python-/blob/main/census_income_data_classification_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The goal of this project is to predict whether an individual's income exceeds 50K per year, based on census data.


In [None]:
#importing libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [None]:
#importing data sets
# Location of the census CSV. NOTE(review): this looks like a Colab upload path — adjust when running elsewhere.
DATA_PATH = "/content/adult.csv"

data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [None]:
#initial exploration of the data
# info() prints each column's dtype and non-null count; the last expression displays (rows, columns).
data.info()
n_rows, n_columns = data.shape
(n_rows, n_columns)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


(32561, 15)

`data.info()` reports no null (NaN) values in the data set.

In [None]:
#showing summary for the numerical values in the data set
# Select the numeric columns explicitly, then summarise them (count, mean, std, quartiles, min/max).
numeric_columns = data.select_dtypes(include="number")
numeric_columns.describe()

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [None]:
#income is our target
#showing distribution of [income] in our data
# First pass prints absolute counts per class, second pass prints the class proportions.
for as_fraction in (False, True):
  print(data["income"].value_counts(normalize=as_fraction))

<=50K    24720
>50K      7841
Name: income, dtype: int64
<=50K    0.75919
>50K     0.24081
Name: income, dtype: float64


In [None]:
# Number of rows for each value of the "sex" column.
sex_column = data["sex"]
sex_column.value_counts()

Male      21790
Female    10771
Name: sex, dtype: int64

In [None]:
# Number of rows for each value of the "workclass" column.
workclass_column = data["workclass"]
workclass_column.value_counts()

Private             22696
Self-emp-not-inc     2541
Local-gov            2093
?                    1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: workclass, dtype: int64

Missing values are present in the data set, encoded as the string "?".

In [None]:
#finding null values in all columns
# Missing entries appear as the literal string "?", so count those cells per column in one pass.
placeholder_counts = data.isin(["?"]).sum()
for column_name, n_placeholders in placeholder_counts.items():
  print("{}:".format(column_name), n_placeholders)


age: 0
workclass: 1836
fnlwgt: 0
education: 0
education.num: 0
marital.status: 0
occupation: 1843
relationship: 0
race: 0
sex: 0
capital.gain: 0
capital.loss: 0
hours.per.week: 0
native.country: 583
income: 0


In [None]:
# Turn the "?" placeholders into real missing values so isna()/dropna() can see them.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0 (AttributeError there);
# reassigning instead of `inplace=True` keeps the step explicit.
data = data.replace("?", np.nan)

In [None]:
# Per-column count of missing (NaN) cells.
missing_per_column = data.isnull().sum()
missing_per_column

age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
income               0
dtype: int64

We drop the rows containing these null values: they occur only in categorical columns, which makes them hard to fill in or replace.


In [None]:
# Keep only fully populated rows (a single NaN removes the row), then show the remaining (rows, columns).
data = data.dropna(axis="index", how="any")
data.shape

(30162, 15)

In [None]:
# now we do not have null values
# Re-check the per-column NaN counts after dropping the incomplete rows.
data.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

In [None]:
#deleting some of the features that are not useful
# Columns removed before modelling; the result is reassigned to `data` and previewed.
UNUSED_FEATURES = ["education", "fnlwgt"]
data = data.drop(columns=UNUSED_FEATURES)
data.head()

Unnamed: 0,age,workclass,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
1,82,Private,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
3,54,Private,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
5,34,Private,9,Divorced,Other-service,Unmarried,White,Female,0,3770,45,United-States,<=50K
6,38,Private,6,Separated,Adm-clerical,Unmarried,White,Male,0,3770,40,United-States,<=50K


`education` is not needed because the `education.num` column already carries the same information. `fnlwgt` is dropped as well: it is a very discrete feature that is not well understood and may confuse the model.

In [None]:
#data preprocessing
# Column groups: the numerical features are scaled in a later cell, the categorical ones are
# one-hot encoded here.
Numerical=["age","education.num","capital.gain","capital.loss","hours.per.week"]
Categorical = ["workclass","marital.status","occupation","relationship","race","sex","native.country"]

# drop_first=True drops the first dummy of every categorical column.
data_encoded=pd.get_dummies(data,columns=Categorical,drop_first=True)

X=data_encoded.drop("income",axis=1)

# Encode the target with one explicit mapping instead of two in-place replaces on a Series
# taken out of `data_encoded`, so the encoded frame is never mutated behind the reader's back.
# NOTE: "1" is the "<=50K" class and "0" is ">50K". The labels stay strings because the
# evaluation cells call f1_score with pos_label="1".
INCOME_LABELS = {"<=50K": "1", ">50K": "0"}
y = data_encoded["income"].map(INCOME_LABELS)

# map() yields NaN for any label that is not in INCOME_LABELS; fail loudly instead of training on it.
assert y.notna().all(), "unexpected value in the income column"



In [None]:
#splitting the dataset into train and test datasets
# 25% of the rows are held out for testing; the fixed random_state makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.25, random_state=1)
X_train, X_test, y_train, y_test = split_parts
y_test.shape, y_train.shape

((7541,), (22621,))

In [None]:
#scaling numerical values
# The split frames are row subsets of X; take explicit copies so the column assignments below
# write to independent frames instead of triggering pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# The scaler is fitted on the training rows only and then applied unchanged to the test rows,
# so no test-set statistics leak into training.
Sc= StandardScaler()
X_train[Numerical]=Sc.fit_transform(X_train[Numerical])
X_test[Numerical]=Sc.transform(X_test[Numerical])

In [None]:
#defining the model
# Random forest with default hyper-parameters. random_state is fixed (same seed as the
# train/test split) so the fitted trees, and therefore the reported metrics, are reproducible.
forest = RandomForestClassifier(random_state=1)
forest.fit(X_train,y_train)

# Predicted labels for the held-out rows ("1" / "0" strings, matching the target encoding).
y_pred=forest.predict(X_test)
y_pred

array(['1', '0', '1', ..., '0', '1', '1'], dtype=object)

In [None]:
#evaluating first model
# pos_label="1" is the "<=50K" class in the target encoding, so the F1 score describes that class.
forest_accuracy = accuracy_score(y_test, y_pred)
forest_confusion = confusion_matrix(y_test, y_pred)
forest_f1 = f1_score(y_test, y_pred, pos_label="1")

print("Accuracy Score =", forest_accuracy)
print("confusion matrix:\n", forest_confusion)
print("f1_score=", forest_f1)

Accuracy Score = 0.8485611987800027
confusion matrix:
 [[1228  673]
 [ 469 5171]]
f1_score= 0.9005572971090213


In [None]:
# k-nearest-neighbours classifier with 15 neighbours and the minkowski metric at p=2;
# fit() returns the estimator itself, so construction and fitting are chained.
KNN = KNeighborsClassifier(metric="minkowski", p=2, n_neighbors=15).fit(X_train, y_train)
y_pred2 = KNN.predict(X_test)

In [None]:
#evaluating KNN model
# pos_label="1" is the "<=50K" class in the target encoding, so the F1 score describes that class.
knn_accuracy = accuracy_score(y_test, y_pred2)
knn_confusion = confusion_matrix(y_test, y_pred2)
knn_f1 = f1_score(y_test, y_pred2, pos_label="1")

print("Accuracy Score =", knn_accuracy)
print("confusion matrix:\n", knn_confusion)
print("f1_score=", knn_f1)

Accuracy Score = 0.8436546877072006
confusion matrix:
 [[1196  705]
 [ 474 5166]]
f1_score= 0.8975762314308053


In [None]:
# Logistic regression using the lbfgs solver with an iteration cap of 1000;
# fit() returns the estimator itself, so construction and fitting are chained.
LR = LogisticRegression(max_iter=1000, solver='lbfgs').fit(X_train, y_train)
y_pred3 = LR.predict(X_test)

In [None]:
#evaluating LR model
# pos_label="1" is the "<=50K" class in the target encoding, so the F1 score describes that class.
lr_accuracy = accuracy_score(y_test, y_pred3)
lr_confusion = confusion_matrix(y_test, y_pred3)
lr_f1 = f1_score(y_test, y_pred3, pos_label="1")

print("Accuracy Score =", lr_accuracy)
print("confusion matrix:\n", lr_confusion)
print("f1_score=", lr_f1)

Accuracy Score = 0.8482959819652566
confusion matrix:
 [[1157  744]
 [ 400 5240]]
f1_score= 0.9015829318651066
