In [6]:
import numpy as np # numerical calc package
import pandas as pd # holds data
import matplotlib.pyplot as plt # plotting library
import seaborn as sns # pretty plotting

# plotting config
sns.set(style='white', rc={'figure.figsize':(20,10)})

from sklearn.model_selection import train_test_split # split dataset
from sklearn.linear_model import LogisticRegression # linear model for classification
from sklearn.metrics import confusion_matrix

In [7]:
data = pd.read_csv("C:/Users/Asus/Desktop/census.csv")

In [8]:
data.shape

(45222, 14)

In [9]:
data.head()

Unnamed: 0,age,workclass,education_level,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38,Private,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53,Private,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28,Private,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45222 entries, 0 to 45221
Data columns (total 14 columns):
age                45222 non-null int64
workclass          45222 non-null object
education_level    45222 non-null object
education-num      45222 non-null float64
marital-status     45222 non-null object
occupation         45222 non-null object
relationship       45222 non-null object
race               45222 non-null object
sex                45222 non-null object
capital-gain       45222 non-null float64
capital-loss       45222 non-null float64
hours-per-week     45222 non-null float64
native-country     45222 non-null object
income             45222 non-null object
dtypes: float64(4), int64(1), object(9)
memory usage: 4.8+ MB


In [11]:
!pip install pandas_profiling



In [12]:
import pandas_profiling as pp

In [13]:
report = pp.ProfileReport(data)

In [14]:
report.to_file('census_data_assessement.html')

In [15]:
#Convert 'income' from object to numeric
from sklearn.preprocessing import LabelEncoder
lb_make = LabelEncoder()
data["inc"] = lb_make.fit_transform(data["income"])
data[["income", "inc"]].head(11)

Unnamed: 0,income,inc
0,<=50K,0
1,<=50K,0
2,<=50K,0
3,<=50K,0
4,<=50K,0
5,<=50K,0
6,<=50K,0
7,>50K,1
8,>50K,1
9,>50K,1


In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45222 entries, 0 to 45221
Data columns (total 15 columns):
age                45222 non-null int64
workclass          45222 non-null object
education_level    45222 non-null object
education-num      45222 non-null float64
marital-status     45222 non-null object
occupation         45222 non-null object
relationship       45222 non-null object
race               45222 non-null object
sex                45222 non-null object
capital-gain       45222 non-null float64
capital-loss       45222 non-null float64
hours-per-week     45222 non-null float64
native-country     45222 non-null object
income             45222 non-null object
inc                45222 non-null int32
dtypes: float64(4), int32(1), int64(1), object(9)
memory usage: 5.0+ MB


In [17]:
#Creating dummies
gender = pd.get_dummies(data['sex'])
gender

Unnamed: 0,Female,Male
0,0,1
1,0,1
2,0,1
3,0,1
4,1,0
...,...,...
45217,0,1
45218,1,0
45219,0,1
45220,0,1


In [18]:
#Creating dummies
educ_level = pd.get_dummies(data['education_level'])
educ_level

Unnamed: 0,10th,11th,12th,1st-4th,5th-6th,7th-8th,9th,Assoc-acdm,Assoc-voc,Bachelors,Doctorate,HS-grad,Masters,Preschool,Prof-school,Some-college
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45217,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
45218,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
45219,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
45220,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [19]:
#Creating dummies
status = pd.get_dummies(data['marital-status'])
status

Unnamed: 0,Divorced,Married-AF-spouse,Married-civ-spouse,Married-spouse-absent,Never-married,Separated,Widowed
0,0,0,0,0,1,0,0
1,0,0,1,0,0,0,0
2,1,0,0,0,0,0,0
3,0,0,1,0,0,0,0
4,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...
45217,0,0,0,0,1,0,0
45218,1,0,0,0,0,0,0
45219,0,0,1,0,0,0,0
45220,1,0,0,0,0,0,0


In [20]:
#Creating dummies
occupation = pd.get_dummies(data['occupation'])
occupation

Unnamed: 0,Adm-clerical,Armed-Forces,Craft-repair,Exec-managerial,Farming-fishing,Handlers-cleaners,Machine-op-inspct,Other-service,Priv-house-serv,Prof-specialty,Protective-serv,Sales,Tech-support,Transport-moving
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45217,0,0,0,0,0,0,0,0,0,1,0,0,0,0
45218,0,0,0,0,0,0,0,0,0,1,0,0,0,0
45219,0,0,0,0,0,0,0,0,0,1,0,0,0,0
45220,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [21]:
#Creating dummies
race = pd.get_dummies(data['race'])
race

Unnamed: 0,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,Other,White
0,0,0,0,0,1
1,0,0,0,0,1
2,0,0,0,0,1
3,0,0,1,0,0
4,0,0,1,0,0
...,...,...,...,...,...
45217,0,0,0,0,1
45218,0,0,0,0,1
45219,0,0,0,0,1
45220,0,1,0,0,0


In [22]:
#Creating dummies
workclass = pd.get_dummies(data['workclass'])
workclass

Unnamed: 0,Federal-gov,Local-gov,Private,Self-emp-inc,Self-emp-not-inc,State-gov,Without-pay
0,0,0,0,0,0,1,0
1,0,0,0,0,1,0,0
2,0,0,1,0,0,0,0
3,0,0,1,0,0,0,0
4,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...
45217,0,0,1,0,0,0,0
45218,0,0,1,0,0,0,0
45219,0,0,1,0,0,0,0
45220,0,0,1,0,0,0,0


In [34]:
#Merge datasets
new_data = pd.concat([data, gender, educ_level, status, occupation, race, workclass], axis = 1, sort = False)
new_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45222 entries, 0 to 45221
Data columns (total 66 columns):
age                       45222 non-null int64
workclass                 45222 non-null object
education_level           45222 non-null object
education-num             45222 non-null float64
marital-status            45222 non-null object
occupation                45222 non-null object
relationship              45222 non-null object
race                      45222 non-null object
sex                       45222 non-null object
capital-gain              45222 non-null float64
capital-loss              45222 non-null float64
hours-per-week            45222 non-null float64
native-country            45222 non-null object
income                    45222 non-null object
inc                       45222 non-null int32
 Female                   45222 non-null uint8
 Male                     45222 non-null uint8
 10th                     45222 non-null uint8
 11th                     45

In [31]:
new_data.head()

Unnamed: 0,age,workclass,education_level,education-num,marital-status,occupation,relationship,race,sex,capital-gain,...,Black,Other,White,Federal-gov,Local-gov,Private,Self-emp-inc,Self-emp-not-inc,State-gov,Without-pay
0,39,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,...,0,0,1,0,0,0,0,0,1,0
1,50,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,...,0,0,1,0,0,0,0,1,0,0
2,38,Private,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,...,0,0,1,0,0,1,0,0,0,0
3,53,Private,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,...,1,0,0,0,0,1,0,0,0,0
4,28,Private,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,...,1,0,0,0,0,1,0,0,0,0


In [28]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45222 entries, 0 to 45221
Data columns (total 15 columns):
age                45222 non-null int64
workclass          45222 non-null object
education_level    45222 non-null object
education-num      45222 non-null float64
marital-status     45222 non-null object
occupation         45222 non-null object
relationship       45222 non-null object
race               45222 non-null object
sex                45222 non-null object
capital-gain       45222 non-null float64
capital-loss       45222 non-null float64
hours-per-week     45222 non-null float64
native-country     45222 non-null object
income             45222 non-null object
inc                45222 non-null int32
dtypes: float64(4), int32(1), int64(1), object(9)
memory usage: 5.0+ MB


In [40]:
age = data['age']
educnum = data['education-num']
hours = data['hours-per-week']

In [43]:
# write list of features we want to use
features = [age,educnum,hours,gender]

# Set X
X = features


AttributeError: 'list' object has no attribute 'head'

In [None]:
# Set y
y = data["inc"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42)

In [None]:
# initialize model
log_reg = LogisticRegression()

In [None]:
# fit model
log_reg.fit(X_train, y_train)

In [None]:
# metrics (accuracy)
acc = log_reg.score(X_test, y_test)

In [None]:
# predictions
y_pred = log_reg.predict(X_test)

In [None]:
# accuracy: of all selected, how many are correct
acc

In [None]:
# confusion matrix
cf = confusion_matrix(y_test, y_pred)
cf

In [None]:
# heatmap of confusion matrix 
sns.heatmap(cf, annot = True)

In [None]:
# Assign test data into new variable
test_data = X_test

# Create column to insert predictions
test_data['predictions'] = y_pred

In [None]:
# Describe why the 
test_data[test_data['predictions']==0].describe()

In [None]:
test_data[test_data['predictions']==1].describe()

In [None]:
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score
from inspect import signature

In [None]:
#Precision: how many from the predicted items are relevant
precision_score(y_test, y_pred)

In [None]:
#Recall: how many relevant items were selected
recall_score(y_test, y_pred)

In [None]:
f1_score(y_test, y_pred)

In [None]:
precision, recall, threshold = precision_recall_curve(y_test, y_pred)
average_precision = average_precision_score(y_test, y_pred)
step_kwargs = ({'step': 'post'} if 'step' in signature(plt.fill_between).parameters else {})
plt.step(recall, precision, color='r', alpha=0.2, where='post')
plt.fill_between(recall, precision, alpha=0.2, color='r', **step_kwargs)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.0])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(average_precision))