In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix


In [2]:
data = pd.read_csv('adult.csv')
data

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   48842 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


В данных много категорийных признаков, которые необходимо преобразовать для использования в модели

#### 1. Workclass

In [4]:
data['workclass'].value_counts()

Private             33906
Self-emp-not-inc     3862
Local-gov            3136
?                    2799
State-gov            1981
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: workclass, dtype: int64

In [5]:
# Заменим значения "?" на "UnknownWC"
def fix_workclass(row):
    if (row['workclass'] == '?'):
        return 'UnknownWC'
    else:
        return row['workclass']

        
data['workclass'] = data.apply(fix_workclass, axis=1)


data = pd.get_dummies( data, columns = [ 'workclass' ] )

#### 2. Education

In [6]:
data = pd.get_dummies( data, columns = [ 'education' ] )

#### 3. Marital-status

In [7]:
data = pd.get_dummies( data, columns = [ 'marital-status' ] )

#### 4.Occupation

In [8]:
data['occupation'].value_counts()

Prof-specialty       6172
Craft-repair         6112
Exec-managerial      6086
Adm-clerical         5611
Sales                5504
Other-service        4923
Machine-op-inspct    3022
?                    2809
Transport-moving     2355
Handlers-cleaners    2072
Farming-fishing      1490
Tech-support         1446
Protective-serv       983
Priv-house-serv       242
Armed-Forces           15
Name: occupation, dtype: int64

In [9]:
# Заменим значения "?" на "UnknownOcc"
def fix_occupation(row):
    if (row['occupation'] == '?'):
        return 'UnknownOcc'
    else:
        return row['occupation']

        
data['occupation'] = data.apply(fix_occupation, axis=1)

data = pd.get_dummies( data, columns = [ 'occupation'] )

#### 5.Relationship

In [10]:
data = pd.get_dummies( data, columns = [ 'relationship' ] )

#### 6.Race

In [11]:
data = pd.get_dummies( data, columns = [ 'race' ] )

#### 7.Gender

In [12]:
data = pd.get_dummies( data, columns = [ 'gender' ] )

#### 8.Native-country

In [13]:
data['native-country'].value_counts()

United-States                 43832
Mexico                          951
?                               857
Philippines                     295
Germany                         206
Puerto-Rico                     184
Canada                          182
El-Salvador                     155
India                           151
Cuba                            138
England                         127
China                           122
South                           115
Jamaica                         106
Italy                           105
Dominican-Republic              103
Japan                            92
Guatemala                        88
Poland                           87
Vietnam                          86
Columbia                         85
Haiti                            75
Portugal                         67
Taiwan                           65
Iran                             59
Greece                           49
Nicaragua                        49
Peru                        

In [14]:
# У нас слишком много стран, для которых есть небольшое количество значений, что может помешать работе модели
# Сгруппируем страны в регионы с целью уменьшить число параметров при сохранении большей части информации
latam = ['Mexico', 'Puerto-Rico', 'El-Salvador', 'Cuba', 'Jamaica', 'Dominican-Republic', 'Guatemala', 'Columbia', 'Haiti', 'Nicaragua', 'Peru', 'Ecuador', 'Trinadad&Tobago', 'Honduras']
aspac = ['Philippines', 'India', 'China', 'Japan', 'Vietnam', 'Taiwan', 'Hong', 'Thailand', 'Cambodia', 'Laos', 'Iran']
w_europe = ['Germany', 'England', 'Italy', 'Portugal', 'France', 'Ireland', 'Scotland', 'Holand-Netherlands']
e_europe = ['Poland', 'Hungary', 'Greece', 'Yugoslavia']

def get_region(row):
    if row['native-country'] in latam:
        return 'LatAmerica'
    elif row['native-country'] in aspac:
        return 'Asia'
    elif row['native-country'] in w_europe:
        return 'WestEurope'
    elif row['native-country'] in e_europe:
        return 'EastEurope'
    elif row['native-country'] == '?':
        return 'UnknownReg'
    elif row['native-country'] == 'United-States':
        return 'US'
    elif row['native-country'] == 'Canada':
        return 'Canada'
    elif row['native-country'] == 'Outlying-US(Guam-USVI-etc)':
        return 'US'
    elif row['native-country'] == 'South':
        return 'South'

data['region'] = data.apply(get_region, axis=1)
data['region'].value_counts()

US            43855
LatAmerica     2072
Asia            981
UnknownReg      857
WestEurope      602
Canada          182
EastEurope      178
South           115
Name: region, dtype: int64

In [15]:
data = pd.get_dummies( data, columns = [ 'region' ] )

#### 9.Income

In [16]:
data['income'].value_counts()

<=50K    37155
>50K     11687
Name: income, dtype: int64

In [17]:
# Income - это наша целевая переменная, заменим ее на одно новое поле
def get_income(row):
    if row['income'] == '>50K':
        return 1
    else:
        return 0
data['high_income'] = data.apply(get_income, axis=1)
data.drop(['income'], axis=1, inplace=True)

Уберем поля, которые мы не будем использовать в модели

In [18]:
data.drop(['fnlwgt'], axis=1, inplace=True)
data.drop(['capital-gain'], axis=1, inplace=True)
data.drop(['capital-loss'], axis=1, inplace=True)
data.drop(['educational-num'], axis=1, inplace=True)
data.drop(['native-country'], axis=1, inplace=True)

In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 71 columns):
 #   Column                                Non-Null Count  Dtype
---  ------                                --------------  -----
 0   age                                   48842 non-null  int64
 1   hours-per-week                        48842 non-null  int64
 2   workclass_Federal-gov                 48842 non-null  uint8
 3   workclass_Local-gov                   48842 non-null  uint8
 4   workclass_Never-worked                48842 non-null  uint8
 5   workclass_Private                     48842 non-null  uint8
 6   workclass_Self-emp-inc                48842 non-null  uint8
 7   workclass_Self-emp-not-inc            48842 non-null  uint8
 8   workclass_State-gov                   48842 non-null  uint8
 9   workclass_UnknownWC                   48842 non-null  uint8
 10  workclass_Without-pay                 48842 non-null  uint8
 11  education_10th                        488

Разделим данные для обучения и теста

In [20]:
X = data.drop('high_income', axis=1)  
y = data['high_income'] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state = 2020, stratify=y)

In [21]:
model = LogisticRegression()

model.fit(X, y)
print(model.score(X_train, y_train))

0.834537424317763


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [22]:
y_pred = model.predict(X_test)

conf_matrix_baseline = pd.DataFrame(confusion_matrix(y_test, y_pred), index = ['actual 0', 'actual 1'], columns = ['predicted 0', 'predicted 1'])
display(conf_matrix_baseline)
display('Logistic Regression recall score', recall_score(y_test, y_pred))        

Unnamed: 0,predicted 0,predicted 1
actual 0,10333,814
actual 1,1574,1932


'Logistic Regression recall score'

0.5510553337136338

In [23]:
from sklearn import svm

model_SVM = svm.SVC(kernel='linear')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state = 2020, stratify=y)

In [25]:
model_SVM.fit(X, y)
print(model_SVM.score(X_train, y_train))

0.8351809061394015


In [26]:
y_pred = model_SVM.predict(X_test)

conf_matrix_SVM = pd.DataFrame(confusion_matrix(y_test, y_pred), index = ['actual 0', 'actual 1'], columns = ['predicted 0', 'predicted 1'])
display(conf_matrix_SVM)
display('SVM recall score', recall_score(y_test, y_pred))

Unnamed: 0,predicted 0,predicted 1
actual 0,10372,775
actual 1,1617,1889


'SVM recall score'

0.5387906446092413