# Классификация: Логистическая регрессия и SVM

In [139]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model

In [140]:
dataset = pd.read_csv('adult.csv')

### Сделаем обзор датасета:

In [141]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   48842 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [142]:
dataset.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K


In [143]:
dataset['income'].value_counts()

<=50K    37155
>50K     11687
Name: income, dtype: int64

In [144]:
dataset['workclass'].value_counts()

Private             33906
Self-emp-not-inc     3862
Local-gov            3136
?                    2799
State-gov            1981
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: workclass, dtype: int64

In [145]:
dataset['education'].value_counts()

HS-grad         15784
Some-college    10878
Bachelors        8025
Masters          2657
Assoc-voc        2061
11th             1812
Assoc-acdm       1601
10th             1389
7th-8th           955
Prof-school       834
9th               756
12th              657
Doctorate         594
5th-6th           509
1st-4th           247
Preschool          83
Name: education, dtype: int64

In [146]:
dataset['occupation'].value_counts()

Prof-specialty       6172
Craft-repair         6112
Exec-managerial      6086
Adm-clerical         5611
Sales                5504
Other-service        4923
Machine-op-inspct    3022
?                    2809
Transport-moving     2355
Handlers-cleaners    2072
Farming-fishing      1490
Tech-support         1446
Protective-serv       983
Priv-house-serv       242
Armed-Forces           15
Name: occupation, dtype: int64

In [147]:
dataset['relationship'].value_counts()

Husband           19716
Not-in-family     12583
Own-child          7581
Unmarried          5125
Wife               2331
Other-relative     1506
Name: relationship, dtype: int64

In [148]:
dataset['race'].value_counts()

White                 41762
Black                  4685
Asian-Pac-Islander     1519
Amer-Indian-Eskimo      470
Other                   406
Name: race, dtype: int64

In [149]:
dataset['gender'].value_counts()

Male      32650
Female    16192
Name: gender, dtype: int64

In [150]:
dataset['capital-gain'].value_counts()

0        44807
15024      513
7688       410
7298       364
99999      244
         ...  
2387         1
22040        1
6612         1
1111         1
1639         1
Name: capital-gain, Length: 123, dtype: int64

In [151]:
dataset['capital-loss'].value_counts()

0       46560
1902      304
1977      253
1887      233
2415       72
        ...  
1539        1
2489        1
2201        1
1421        1
1870        1
Name: capital-loss, Length: 99, dtype: int64

In [152]:
dataset['hours-per-week'].value_counts()

40    22803
50     4246
45     2717
60     2177
35     1937
      ...  
69        1
94        1
79        1
82        1
87        1
Name: hours-per-week, Length: 96, dtype: int64

In [173]:
dataset['native-country'].value_counts()

United-States                 43832
Mexico                          951
Other                           857
Philippines                     295
Germany                         206
Puerto-Rico                     184
Canada                          182
El-Salvador                     155
India                           151
Cuba                            138
England                         127
China                           122
South                           115
Jamaica                         106
Italy                           105
Dominican-Republic              103
Japan                            92
Guatemala                        88
Poland                           87
Vietnam                          86
Columbia                         85
Haiti                            75
Portugal                         67
Taiwan                           65
Iran                             59
Greece                           49
Nicaragua                        49
Peru                        

# Логичстическая регрессия:

### Следующий этап - это обработка пропущенных значений и кодирование категориальных признаков.

Нулевых значений в датасете нет. Но есть значения '?', что означает, что значения пропущены.

In [174]:
categorical_cols = ['workclass','education','marital-status','occupation','relationship','race','gender','native-country']


In [177]:
# Заменим вопросы на категорию "Other"

In [178]:
for col in categorical_cols:
    dataset.loc[dataset[col] == '?', col] = 'Other'

In [179]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
LabEnc = LabelEncoder()

In [180]:
# Выделим целевую переменную income в переменную y и закодируем значениями 0 и 1:
LabEnc.fit( dataset['income'] )
LabEnc.classes_
LabEnc.transform( dataset['income'] )
y = pd.Series( data = LabEnc.transform(dataset['income'] ) )
y.head()

0    0
1    0
2    1
3    1
4    0
dtype: int64

Рассмотрим основной способ преоборазования категориальных признаков в вещественные: one-hot encoding. 

In [181]:
dataset.head(1)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K


Выделим все остальные признаки в X:

In [209]:
X = pd.get_dummies( dataset.drop('income', 1), columns = categorical_cols)

In [210]:
X.shape

(48842, 108)

In [212]:
X.head(3)

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Other,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,25,226802,7,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,38,89814,9,0,0,50,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,28,336951,12,0,0,40,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0


Так же стоит провести масштабирование вещественных признаков, так как у ряда признаков довольно большие значения

In [224]:
num_cols = list(set(dataset.columns) - set(categorical_cols) - set(['income']))

In [225]:
num_cols

['educational-num',
 'capital-loss',
 'hours-per-week',
 'fnlwgt',
 'capital-gain',
 'age']

In [405]:
X_num = X[num_cols]

In [406]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_num)
X_num_scaled = scaler.transform(X_num)

In [407]:
X_num_scaled = pd.DataFrame(data = X_num_scaled, columns = num_cols)

In [408]:
X_num_scaled.head(3)

Unnamed: 0,educational-num,capital-loss,hours-per-week,fnlwgt,capital-gain,age
0,-1.197259,-0.217127,-0.034087,0.351675,-0.144804,-0.995129
1,-0.419335,-0.217127,0.77293,-0.945524,-0.144804,-0.046942
2,0.74755,-0.217127,-0.034087,1.394723,-0.144804,-0.776316


In [433]:
X_cat = X[list(set(X.columns) - set(X_num_scaled.columns))]

In [434]:
dataset_modif = X_num_scaled

In [435]:
# Получаем датасет, признаки в котором выполняют все требования логистической регрессии:

In [436]:
result_X = X_cat.join(X_num_scaled, how="outer")

In [437]:
result_X.head(3)

Unnamed: 0,occupation_Armed-Forces,native-country_Philippines,education_Bachelors,native-country_Canada,education_HS-grad,occupation_Farming-fishing,marital-status_Separated,marital-status_Widowed,native-country_Greece,occupation_Other-service,...,native-country_Germany,occupation_Adm-clerical,education_5th-6th,education_1st-4th,educational-num,capital-loss,hours-per-week,fnlwgt,capital-gain,age
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,-1.197259,-0.217127,-0.034087,0.351675,-0.144804,-0.995129
1,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,-0.419335,-0.217127,0.77293,-0.945524,-0.144804,-0.046942
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.74755,-0.217127,-0.034087,1.394723,-0.144804,-0.776316


In [438]:
result_X.shape

(48842, 108)

### Необходимо разделить выборку на тренировочную и тестовую:

In [439]:
from sklearn.model_selection import train_test_split

(X_train, X_test,y_train, y_test) = train_test_split(result_X, y, test_size=0.3, random_state=0)

### Создадим и обучим модель:

In [440]:
model = LogisticRegression(random_state=0).fit(X_train, y_train)
predictions = model.predict(X_test)
predictions_proba = model.predict_proba(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


###  Оценим точность модели:

In [441]:
model.score(X_test, y_test)

0.8511567597079096

In [442]:
# Попробуем уменьшить кол-во признаков с помощью Principal Component Analysis

In [443]:
from sklearn.decomposition import PCA
# pca - keep 99% of variance
pca = PCA(0.99)
principal_components = pca.fit_transform(result_X)
principal_df = pd.DataFrame(data = principal_components)
print(principal_df.shape)

(48842, 50)


In [444]:
from sklearn.model_selection import train_test_split

(X_train, X_test,y_train, y_test) = train_test_split(principal_df, y, test_size=0.3, random_state=0)

In [445]:
model = LogisticRegression(random_state=0).fit(X_train, y_train)
predictions = model.predict(X_test)
predictions_proba = model.predict_proba(X_test)

In [446]:
model.score(X_test, y_test)

0.8512932505289019

Точность незначительно изменилась. Значит, нужно искать другие способы повышения точности (и обработки признаков)

 # SVM

In [455]:
from sklearn import svm

In [457]:
model = svm.SVC(random_state=0).fit(X_train, y_train)
predictions = model.predict(X_test)
#predictions_proba = model.predict_proba(X_test)

In [458]:
model.score(X_test, y_test)

0.8573670920630587

Видим, что на аналогичном датасете точность мало отличается. Нужны другие способы обработки данных, отбора признаков и тд