# Классификация: Логистическая регрессия и SVM

Имеются данные adult.csv.
Целевой переменной является уровень дохода income (крайний правый столбец; в этом файле он называется class).
Описание признаков можно найти по ссылке www.cs.toronto.edu...etail.html
Вам необходимо построить модели логистической регрессии и SVM, которые предсказывают уровень дохода человека.
Вывести качество полученных моделей на тестовой выборке, используя функцию score у модели.

In [122]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder

In [123]:
# Load the comma-separated dataset and preview its first rows.
data = pd.read_csv("adult_csv.csv", sep=",")
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country,class
0,2,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,1,0,2,United-States,<=50K
1,3,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,0,United-States,<=50K
2,2,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,2,United-States,<=50K
3,3,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,2,United-States,<=50K
4,1,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,2,Cuba,<=50K


In [124]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
age               48842 non-null int64
workclass         46043 non-null object
fnlwgt            48842 non-null int64
education         48842 non-null object
education-num     48842 non-null int64
marital-status    48842 non-null object
occupation        46033 non-null object
relationship      48842 non-null object
race              48842 non-null object
sex               48842 non-null object
capitalgain       48842 non-null int64
capitalloss       48842 non-null int64
hoursperweek      48842 non-null int64
native-country    47985 non-null object
class             48842 non-null object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [125]:
# Preview the rows where 'workclass' is missing.
workclass_missing = pd.isnull(data["workclass"])
data.loc[workclass_missing].head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country,class
27,3,,180211,Some-college,10,Married-civ-spouse,,Husband,Asian-Pac-Islander,Male,0,0,3,South,>50K
61,1,,293936,7th-8th,4,Married-spouse-absent,,Not-in-family,White,Male,0,0,2,,<=50K
69,0,,200681,Some-college,10,Never-married,,Own-child,White,Male,0,0,2,United-States,<=50K
77,4,,212759,10th,6,Married-civ-spouse,,Husband,White,Male,0,0,0,United-States,<=50K
106,0,,304873,10th,6,Never-married,,Own-child,White,Female,4,0,1,United-States,<=50K


In [126]:
# Preview the rows where 'occupation' is missing.
occupation_missing = pd.isnull(data["occupation"])
data.loc[occupation_missing].head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country,class
27,3,,180211,Some-college,10,Married-civ-spouse,,Husband,Asian-Pac-Islander,Male,0,0,3,South,>50K
61,1,,293936,7th-8th,4,Married-spouse-absent,,Not-in-family,White,Male,0,0,2,,<=50K
69,0,,200681,Some-college,10,Never-married,,Own-child,White,Male,0,0,2,United-States,<=50K
77,4,,212759,10th,6,Married-civ-spouse,,Husband,White,Male,0,0,0,United-States,<=50K
106,0,,304873,10th,6,Never-married,,Own-child,White,Female,4,0,1,United-States,<=50K


In [127]:
# Preview the rows where 'native-country' is missing.
country_missing = pd.isnull(data["native-country"])
data.loc[country_missing].head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country,class
14,2,Private,121772,Assoc-voc,11,Married-civ-spouse,Craft-repair,Husband,Asian-Pac-Islander,Male,0,0,2,,>50K
38,1,Private,84154,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,2,,>50K
51,0,Private,226956,HS-grad,9,Never-married,Other-service,Own-child,White,Female,0,0,1,,<=50K
61,1,,293936,7th-8th,4,Married-spouse-absent,,Not-in-family,White,Male,0,0,2,,<=50K
93,1,Private,117747,HS-grad,9,Married-civ-spouse,Sales,Wife,Asian-Pac-Islander,Female,0,1,1,,<=50K


In [128]:
# Keep only the rows that have a value in each of the three columns
# that contain missing entries; the original row index is preserved.
data = data.dropna(subset=["workclass", "occupation", "native-country"])

In [129]:
# Re-check the frame after filtering: every column should now be fully non-null.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45222 entries, 0 to 48841
Data columns (total 15 columns):
age               45222 non-null int64
workclass         45222 non-null object
fnlwgt            45222 non-null int64
education         45222 non-null object
education-num     45222 non-null int64
marital-status    45222 non-null object
occupation        45222 non-null object
relationship      45222 non-null object
race              45222 non-null object
sex               45222 non-null object
capitalgain       45222 non-null int64
capitalloss       45222 non-null int64
hoursperweek      45222 non-null int64
native-country    45222 non-null object
class             45222 non-null object
dtypes: int64(6), object(9)
memory usage: 5.5+ MB


### Здесь я взяла на себя смелость предположить, что зарплата может зависеть от возраста, образования, профессии, расы и пола (к большому сожалению) и, возможно, от страны

In [130]:
# First feature set: 'age' plus five categorical columns.
categorical_columns = ['education', 'occupation', 'sex', 'race', 'native-country']
selectedColumns = data[['age', 'education', 'occupation', 'race', 'sex', 'native-country']]
# One-hot encode the categorical columns; 'age' is passed through unchanged.
X = pd.get_dummies(selectedColumns, columns=categorical_columns)

In [134]:
# Fit a label encoder on the target column; fit() returns the encoder itself.
le = LabelEncoder().fit(data['class'])
# Show the learned class labels; a label's position is its encoded integer.
le.classes_

array(['<=50K', '>50K'], dtype=object)

In [135]:
# Encode the target column as integers with the fitted LabelEncoder.
# Reuse data's index instead of a fresh RangeIndex: rows with missing values
# were dropped from data, so its index has gaps, and the feature frames built
# from data inherit that index. Sharing it keeps y label-aligned with them in
# any index-based pandas operation. Values and row order are unchanged, so a
# positional train/test split gives the same result as before.
y = pd.Series(data=le.transform(data['class']), index=data.index)
y.head()

0    0
1    0
2    0
3    0
4    0
dtype: int32

In [136]:
# Preview the first rows of the one-hot encoded feature matrix.
X.head()

Unnamed: 0,age,education_10th,education_11th,education_12th,education_1st-4th,education_5th-6th,education_7th-8th,education_9th,education_Assoc-acdm,education_Assoc-voc,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,3,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [137]:
# Pipeline variant of the model: StandardScaler() followed by LogisticRegression.
# (The notebook compares LogisticRegression with and without StandardScaler().)
model = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000),
)

# Следующий вариант — без StandardScaler()

In [142]:
# Rebinds `model` to a bare estimator, i.e. the variant without a StandardScaler() step.
model = LogisticRegression() # use scikit-learn's logistic regression as the model

In [138]:
# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [143]:
# Train on the training part of the dataset, then predict labels for the test part.
# fit() returns the fitted estimator, so the two calls can be chained.
predictions = model.fit(X_train, y_train).predict(X_test)



In [144]:
model.score(X_train, y_train)  # Train accuracy: 0.7999368188279893 with StandardScaler(), 0.8000315905860054 without

0.8000315905860054

In [145]:
model.score(X_test,y_test) # Test accuracy: 0.8037148964398909 with StandardScaler(), 0.8045256873295497 without

0.8045256873295497

In [57]:
# SVC wrapped in a pipeline together with StandardScaler().
clf = make_pipeline(
    StandardScaler(),
    SVC(gamma='auto'),
)
# Fit on the training split; as the last expression it also displays the fitted pipeline.
clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svc',
                 SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                     decision_function_shape='ovr', degree=3, gamma='auto',
                     kernel='rbf', max_iter=-1, probability=False,
                     random_state=None, shrinking=True, tol=0.001,
                     verbose=False))],
         verbose=False)

In [58]:
clf.score(X_train, y_train)  # Train accuracy: 0.8016427104722793

0.8016427104722793

In [59]:
clf.score(X_test, y_test)    # Test accuracy: 0.8003980246185597

0.8003980246185597

# Изменю выбранные признаки

К моему большому удивлению, после добавления признака 'marital-status' качество увеличилось. Я пробовала взять все те же переменные, но без пола и расы, однако качество значительно упало (0.7630390143737167).
Попробовала несколько вариантов и остановилась на оптимальном. Возможно, можно добиться и лучшего результата, но уменьшить количество признаков для get_dummies не вышло.

In [146]:
# Second feature set: every selected column is categorical, so all of them
# are one-hot encoded.
feature_names = ['education', 'marital-status', 'race', 'occupation', 'native-country']
selectedColumnsElse = data[feature_names]
X_1 = pd.get_dummies(selectedColumnsElse, columns=feature_names)

# Hold out 30% of the rows for testing, with a fixed seed for reproducibility.
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
    X_1,
    y,
    test_size=0.3,
    random_state=42,
)

# Bare LogisticRegression, because (per the author) it was better this way.
model_1 = LogisticRegression()
# fit() returns the fitted estimator, so training and prediction are chained.
predictions = model_1.fit(X_train_1, y_train_1).predict(X_test_1)



In [147]:
model_1.score(X_train_1, y_train_1)     # Train score: 0.8252724688042963

0.8252724688042963

In [148]:
model_1.score(X_test_1,y_test_1) # Test score: 0.8251640008844991

0.8251640008844991

In [119]:
# SVC wrapped in a pipeline together with StandardScaler(), for the second feature set.
clf_1 = make_pipeline(
    StandardScaler(),
    SVC(gamma='auto'),
)
# Fit on the training split; as the last expression it also displays the fitted pipeline.
clf_1.fit(X_train_1, y_train_1)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svc',
                 SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                     decision_function_shape='ovr', degree=3, gamma='auto',
                     kernel='rbf', max_iter=-1, probability=False,
                     random_state=None, shrinking=True, tol=0.001,
                     verbose=False))],
         verbose=False)

In [120]:
clf_1.score(X_train_1, y_train_1)   # Train score: 0.8268519981045649

0.8268519981045649

In [121]:
clf_1.score(X_test_1, y_test_1)     # Test score: 0.8240583769440555

0.8240583769440555