In [43]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from matplotlib import pyplot as plt
import numpy as np
%matplotlib inline

In [44]:
from sklearn import preprocessing 
label = preprocessing.LabelEncoder() 
data = pd.read_csv('income_evaluation.csv', sep=',')
for y in data.columns:
    if(data[y].dtype == object):
        data[y]= label.fit_transform(data[y])
X = data
X = X.drop([' income'], axis='columns')
y = data[' income']
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   age              32561 non-null  int64
 1    workclass       32561 non-null  int32
 2    fnlwgt          32561 non-null  int64
 3    education       32561 non-null  int32
 4    education-num   32561 non-null  int64
 5    marital-status  32561 non-null  int32
 6    occupation      32561 non-null  int32
 7    relationship    32561 non-null  int32
 8    race            32561 non-null  int32
 9    sex             32561 non-null  int32
 10   capital-gain    32561 non-null  int64
 11   capital-loss    32561 non-null  int64
 12   hours-per-week  32561 non-null  int64
 13   native-country  32561 non-null  int32
dtypes: int32(8), int64(6)
memory usage: 2.5 MB


In [45]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [46]:
from sklearn.linear_model import Ridge
def ridge_regression(X_train, y_train, alpha):
    ridgereg = Ridge(alpha=alpha, normalize=True)
    ridgereg.fit(X_train, y_train)
    y_pred = ridgereg.predict(X_train)
    rss = sum((y_pred-y_train)**2)
    ret=[rss]
    ret.extend([ridgereg.intercept_])
    ret.extend(ridgereg.coef_)
    return ret

In [47]:
alpha_ridge = [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]
col = ['rss', 'intercept'] + ['coef_x_%d'%i for i in range(0, X_train.shape[1])]
ind = ['alpha_%.2g'%alpha_ridge[i] for i in range(0,10)]
coef_matrix_ridge = pd.DataFrame(index=ind, columns=col)
for i in range(10):
    coef_matrix_ridge.iloc[i,] = ridge_regression(X_train, y_train, alpha_ridge[i])

In [48]:
coef_matrix_ridge

Unnamed: 0,rss,intercept,coef_x_0,coef_x_1,coef_x_2,coef_x_3,coef_x_4,coef_x_5,coef_x_6,coef_x_7,coef_x_8,coef_x_9,coef_x_10,coef_x_11,coef_x_12,coef_x_13
alpha_1e-15,3289.759237,-0.598731,0.004748,-0.003074,0.0,-0.003327,0.047159,-0.023785,0.002042,-0.015461,0.014368,0.105948,1e-05,0.000111,0.003499,-1e-05
alpha_1e-10,3289.759237,-0.598731,0.004748,-0.003074,0.0,-0.003327,0.047159,-0.023785,0.002042,-0.015461,0.014368,0.105948,1e-05,0.000111,0.003499,-1e-05
alpha_1e-08,3289.759237,-0.598731,0.004748,-0.003074,0.0,-0.003327,0.047159,-0.023785,0.002042,-0.015461,0.014368,0.105948,1e-05,0.000111,0.003499,-1e-05
alpha_0.0001,3289.759244,-0.598661,0.004748,-0.003074,0.0,-0.003325,0.047153,-0.023784,0.002042,-0.015464,0.014367,0.105935,1e-05,0.000111,0.003499,-1e-05
alpha_0.001,3289.759937,-0.598029,0.004744,-0.003065,0.0,-0.003311,0.047107,-0.023779,0.002044,-0.01549,0.014361,0.105819,1e-05,0.000111,0.003497,-1e-05
alpha_0.01,3289.82775,-0.591793,0.004706,-0.002977,0.0,-0.003175,0.046644,-0.023724,0.002059,-0.015747,0.0143,0.104687,1e-05,0.00011,0.003485,-7e-06
alpha_1,3471.339408,-0.261788,0.002727,0.001151,0.0,0.001189,0.023965,-0.017464,0.002013,-0.017913,0.010383,0.061501,6e-06,6.4e-05,0.002457,0.000126
alpha_5,3945.641857,0.033027,0.001085,0.001544,-0.0,0.001074,0.008696,-0.007875,0.000997,-0.009003,0.004962,0.027387,2e-06,2.4e-05,0.001092,0.000101
alpha_10,4144.342627,0.120281,0.000623,0.001049,-0.0,0.000682,0.004884,-0.004647,0.0006,-0.00542,0.002983,0.016253,1e-06,1.4e-05,0.000643,6.5e-05
alpha_20,4283.735423,0.175253,0.000337,0.000619,-0.0,0.000388,0.002606,-0.002552,0.000333,-0.00301,0.001657,0.008965,1e-06,7e-06,0.000353,3.8e-05


In [49]:
from sklearn.linear_model import Lasso
def lasso_regression(X_train, y_train, alpha):
    lassoreg = Lasso(alpha=alpha, normalize=True, max_iter=1e5)
    lassoreg.fit(X_train, y_train)
    y_pred = lassoreg.predict(X_train)
    rss = sum((y_pred-y_train)**2)
    ret=[rss]
    ret.extend([lassoreg.intercept_])
    ret.extend(lassoreg.coef_)
    return ret

In [50]:
alpha_lasso = [1e-15, 1e-10, 1e-8, 1e-5, 1e-4, 1e-3, 1e-2, 1, 5, 10]
col = ['rss', 'intercept'] + ['coef_x_%d'%i for i in range(0, X_train.shape[1])]
ind = ['alpha_%.2g'%alpha_lasso[i] for i in range(0,10)]
coef_matrix_lasso = pd.DataFrame(index=ind, columns=col)
for i in range(10):
    coef_matrix_lasso.iloc[i,] = lasso_regression(X_train, y_train, alpha_lasso[i])

In [51]:
coef_matrix_lasso

Unnamed: 0,rss,intercept,coef_x_0,coef_x_1,coef_x_2,coef_x_3,coef_x_4,coef_x_5,coef_x_6,coef_x_7,coef_x_8,coef_x_9,coef_x_10,coef_x_11,coef_x_12,coef_x_13
alpha_1e-15,3289.759237,-0.598731,0.004748,-0.003074,0.0,-0.003327,0.047159,-0.023785,0.002042,-0.015461,0.014368,0.105948,1e-05,0.000111,0.003499,-1e-05
alpha_1e-10,3289.759237,-0.598731,0.004748,-0.003074,0.0,-0.003327,0.047159,-0.023785,0.002042,-0.015461,0.014368,0.105948,1e-05,0.000111,0.003499,-1e-05
alpha_1e-08,3289.759238,-0.598726,0.004748,-0.003073,0.0,-0.003326,0.047158,-0.023784,0.002041,-0.015461,0.014366,0.105947,1e-05,0.000111,0.003499,-1e-05
alpha_1e-05,3290.428585,-0.586334,0.004672,-0.001417,0.0,-0.002766,0.04647,-0.023169,0.001634,-0.015436,0.012676,0.104274,9e-06,0.000107,0.003418,-0.0
alpha_0.0001,3328.273434,-0.451664,0.004005,0.0,0.0,-0.0,0.041485,-0.018266,0.0,-0.015344,0.0,0.088301,8e-06,7.8e-05,0.002856,0.0
alpha_0.001,4464.695291,0.240827,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0
alpha_0.01,4464.695291,0.240827,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0
alpha_1,4464.695291,0.240827,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0
alpha_5,4464.695291,0.240827,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0
alpha_10,4464.695291,0.240827,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
kfold = KFold(n_splits=6, shuffle=True, random_state=10)
accuracy= []
C_list = (10**-5, 10**-4, 10**-3, 10**-2, 10**-1, 10**0, 10**1, 10**2, 10**3, 10**4, 10**5)

for i in range(0, len(C_list)):
    clf = LogisticRegression(C=C_list[i], random_state=10, max_iter=500) #внутри есть регуляризация
    current_score = cross_val_score(clf, X_train, y_train, cv=kfold).mean()
    accuracy.append(current_score)
    print('При С =', C_list[i], 'точность равна', current_score)
max_score = 0
for i in range(0, len(accuracy)):
    if max_score < accuracy[i]:
        max_score = accuracy[i]
        C_for_max_score = i
print('\n')
print('Максимальная точность, равная', max_score, ', получается при C, равном', C_list[C_for_max_score])

При С = 1e-05 точность равна 0.7961506961506962
При С = 0.0001 точность равна 0.7922604422604423
При С = 0.001 точность равна 0.7896805896805897
При С = 0.01 точность равна 0.7915233415233415
При С = 0.1 точность равна 0.7884520884520884
При С = 1 точность равна 0.7881654381654383
При С = 10 точность равна 0.7885749385749387
При С = 100 точность равна 0.7881244881244882
При С = 1000 точность равна 0.7882473382473383
При С = 10000 точность равна 0.7900491400491401
При С = 100000 точность равна 0.7880835380835381


Максимальная точность, равная 0.7961506961506962 , получается при C, равном 1e-05


In [53]:
clf = LogisticRegression(C=C_list[C_for_max_score], random_state=10, max_iter=500)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.7911804446628178

In [54]:
X = X.drop([' fnlwgt', ' capital-gain', ' capital-loss', ' native-country'], axis='columns')

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [56]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.8000245670065103