In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the training data; the path is relative to the notebook's working directory.
df_train = pd.read_csv('./data/train_final.csv')
# Show the first rows as this cell's output.
df_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income>50K
0,53,Self-emp-not-inc,93449,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,1
1,33,Self-emp-not-inc,123424,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,1
2,47,Private,144844,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,0
3,40,Private,114580,HS-grad,9,Divorced,Craft-repair,Other-relative,White,Female,0,0,40,Vietnam,0
4,39,Private,115618,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,50,United-States,0


In [3]:
# Load the test data; the path is relative to the notebook's working directory.
df_test = pd.read_csv('./data/test_final.csv')
# Show the first rows as this cell's output.
df_test.head()

Unnamed: 0,ID,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
0,1,33,Self-emp-not-inc,222162,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States
1,2,68,?,29240,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,0,12,United-States
2,3,34,Private,103596,HS-grad,9,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,40,United-States
3,4,57,Private,103403,5th-6th,3,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States
4,5,48,Private,152915,Some-college,10,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States


In [4]:
def attr_contribution(df: pd.DataFrame, attr: str, label: str = 'income>50K'):
    """Map each value of a categorical column to the share of its rows in the second label class.

    Rows are counted per (attribute value, label value) pair. The two label
    classes are taken in sorted order and every attribute value is mapped to
    ``count(second class) / count(both classes)``. With a 0/1 label this is
    the fraction of rows labelled 1.

    Args:
        df: Frame containing both the ``attr`` and ``label`` columns.
        attr: Name of the categorical column to summarise.
        label: Name of the label column; it must hold exactly two distinct
            values among the counted rows.

    Returns:
        dict mapping every observed value of ``attr`` to a float share.
        Keys are inserted in order of increasing first-class share. An empty
        frame yields an empty dict.

    Raises:
        ValueError: if the label column does not have exactly two classes,
            because the share of "the second class" is then undefined.
    """
    if df.empty:
        return {}

    # Rows: attribute values, columns: label classes (both sorted); absent pairs count as 0.
    counts = df.groupby([attr, label], sort=True).size().unstack(fill_value=0)
    if counts.shape[1] != 2:
        raise ValueError(
            f"label column {label!r} must have exactly two classes, "
            f"found {counts.shape[1]}"
        )

    totals = counts.sum(axis=1)
    shares = pd.DataFrame({
        'first': counts.iloc[:, 0] / totals,
        'second': counts.iloc[:, 1] / totals,
    })
    # Order by the first class' share, matching the insertion order callers saw before.
    shares = shares.sort_values(by='first')
    return {value: float(share) for value, share in shares['second'].items()}


# Categorical columns that are later replaced by a numeric contribution.
categorical_attrs = [
    'workclass',
    'marital.status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native.country',
]

contributions = {}
for attr in categorical_attrs:
    contributions[attr] = attr_contribution(df_train, attr)

# Hack: give 'Holand-Netherlands' the same contribution as 'England'
# (presumably because that value has no usable entry from the training data; TODO confirm).
country_contrib = contributions['native.country']
country_contrib['Holand-Netherlands'] = country_contrib['England']

def pipeline(df: pd.DataFrame, attr_contributions: dict):
    """Replace categorical values by their numeric contributions and drop ``education``.

    The frame is modified in place and also returned, so callers that want to
    keep their original data should pass a copy.

    Args:
        df: Frame holding every column named in ``attr_contributions`` plus
            an ``education`` column.
        attr_contributions: ``{column name: {category value: contribution}}``.

    Returns:
        The same ``df`` object. Each listed column has its known values
        replaced by the matching contribution; values without an entry are
        left unchanged. A column whose values were all replaced ends up with
        a float dtype.

    Raises:
        KeyError: if a listed column or ``education`` is missing from ``df``.
    """
    for attr, contrib in attr_contributions.items():
        # One vectorised pass per column instead of one masked assignment per
        # category. Contributions are fractions, so the result is kept as
        # float and deliberately not cast to int (which would truncate to 0).
        df[attr] = df[attr].map(lambda value: contrib.get(value, value))

    df.drop(columns=['education'], inplace=True)
    return df

In [5]:
def get_accuracy(model, input: pd.DataFrame, ans: pd.Series):
    """Return the percentage (0-100) of ``model`` predictions on ``input`` that equal ``ans``."""
    predicted = model.predict(X=input)
    hits = sum(predicted == ans)
    return 100. * hits / len(ans)

In [6]:
def output_test(model, input: pd.DataFrame, file_name='tp.csv'):
    """Predict on ``input``, write the predictions to ``file_name`` as CSV, and return them.

    The result has two columns: ``ID`` (1-based row number) and ``Prediction``
    (the output of ``model.predict``). The CSV is written without the index.
    """
    row_ids = np.arange(1, len(input) + 1)
    predictions = model.predict(input)
    submission = pd.DataFrame({'ID': row_ids, 'Prediction': predictions})
    submission.to_csv(file_name, index=False)
    return submission

In [7]:
# Encode a copy of the training data so df_train itself stays untouched.
df = pipeline(df_train.copy(deep=True), contributions)

# Every column except the last is a feature; the last column is the label.
x = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Min-max scale the features; xmin/xmax are kept so the test set can reuse them.
xmin = x.min()
xmax = x.max()
x = (x - xmin) / (xmax - xmin)

In [8]:
# Fit a logistic regression with default settings on the scaled training features.
logistic_regression = LogisticRegression().fit(X=x, y=y)
# Accuracy (in percent) on the same data the model was fitted on.
get_accuracy(model=logistic_regression, input=x, ans=y)

84.476

In [9]:
# Pin random_state: LinearSVC's dual solver shuffles the data with it, so an
# unseeded fit can differ from run to run.
svc_model = LinearSVC(random_state=0)
svc_model.fit(X=x, y=y)
# Accuracy (in percent) on the same data the model was fitted on.
get_accuracy(svc_model, x, y)

84.736

In [19]:
# Encode a copy of the test data with the contributions computed from the training data,
# then skip the first column (presumably the ID column; TODO confirm).
df = pipeline(df_test.copy(deep=True), contributions).iloc[:, 1:]
# Scale with the training min/max (xmin, xmax), not with the test set's own statistics.
df = (df - xmin) / (xmax - xmin)

# Write one prediction file per model and keep the frames for comparison.
ans1 = output_test(model=logistic_regression, input=df, file_name='lr.csv')
ans2 = output_test(model=svc_model, input=df, file_name='svc.csv')

In [20]:
# Load a saved prediction file to compare against (presumably an earlier submission; TODO confirm).
submitted = pd.read_csv('./LinearSVC.csv')

In [21]:
# Fraction of rows where the logistic-regression predictions equal the loaded ones.
lr_matches = ans1['Prediction'] == submitted['Prediction']
sum(lr_matches) / len(ans1)

0.9773928361714621

In [22]:
# Fraction of rows where the LinearSVC predictions equal the loaded ones.
svc_matches = ans2['Prediction'] == submitted['Prediction']
sum(svc_matches) / len(ans2)

0.9885076755305763