In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Column names for the census-income CSV files. The files are read with
# header=None, so these names are assigned to the columns purely by position:
# the order of this list must match the column order of the files.
header_names = [
    'age',
    'class_worker',
    'det_ind_code',
    'det_occ_code',
    'education',
    'wage_per_hour',
    'hs_college',
    'marital_stat',
    'major_ind_code',
    'major_occ_code',
    'race',
    'hisp_origin',
    'sex',
    'union_member',
    'unemp_reason',
    'full_or_part_emp',
    'capital_gains',
    'capital_losses',
    'stock_dividends',
    'tax_filer_stat',
    'region_prev_res',
    'state_prev_res',
    'det_hh_fam_stat',
    'det_hh_summ',
    'instance_weight', # not used as a model feature
    'mig_chg_msa',
    'mig_chg_reg',
    'mig_move_reg',
    'mig_same',
    'mig_prev_sunbelt',
    'num_emp',
    'fam_under_18',
    'country_father',
    'country_mother',
    'country_self',
    'citizenship',
    'own_or_self',
    'vet_question',
    'vet_benefits',
    'weeks_worked',
    'year',
    'income_50k',  # classification target used later in the notebook
]



In [None]:
# Neither file has a header row, so the column names are supplied explicitly.
df1 = pd.read_csv("../input/ml1-project/census-income.data.csv", header=None, names=header_names)
df2 = pd.read_csv("../input/testset/census-income.test.csv", header=None, names=header_names)

# The test file is labeled too, so it is stacked under the training file.
# - ignore_index=True gives every row a unique label (otherwise both files
#   contribute rows labeled 0..n and the index contains duplicates).
# - DataFrame.drop returns a new frame, so its result must be assigned for
#   'instance_weight' to actually be removed from df.
df = (
    pd.concat([df1, df2], ignore_index=True)
    .drop(columns=['instance_weight'])
)

df.head()

In [None]:
## Create a new variable for classification based on whether the person
## received a college degree.
higer_degrees = [
    ' Bachelors degree(BA AB BS)',
    ' Masters degree(MA MS MEng MEd MSW MBA)',
    ' Prof school degree (MD DDS DVM LLB JD)',
    ' Doctorate degree(PhD EdD)',
]

# 'yes' where the education level is one of the degrees above, 'no' otherwise.
holds_degree = df['education'].isin(higer_degrees)
df['graduated'] = np.where(holds_degree, 'yes', 'no')

In [None]:
# A notebook cell renders only its last expression, so the shape and the
# column list are printed explicitly; the preview stays last to keep the
# rich DataFrame rendering.
print(df.shape)
print(list(df.columns))
df.head()

In [None]:
# Age against capital gains, coloured by the income_50k label.
sns.scatterplot(
    x="capital_gains",
    y="age",
    hue="income_50k",
    alpha=0.4,
    data=df,
)

In [None]:
# Age against capital losses, coloured by the income_50k label.
sns.scatterplot(
    x="capital_losses",
    y="age",
    hue="income_50k",
    alpha=0.4,
    data=df,
)

In [None]:
# Age against stock dividends, coloured by the income_50k label.
sns.scatterplot(
    x="stock_dividends",
    y="age",
    hue="income_50k",
    alpha=0.4,
    data=df,
)

In [None]:
# Columns carried forward into modelling; the last entry is the target label.
cols_to_keep = [
    'age', 'education', 'race', 'sex',
    'capital_gains', 'capital_losses', 'stock_dividends',
    'tax_filer_stat', 'det_hh_summ', 'own_or_self',
    'vet_benefits', 'weeks_worked',
    'income_50k',
]

# Narrow the full frame down to just those columns.
df_trunc = df[cols_to_keep]

df_trunc.head()

In [None]:
# Columns that are expanded into indicator (one-hot) variables further down.
ind_cols = [
    'education',
    'race',
    'sex',
    'tax_filer_stat',
    'det_hh_summ',
]

# Preview just those columns.
df_trunc[ind_cols].head()


In [None]:
# One-hot encode the categorical columns listed in `ind_cols`.
tmp_df = pd.get_dummies(df_trunc[ind_cols])

# Replace the raw categorical columns with their indicator columns.
# `df_trunc` itself is deliberately not reassigned: the cell only reads from
# it, so it can be re-run without a KeyError for already-dropped columns.
# `ind_cols` is reused instead of repeating the column list.
df_trunc_ind = pd.concat([df_trunc.drop(columns=ind_cols), tmp_df], axis=1)

list(df_trunc_ind.columns)

In [None]:
# Cast the code-like columns and the target to pandas' category dtype in one go.
category_cols = ["own_or_self", "vet_benefits", "weeks_worked", "income_50k"]
df_trunc_ind = df_trunc_ind.astype({name: 'category' for name in category_cols})

df_trunc_ind.info()

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the income label; the label is the target.
X = df_trunc_ind.drop(columns=['income_50k'])
y = df_trunc_ind['income_50k']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler


# TODO: fold this step into a Pipeline / ColumnTransformer.

# Monetary columns to standardise (zero mean, unit variance).
cols_to_scale = ['capital_gains', 'capital_losses', 'stock_dividends']

# Fit the scaler on the training rows only, so the mean/variance used for
# scaling never see the held-out rows (avoids leaking test-set statistics).
scaler = StandardScaler().fit(X_train[cols_to_scale].values)

# Work on a copy so df_trunc_ind keeps the unscaled values.
df_scaled = df_trunc_ind.copy()

# Apply the train-fitted scaling to every row of the copy.
features = scaler.transform(df_scaled[cols_to_scale].values)

df_scaled[cols_to_scale] = features

df_scaled.info()





In [None]:
# End-to-end preprocessing + classification pipeline for the census data.
# The names below are imported here because no earlier cell imports them;
# ColumnTransformer, StandardScaler and train_test_split come from earlier cells.
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Numeric columns: fill gaps with the median, then standardise.
numeric_features = ['age', 'capital_gains', 'capital_losses', 'stock_dividends', 'weeks_worked']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical columns: fill gaps with a sentinel level, then one-hot encode.
# handle_unknown='ignore' keeps prediction from failing on unseen levels.
code_features = ['own_or_self', 'vet_benefits']
categorical_features = ['education', 'race', 'sex', 'tax_filer_stat', 'det_hh_summ'] + code_features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(max_iter=1000))])

# The pipeline does its own encoding, so it is fed the raw (un-encoded)
# columns from `df` rather than the already one-hot-encoded X. The code
# columns are cast to str so every categorical column has a single,
# consistent type for the 'missing' fill value and the encoder.
X_raw = df[numeric_features + categorical_features].astype({name: str for name in code_features})
y_raw = df['income_50k']

# Separate variable names are used so the X_train / X_test split that later
# cells rely on is not replaced by frames containing raw string columns.
X_raw_train, X_raw_test, y_raw_train, y_raw_test = train_test_split(
    X_raw, y_raw, test_size=0.2, random_state=0)

clf.fit(X_raw_train, y_raw_train)
print("model score: %.3f" % clf.score(X_raw_test, y_raw_test))

In [None]:
# Distinct values of the target label.
pd.unique(y)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# fit() returns the estimator itself, so construction and training are chained.
# The very high max_iter is presumably there so the solver can run to convergence.
lr = LogisticRegression(max_iter=100000).fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred = lr.predict(X_test)


In [None]:
from sklearn.metrics import accuracy_score

# Score the fitted logistic regression on the held-out split.
test_predictions = lr.predict(X_test)
acc = accuracy_score(y_test, test_predictions)

message = "{0:.1%} accuracy on test set.".format(acc)
print(message)
# print(dict(zip(X.columns, abs(lr.coef_[0]).round(2))))

In [None]:
# from sklearn.svm import SVC
# 
# svc = SVC() #MAKE SURE TO ADD KERNEL
# 
# # Fit the model to the training data
# svc.fit(X_train, y_train)
# 
# # Calculate accuracy scores on both train and test data
# accuracy_train = accuracy_score(y_train, svc.predict(X_train))
# accuracy_test = accuracy_score(y_test, svc.predict(X_test))
# 
# print("{0:.1%} accuracy on test set vs. {1:.1%} on training set".format(accuracy_test, accuracy_train))

In [None]:
# from sklearn.feature_selection import RFE
# # Create the RFE with a LogisticRegression estimator and 10 features to select
# rfe = RFE(estimator=LogisticRegression(), n_features_to_select=10, verbose=1)
# 
# # Fits the eliminator to the data
# rfe.fit(X_train, y_train)
# 
# # Print the features and their ranking (high = dropped early on)
# print(dict(zip(X.columns, rfe.ranking_)))
# 
# # Print the features that are not eliminated
# print(X.columns[rfe.support_])
# 
# # Calculates the test set accuracy
# acc = accuracy_score(y_test, rfe.predict(X_test))
# print("{0:.1%} accuracy on test set.".format(acc)) 