In [3]:
#Loading Libraries
import warnings
warnings.filterwarnings('ignore')

import datetime
import itertools
import os
import pathlib
import sklearn

import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats

from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFE, RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics as mt
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.base import BaseEstimator, TransformerMixin

np.random.seed(1)

In [4]:
# Header names

header_names = [
    'age',
    'class_worker',
    'det_ind_code',
    'det_occ_code',
    'education',
    'wage_per_hour',
    'hs_college',
    'marital_stat',
    'major_ind_code',
    'major_occ_code',
    'race',
    'hisp_origin',
    'sex',
    'union_member',
    'unemp_reason',
    'full_or_part_emp',
    'capital_gains',
    'capital_losses',
    'stock_dividends',
    'tax_filer_stat',
    'region_prev_res',
    'state_prev_res',
    'det_hh_fam_stat',
    'det_hh_summ',
    'instance_weight', ## this field is not used as a feature
    'mig_chg_msa',
    'mig_chg_reg',
    'mig_move_reg',
    'mig_same',
    'mig_prev_sunbelt',
    'num_emp',
    'fam_under_18',
    'country_father',
    'country_mother',
    'country_self',
    'citizenship',
    'own_or_self',
    'vet_question',
    'vet_benefits',
    'weeks_worked',
    'year',
    'income_50k',
]

In [5]:
# Load data
data_dir = os.path.join(pathlib.Path(os.getcwd()).parent, 'data')
df = pd.read_csv(os.path.join(data_dir, 'census-income.data.csv'), header=None, names=header_names)
df_test = pd.read_csv(os.path.join(data_dir, 'census-income.test.csv'), header=None, names=header_names)
df = pd.concat([df,df_test]) ## the test file is also labelled so they can be merged
df = df.drop(columns=['instance_weight']) ## not used for our analysis

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 299285 entries, 0 to 99761
Data columns (total 41 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   age               299285 non-null  int64 
 1   class_worker      299285 non-null  object
 2   det_ind_code      299285 non-null  int64 
 3   det_occ_code      299285 non-null  int64 
 4   education         299285 non-null  object
 5   wage_per_hour     299285 non-null  int64 
 6   hs_college        299285 non-null  object
 7   marital_stat      299285 non-null  object
 8   major_ind_code    299285 non-null  object
 9   major_occ_code    299285 non-null  object
 10  race              299285 non-null  object
 11  hisp_origin       299285 non-null  object
 12  sex               299285 non-null  object
 13  union_member      299285 non-null  object
 14  unemp_reason      299285 non-null  object
 15  full_or_part_emp  299285 non-null  object
 16  capital_gains     299285 non-null  int6

In [7]:
# Categorical features in the data, variable will be used for one-hot encoding
categorical_features = [
    'class_worker',
    'det_ind_code',
    'det_occ_code',
    'education',
    'hs_college',
    'marital_stat',
    'major_ind_code',
    'major_occ_code',
    'race',
    'hisp_origin',
    'sex',
    'union_member',
    'unemp_reason',
    'full_or_part_emp',
    'tax_filer_stat',
    'region_prev_res',
    'state_prev_res',
    'det_hh_fam_stat',
    'det_hh_summ',
    'mig_chg_msa',
    'mig_chg_reg',
    'mig_move_reg',
    'mig_same',
    'mig_prev_sunbelt',
    'fam_under_18',
    'country_father',
    'country_mother',
    'country_self',
    'citizenship',
    'own_or_self',
    'vet_question',
    'vet_benefits',
    'year',
]
df[categorical_features] = df[categorical_features].astype('category')

In [8]:
### Drop columns not used in modelling
df = df.drop(
    columns=[
        'region_prev_res',
        'state_prev_res',
        'det_hh_fam_stat',
        'det_hh_summ',
        'mig_chg_msa',
        'mig_chg_reg',
        'mig_move_reg',
        'mig_same',
        'mig_prev_sunbelt',
        'country_father',
        'country_mother',
        'country_self',
        'year',
    ]
)

In [9]:
def preprocess_pipeline(numeric_features, categorical_features):
    ### Scale numerical, one hot categorical

    numeric_transformer = Pipeline(
        steps=[
            #('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
        ]
    )
    categorical_transformer = Pipeline(
        steps=[
            #('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore')),
        ]
    )
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)]
    )
    preprocess_pipe = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
        ]
    )
    
    return preprocess_pipe

In [10]:
# This is section one of the data processing, will be using the same data as shown in the minilab
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 299285 entries, 0 to 99761
Data columns (total 28 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   age               299285 non-null  int64   
 1   class_worker      299285 non-null  category
 2   det_ind_code      299285 non-null  category
 3   det_occ_code      299285 non-null  category
 4   education         299285 non-null  category
 5   wage_per_hour     299285 non-null  int64   
 6   hs_college        299285 non-null  category
 7   marital_stat      299285 non-null  category
 8   major_ind_code    299285 non-null  category
 9   major_occ_code    299285 non-null  category
 10  race              299285 non-null  category
 11  hisp_origin       299285 non-null  category
 12  sex               299285 non-null  category
 13  union_member      299285 non-null  category
 14  unemp_reason      299285 non-null  category
 15  full_or_part_emp  299285 non-null  category
 16  cap

In [11]:
# Sampling the dataset
selection_df = df.sample(frac = .07)

In [12]:
selection_df.shape

(20950, 28)

In [13]:
numeric_features = selection_df.select_dtypes(include=['int64', 'float64']).columns
categorical_features_income = selection_df.select_dtypes(include=['object','bool', 'category']).drop(['income_50k'], axis=1).columns
X_selection_income = selection_df.drop('income_50k', axis=1)
y_selection_income = pd.DataFrame(selection_df['income_50k'])
preprocessor_income = preprocess_pipeline(numeric_features, categorical_features_income)
X_selection_preprocessed_income = preprocessor_income.fit_transform(X_selection_income)

In [14]:
higer_degrees = [
    ' Bachelors degree(BA AB BS)', 
    ' Masters degree(MA MS MEng MEd MSW MBA)', 
    ' Prof school degree (MD DDS DVM LLB JD)',
    ' Doctorate degree(PhD EdD)',
]
selection_df['graduated'] = 'no'
selection_df.loc[selection_df['education'].isin(higer_degrees), 'graduated'] = 'yes'
selection_df = selection_df.drop(['education', 'hs_college'], axis=1)

In [15]:
selection_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20950 entries, 67087 to 17653
Data columns (total 27 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   age               20950 non-null  int64   
 1   class_worker      20950 non-null  category
 2   det_ind_code      20950 non-null  category
 3   det_occ_code      20950 non-null  category
 4   wage_per_hour     20950 non-null  int64   
 5   marital_stat      20950 non-null  category
 6   major_ind_code    20950 non-null  category
 7   major_occ_code    20950 non-null  category
 8   race              20950 non-null  category
 9   hisp_origin       20950 non-null  category
 10  sex               20950 non-null  category
 11  union_member      20950 non-null  category
 12  unemp_reason      20950 non-null  category
 13  full_or_part_emp  20950 non-null  category
 14  capital_gains     20950 non-null  int64   
 15  capital_losses    20950 non-null  int64   
 16  stock_dividends   

In [16]:
categorical_features_grad = selection_df.select_dtypes(include=['object','bool', 'category']).drop(['graduated'], axis=1).columns
X_selection_grad = selection_df.drop(columns=['graduated'])
y_selection_grad = pd.DataFrame(selection_df['graduated'])
preprocessor_grad = preprocess_pipeline(numeric_features, categorical_features_grad)
X_selection_preprocessed_grad = preprocessor_grad.fit_transform(X_selection_grad)

In [17]:
tmp_df = pd.get_dummies(y_selection_income['income_50k'])

y_selection_income = pd.concat((y_selection_income,tmp_df),axis=1)

y_selection_income["above_50k"] = y_selection_income[" 50000+."]

y_selection_income=y_selection_income.drop(['income_50k', " - 50000.", " 50000+."], axis=1)

In [18]:
tmp_df = pd.get_dummies(y_selection_grad['graduated'])

y_selection_grad = pd.concat((y_selection_grad,tmp_df),axis=1)

y_selection_grad["graduated"] = y_selection_grad["yes"]

y_selection_grad=y_selection_grad.drop(['yes', "no"], axis=1)

In [19]:
### Make a list of all the columns after one hot encoding
ohe_income = preprocessor_income['preprocessor'].named_transformers_['cat']['onehot']
cat_processed_income = ohe_income.get_feature_names(X_selection_income[categorical_features_income].columns)
all_processed_cols_income = np.concatenate((numeric_features, cat_processed_income), axis=0)

In [20]:
ohe_grad = preprocessor_grad['preprocessor'].named_transformers_['cat']['onehot']
cat_processed_grad = ohe_grad.get_feature_names(X_selection_grad[categorical_features_grad].columns)
all_processed_cols_grad = np.concatenate((numeric_features, cat_processed_grad), axis=0)

In [31]:
selection_df.to_csv('selectionDF.csv')

In [24]:
from rpy2.robjects.packages import importr
from rpy2 import robjects as robj

In [27]:
%load_ext rmagic
%load_ext rpy2.ipython

arules = importr('arules')
arules_viz = importr('arulesViz')

The rmagic extension is already loaded. To reload it, use:
  %reload_ext rmagic


In [30]:
%R load('census-income.data.csv')
%R rules <- apriori(census-income.data.csv,parameter = list(minlen=2, supp=0.05, conf=0.8))
%R rules.sorted <- sort(rules, by="lift")
%R plot(rules.sorted, method="grouped")

print('Arules run now from R, test complete')


Error in readChar(con, 5L, useBytes = TRUE) : cannot open the connection

Error in .class1(object) : object 'census' not found

Error in sort(rules, by = "lift") : object 'rules' not found

Error in plot(rules.sorted, method = "grouped") : 
  object 'rules.sorted' not found
Arules run now from R, test complete
