# Problem 2:
1) Prepare a classification model using SVM for salary data 

## 1. Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.svm import SVC
from sklearn.decomposition import PCA

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, plot_confusion_matrix, classification_report, accuracy_score, roc_auc_score, roc_curve

In [2]:
%matplotlib notebook

## 2. Load data

In [3]:
salary_df = pd.read_csv('SalaryData_Train(1).csv')

In [4]:
salary_df.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## 3. EDA
### 3.1 Data understanding 

In [5]:
salary_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30161 entries, 0 to 30160
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   age            30161 non-null  int64 
 1   workclass      30161 non-null  object
 2   education      30161 non-null  object
 3   educationno    30161 non-null  int64 
 4   maritalstatus  30161 non-null  object
 5   occupation     30161 non-null  object
 6   relationship   30161 non-null  object
 7   race           30161 non-null  object
 8   sex            30161 non-null  object
 9   capitalgain    30161 non-null  int64 
 10  capitalloss    30161 non-null  int64 
 11  hoursperweek   30161 non-null  int64 
 12  native         30161 non-null  object
 13  Salary         30161 non-null  object
dtypes: int64(5), object(9)
memory usage: 3.2+ MB


In [6]:
salary_df['education'].unique()

array([' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',
       ' Some-college', ' Assoc-acdm', ' 7th-8th', ' Doctorate',
       ' Assoc-voc', ' Prof-school', ' 5th-6th', ' 10th', ' Preschool',
       ' 12th', ' 1st-4th'], dtype=object)

In [7]:
len(salary_df['education'].unique())

16

In [8]:
salary_df['educationno'].unique() # label/ ordinal encoding of 'education' column - therefore, remove.

array([13,  9,  7, 14,  5, 10, 12,  4, 16, 11, 15,  3,  6,  1,  8,  2],
      dtype=int64)

### 3.2 Separating data into features and target.

In [9]:
# Extracting column names and sorting them to appropriate categories.
def column_segregator(df, y_name=None):
    """ Returns  three lists of column headers for feature columns, 
    numeric columns and categorical columns
    Input
    ------
    df: Dataframe
    y_name: default None. Name(str) of target column if available
    
    Output
    ------
    features, numeric_cols, cat_cols"""   
    
    cols = df.columns # List of all columns in the input dataframe.
    numeric_cols = [col for col in cols if (df[col].dtypes != 'object') and col != y_name]
    cat_cols = [col for col in cols if (df[col].dtypes == 'object') and col != y_name]
    features = [col for col in cols if col != y_name]
    
    return features, numeric_cols, cat_cols 

In [10]:
def Xy_split(df, y_name=None, y_col=True):
    """Splits the input dataframe into features and target
    input
    -----
    df: Input dataframe
    y_name: default None. Name(str) of target column if available
    y_col: 'True' if y column is present in input dataframe, else 'False'.
    
    output
    ------
    X (features) , y (target) if y colum is present else only X"""
    
    target = y_name
    feature_col,_,_ = column_segregator(df, target)
    if y_col == True:
        # separating features and target.
        X = df.loc[:, feature_col]
        y = df.loc[:, target]
        return X,y
    else:
        X = df.loc[:, feature_col]
        return X

In [11]:
# Column segregation.
features, numeric_cols, cat_cols = column_segregator(salary_df, y_name='Salary')

In [12]:
numeric_cols.remove('educationno')
features.remove('educationno')

In [13]:
len(numeric_cols), len(cat_cols)

(4, 8)

In [14]:
X, y = Xy_split(salary_df, y_name='Salary', y_col=True)

In [15]:
numeric_cols

['age', 'capitalgain', 'capitalloss', 'hoursperweek']

In [16]:
cat_cols

['workclass',
 'education',
 'maritalstatus',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native']

In [17]:
features

['age',
 'workclass',
 'education',
 'maritalstatus',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capitalgain',
 'capitalloss',
 'hoursperweek',
 'native']

### 3.3 Visualizations

#### 3.3.1 Feature distribution - Numeric features:

In [18]:
fig, axes = plt.subplots(2,2, figsize=(9,6))
axes = axes.flatten()

for idx, ax in enumerate(axes):
    sns.histplot(data=X, x=X[numeric_cols[idx]], ax=ax)
    
fig.suptitle('Feature distribution - Numeric features', ha='center', fontweight='bold')
fig.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

### Observations:
- 13 features, 1 target. There is a mix of numerical and categorical features.
- No null values. All features recorded with correct datatypes.
- Target needs to be segregated into 0 and 1 class.
 - salary <=50K - 0 class.
 - salary >50K - 1 class.
- The feaures are not distributed normally.

#### 3.3.2 Feature distribution - categorical features

In [19]:
fig, ax = plt.subplots(figsize=(10,4))
sns.countplot(data=X, x=X[cat_cols[0]], ax=ax) 
plt.show()

<IPython.core.display.Javascript object>

In [20]:
fig, ax = plt.subplots(figsize=(8,4))
sns.countplot(data=X, y=X[cat_cols[1]], ax=ax) 
plt.show()

<IPython.core.display.Javascript object>

In [21]:
fig, ax = plt.subplots(figsize=(8,4))
sns.countplot(data=X, y=X[cat_cols[2]], ax=ax) 
plt.show()

<IPython.core.display.Javascript object>

In [22]:
fig, ax = plt.subplots(figsize=(8,4))
sns.countplot(data=X, y=X[cat_cols[3]], ax=ax) 
plt.show()

<IPython.core.display.Javascript object>

In [23]:
fig, ax = plt.subplots(figsize=(8,4))
sns.countplot(data=X, x=X[cat_cols[4]], ax=ax) 
plt.show()

<IPython.core.display.Javascript object>

In [24]:
fig, ax = plt.subplots(figsize=(10,4))
sns.countplot(data=X, x=X[cat_cols[5]], ax=ax) 
plt.show()

<IPython.core.display.Javascript object>

In [25]:
fig, ax = plt.subplots(figsize=(4,4))
sns.countplot(data=X, x=X[cat_cols[6]], ax=ax) 
plt.show()

<IPython.core.display.Javascript object>

In [26]:
fig, ax = plt.subplots(figsize=(8,10))
sns.countplot(data=X, y=X[cat_cols[7]], ax=ax) 
plt.show()

<IPython.core.display.Javascript object>

#### 3.3.3 Target distribution 

In [27]:
fig, ax = plt.subplots(figsize=(4,4))
sns.countplot(data=X, x=y, ax=ax) 
plt.show()

<IPython.core.display.Javascript object>

In [28]:
enc = OneHotEncoder(sparse=False)
X_cat_enc = pd.DataFrame(enc.fit_transform(X[cat_cols]), index=X.index)

In [29]:
X_prep = pd.concat([X[numeric_cols], X_cat_enc], axis=1)

In [30]:
X_prep.head()

Unnamed: 0,age,capitalgain,capitalloss,hoursperweek,0,1,2,3,4,5,...,87,88,89,90,91,92,93,94,95,96
0,39,2174,0,40,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,50,0,0,13,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,38,0,0,40,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,53,0,0,40,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,28,0,0,40,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
X_prep.shape

(30161, 101)

In [32]:
y.unique()

array([' <=50K', ' >50K'], dtype=object)

In [33]:
## Converting target label to 0 and 1

In [34]:
y_prep = y.apply(lambda x:1 if(x==' >50K') else 0)

In [35]:
y_prep.unique()

array([0, 1], dtype=int64)

In [36]:
fig, ax = plt.subplots(figsize=(4,4))
sns.countplot(data=X, x=y_prep, ax=ax) 
plt.show()

<IPython.core.display.Javascript object>

In [37]:
### 3.3.4 Heatmap to find correlations
df_heatmap = pd.concat([X_prep, y_prep], axis=1)
fig, ax = plt.subplots(figsize=(13,8))
sns.heatmap(df_heatmap.corr(), ax=ax, annot=False)
plt.show()

<IPython.core.display.Javascript object>

In [38]:
def cluster_visualization(df_sc, labels):
    # PCA - N-D to 2D for scatterplot.
    pca = PCA(2)
    df_pca = pca.fit_transform(df_sc)
    u_labels = np.unique(labels)
    
    # Plotting
    fig, ax = plt.subplots()
    for i in u_labels:
        ax.scatter(df_pca[labels == i, 0], df_pca[labels == i, 1], label=i)
        
    ax.legend()    

In [39]:
scaler = MinMaxScaler()#StandardScaler()#
X_prep_sc = pd.DataFrame(scaler.fit_transform(X_prep), columns=X_prep.columns)



In [40]:
cluster_visualization(X_prep_sc, y_prep)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('output categories - clustered')
plt.show()



<IPython.core.display.Javascript object>

## 4. Model Building:

### 4.1 Baseline model with cross validation

In [41]:
# Data preprocessing
categorical_transformer = Pipeline(steps=[
    ('cat_enc', OneHotEncoder())
])
numeric_transformer = Pipeline(steps=[
    ('scaler', MinMaxScaler())
])

In [42]:
cat_cols

['workclass',
 'education',
 'maritalstatus',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native']

In [43]:
numeric_cols

['age', 'capitalgain', 'capitalloss', 'hoursperweek']

In [44]:
preprocessor = ColumnTransformer(transformers=[
    ('cat_trf', categorical_transformer, cat_cols),
    ('num_trf', numeric_transformer, numeric_cols)   
], remainder='drop')

In [45]:
sv_clf = SVC()

In [46]:
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', sv_clf)
])

In [47]:
def cross_validator(clf, Xdf, ydf, cv=5):
    # K fold cross validation:
    crv_scores_acc = cross_val_score(estimator=clf, X=Xdf, y=ydf, cv=cv, scoring='accuracy')
    crv_scores_prec = cross_val_score(estimator=clf, X=Xdf, y=ydf, cv=cv, scoring='precision')
    crv_scores_rec = cross_val_score(estimator=clf, X=Xdf, y=ydf, cv=cv, scoring='recall')
    
    vals = [round(crv_scores_acc.mean(),4),
            round(crv_scores_prec.mean(),4),
            round(crv_scores_rec.mean(),4)
           ]
    return vals

In [48]:
def model_summary(vals):
    # Summarizing scores:
    print("mean accuracy            :{} ".format(vals[0]))
    print("mean precision(1 class)  :{} ".format(vals[1]))
    print("mean recall(1 class)     :{} ".format(vals[2]))

In [None]:
baseline_scores = cross_validator(clf=clf,
                                  Xdf=X,
                                  ydf=y_prep,
                                  cv=5)

In [None]:
scores = {}

In [None]:
scores['baseline'] = baseline_scores

In [None]:
model_summary(scores['baseline'])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y_prep,
                                                    test_size=0.2,
                                                    stratify=y_prep,
                                                    random_state=42)

In [None]:
clf.fit(X_train, y_train)

In [None]:
y_pred_bl = clf.predict(X_test)

In [None]:
def display_results(y_test, y_pred, clf):
    """Displays model evaluation/performance report that includes
    accuracy_score, confusion_matrix, precision_score, and 
    recall_score.
    input
    -----
    y_test, y_pred
    
    output
    ------
    Model evaluation/performance report"""
    print(classification_report(y_test, y_pred))
    #print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
    fig, ax = plt.subplots()
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred, labels=clf.classes_, ax=ax)

In [None]:
display_results(y_test, y_pred_bl, clf=clf)

### 4.2 Gridsearch for best params

Note: Since the search time will be large, we may try gridsearch with test data since it is a stratified sample of the original dataset.

In [None]:
#clf_gs.get_params().keys()

In [None]:
X_prep_sc.head()

In [None]:
X_t, X_ts, y_t, y_ts = train_test_split(X_prep_sc, y_prep, test_size=0.1, stratify=y_prep)

In [None]:
X_t.shape, X_ts.shape, y_t.shape, y_ts.shape

In [None]:
sv_clf_gs = SVC()

In [None]:
# parameters = {'classifier__kernel':['linear', 'poly', 'rbf', 'sigmoid'],
#                'classifier__gamma':[0.001, 0.01, 0.1, 0.5, 0.8, 1, 5, 10, 20, 50],
#                'classifier__C':[0.001, 0.01, 0.1, 0.5, 0.8, 1, 5, 10, 20, 50]
#              }

# parameters = {'classifier__kernel':['poly'],
#                'classifier__gamma':[0.01],
#                'classifier__C':[0.01,]
#              }

parameters = {'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
               'gamma':[0.001, 0.01, 0.1, 0.5, 0.8, 1, 5, 10, 20, 50],
               'C':[0.001, 0.01, 0.1, 0.5, 0.8, 1, 5, 10, 20, 50]
             }


In [None]:
grid_search = GridSearchCV(estimator=sv_clf_gs,
    param_grid=parameters,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5)

In [None]:
%%time
grid_search.fit(X_ts, y_ts)

In [None]:
grid_search.best_params_ 

In [None]:
grid_search.best_score_ 

In [None]:
sv_clf_g1 = SVC(C=10,
    kernel='linear',
    gamma=0.001)

clf_gs1 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', sv_clf_g1)
])

In [None]:
clf_gs1.fit(X_train, y_train)

In [None]:
y_pred_gl = clf_gs1.predict(X_test)

In [None]:
display_results(y_test, y_pred_gl, clf=clf_gs1)