In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix


In [2]:
data = pd.read_csv("/content/income_evaluation.csv")

In [3]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
data.shape

(32561, 15)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              32561 non-null  int64 
 1    workclass       32561 non-null  object
 2    fnlwgt          32561 non-null  int64 
 3    education       32561 non-null  object
 4    education-num   32561 non-null  int64 
 5    marital-status  32561 non-null  object
 6    occupation      32561 non-null  object
 7    relationship    32561 non-null  object
 8    race            32561 non-null  object
 9    sex             32561 non-null  object
 10   capital-gain    32561 non-null  int64 
 11   capital-loss    32561 non-null  int64 
 12   hours-per-week  32561 non-null  int64 
 13   native-country  32561 non-null  object
 14   income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [6]:
data.columns

Index(['age', ' workclass', ' fnlwgt', ' education', ' education-num',
       ' marital-status', ' occupation', ' relationship', ' race', ' sex',
       ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country',
       ' income'],
      dtype='object')

In [7]:
np.unique(data[' income'])

array([' <=50K', ' >50K'], dtype=object)

In [8]:
np.unique(data[' marital-status'])

array([' Divorced', ' Married-AF-spouse', ' Married-civ-spouse',
       ' Married-spouse-absent', ' Never-married', ' Separated',
       ' Widowed'], dtype=object)

In [9]:
data.drop(columns = " fnlwgt",inplace = True)

In [10]:
cols = ['age', 'workclass', 'education', 'education-num', 'marital-status',
       'occupation', 'relationship', 'race', 'sex', 'capital-gain',
       'capital-loss', 'hours-per-week', 'native-country', 'income']
data.columns = cols

data.columns

Index(['age', 'workclass', 'education', 'education-num', 'marital-status',
       'occupation', 'relationship', 'race', 'sex', 'capital-gain',
       'capital-loss', 'hours-per-week', 'native-country', 'income'],
      dtype='object')

In [11]:
data.isnull().sum()

age               0
workclass         0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64

In [12]:
bins = [16,24,65,90]
labels = ["young","adult","old"]
data["age_type"] = pd.cut(data["age"],bins = bins,labels = labels)
data["income_type"] = np.where(data["income"] == ">50K",1,0)

In [13]:
np.unique(data["education"])

array([' 10th', ' 11th', ' 12th', ' 1st-4th', ' 5th-6th', ' 7th-8th',
       ' 9th', ' Assoc-acdm', ' Assoc-voc', ' Bachelors', ' Doctorate',
       ' HS-grad', ' Masters', ' Preschool', ' Prof-school',
       ' Some-college'], dtype=object)

In [14]:
strip_cols = ["workclass","education","marital-status","occupation","relationship","race","sex","native-country","income"]

for i in strip_cols:
    data[i] = data[i].str.strip()

In [15]:
categorical = [var for var in data.columns if data[var].dtype == 'O']
print("there are {} categorical columns",format(len(categorical)))
print("Categorical columns are:\n\n ",categorical)



there are {} categorical columns 9
Categorical columns are:

  ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income']


In [16]:
data.loc[data['workclass'] == "?"]

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,age_type,income_type
27,54,?,Some-college,10,Married-civ-spouse,?,Husband,Asian-Pac-Islander,Male,0,0,60,South,>50K,adult,0
61,32,?,7th-8th,4,Married-spouse-absent,?,Not-in-family,White,Male,0,0,40,?,<=50K,adult,0
69,25,?,Some-college,10,Never-married,?,Own-child,White,Male,0,0,40,United-States,<=50K,adult,0
77,67,?,10th,6,Married-civ-spouse,?,Husband,White,Male,0,0,2,United-States,<=50K,old,0
106,17,?,10th,6,Never-married,?,Own-child,White,Female,34095,0,32,United-States,<=50K,young,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32530,35,?,Bachelors,13,Married-civ-spouse,?,Wife,White,Female,0,0,55,United-States,>50K,adult,0
32531,30,?,Bachelors,13,Never-married,?,Not-in-family,Asian-Pac-Islander,Female,0,0,99,United-States,<=50K,adult,0
32539,71,?,Doctorate,16,Married-civ-spouse,?,Husband,White,Male,0,0,10,United-States,>50K,old,0
32541,41,?,HS-grad,9,Separated,?,Not-in-family,Black,Female,0,0,32,United-States,<=50K,adult,0


In [17]:
data.loc[data['workclass'] == "?" , "workclass"] = np.nan

In [18]:
data.loc[data['occupation'] == "?" , "occupation"] = np.nan
data.loc[data['native-country'] == "?" , "native-country"] = np.nan

In [19]:
data.loc[data['native-country'] == np.nan]

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,age_type,income_type


In [20]:
data.occupation.value_counts(),data['native-country'].value_counts()

(Prof-specialty       4140
 Craft-repair         4099
 Exec-managerial      4066
 Adm-clerical         3770
 Sales                3650
 Other-service        3295
 Machine-op-inspct    2002
 Transport-moving     1597
 Handlers-cleaners    1370
 Farming-fishing       994
 Tech-support          928
 Protective-serv       649
 Priv-house-serv       149
 Armed-Forces            9
 Name: occupation, dtype: int64,
 United-States                 29170
 Mexico                          643
 Philippines                     198
 Germany                         137
 Canada                          121
 Puerto-Rico                     114
 El-Salvador                     106
 India                           100
 Cuba                             95
 England                          90
 Jamaica                          81
 South                            80
 China                            75
 Italy                            73
 Dominican-Republic               70
 Vietnam                          

### Encoding

```
# This is formatted as code
```



In [21]:
numerical = [var for var in data.columns if data[var].dtype == "int64"]

numerical

['age',
 'education-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'income_type']

In [22]:
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

def label_encoder(b):
    le = LabelEncoder()
    data[b] = le.fit_transform(data[b])

label_list = ['workclass', 'education','marital-status',
       'occupation', 'relationship', 'race', 'sex','native-country', 'income']

for i in label_list:
    label_encoder(i)

In [23]:
data =data.drop(columns = "age_type")
x= data.drop(columns = "income")
y=data['income']

###Train Test Split

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [25]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2,random_state = 101,shuffle = True)
x_train.shape,y_train.shape,x_test.shape,y_test.shape

((26048, 14), (26048,), (6513, 14), (6513,))

In [26]:
# To store results of models
result_dict_train = {}
result_dict_test = {}

###Model Building (SVM)

In [27]:
svc = SVC()
accuracies = cross_val_score(svc, x_train, y_train, cv=5)
svc.fit(x_train,y_train)

In [28]:
print("Train Score:",np.mean(accuracies))
print("Test Score:",svc.score(x_test,y_test))

Train Score: 0.8011360744673051
Test Score: 0.8074619990787656


### Hyperparameters Tuning for C,Gamma,Kernel and Degree

In [29]:
from sklearn.model_selection import GridSearchCV

In [None]:
grid = {
    'C' :[0.01,0.1,1,10],
    'kernel' : ['linear','poly','rbf','sigmoid'],
    'degree' : [1,3,5,7],
    'gamma' : [0.01,1]
}

svm = SVC()
svm_cv = GridSearchCV(svm,grid,cv =5)
svm_cv.fit(x_train,y_train)
print("best parameters",svm_cv.best_params_)
print("Train score",svm_cv.best_score_)
print("test score",svm_cv.score(x_test,y_test))

The Dataset in practice is a large dataset and SVM does not perform great with large datasets.It takes too much time to train+hyperparameter tuning with GridSearchCV