<div id="reminder" style="border-radius: 5px; background-color:#f5f5f5; padding: 15px 5px; " >
<p>Use this notebook to follow along with the lab tutorial.</p>
</div>

# <font color="blue">Lesson 3 - Basic Machine Learning Models</font>

### NBC on Iris Data

In [1]:
from sklearn import datasets
import numpy as np

iris = datasets.load_iris()
msk = np.random.rand(iris.data.shape[0]) <= 0.8
iris_train = iris.data[msk, :]
iris_train_target = iris.target[msk]
iris_test = iris.data[~msk, :]
iris_test_target = iris.target[~msk]

from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb_model = gnb.fit(iris_train, iris_train_target)
y_pred = gnb_model.predict(iris_test)
misclassified_points = (iris_test_target != y_pred).sum()
print("Number of mislabeled points out of a total %d points : %d"\
      % (iris_test.shape[0], misclassified_points))
print("Accuracy = %.2f"%(round((iris_test.shape[0] - float(misclassified_points))/iris_test.shape[0]*100,2)))

Number of mislabeled points out of a total 38 points : 3
Accuracy = 92.11


### NBC on Adult Income Data

In [2]:
import pandas as pd
import io
import requests
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
s = requests.get(url).content
data = pd.read_csv(io.StringIO(s.decode('utf-8')), header=None)
data.columns = ["age", "workclass", "fnlwgt", "education", "education_num", "marital_status", \
                "occupation", "relationship", "race", "sex", "capital_gain", "capital_loss", \
               "hours_per_week", "native_country", "income"]
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
for value in ['workclass', 'education',
          'marital_status', 'occupation',
          'relationship','race', 'sex',
          'native_country', 'income']:
    data[value].replace(['?'], [data.describe(include='all')[value][2]],
                                inplace=True)
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
workclass_cat = le.fit_transform(data.workclass)
education_cat = le.fit_transform(data.education)
marital_cat   = le.fit_transform(data.marital_status)
occupation_cat = le.fit_transform(data.occupation)
relationship_cat = le.fit_transform(data.relationship)
race_cat = le.fit_transform(data.race)
sex_cat = le.fit_transform(data.sex)
native_country_cat = le.fit_transform(data.native_country)

#initialize the encoded categorical columns
data['workclass_cat'] = workclass_cat
data['education_cat'] = education_cat
data['marital_cat'] = marital_cat
data['occupation_cat'] = occupation_cat
data['relationship_cat'] = relationship_cat
data['race_cat'] = race_cat
data['sex_cat'] = sex_cat
data['native_country_cat'] = native_country_cat

#drop the old categorical columns from dataframe
dummy_fields = ['workclass', 'education', 'marital_status', 
                  'occupation', 'relationship', 'race',
                  'sex', 'native_country']
data = data.drop(dummy_fields, axis = 1)

data = data.reindex(['age', 'workclass_cat', 'fnlwgt', 'education_cat',
                                    'education_num', 'marital_cat', 'occupation_cat',
                                    'relationship_cat', 'race_cat', 'sex_cat', 'capital_gain',
                                    'capital_loss', 'hours_per_week', 'native_country_cat', 
                                    'income'], axis= 1)
 
data.head(5)

Unnamed: 0,age,workclass_cat,fnlwgt,education_cat,education_num,marital_cat,occupation_cat,relationship_cat,race_cat,sex_cat,capital_gain,capital_loss,hours_per_week,native_country_cat,income
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,<=50K
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,<=50K
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,<=50K
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,<=50K
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,<=50K


In [5]:
num_features = ['age', 'workclass_cat', 'fnlwgt', 'education_cat', 'education_num',
                'marital_cat', 'occupation_cat', 'relationship_cat', 'race_cat',
                'sex_cat', 'capital_gain', 'capital_loss', 'hours_per_week',
                'native_country_cat']
 
scaled_features = {}
for each in num_features:
    mean, std = data[each].mean(), data[each].std()
    scaled_features[each] = [mean, std]
    data.loc[:, each] = (data[each] - mean)/std

msk = np.random.rand(data.shape[0]) <= 0.8
adult_train = data.iloc[msk, 0:14]
adult_train_target = data.iloc[msk, 14]
adult_test = data.iloc[~msk, 0:14]
adult_test_target = data.iloc[~msk, 14]

from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb_model = gnb.fit(adult_train, adult_train_target)
y_pred = gnb_model.predict(adult_test)
misclassified_points = (adult_test_target != y_pred).sum()
print("Number of mislabeled points out of a total %d points : %d"\
      % (adult_test.shape[0], misclassified_points))
print("Accuracy = %.2f"%(round((adult_test.shape[0] - float(misclassified_points))/adult_test.shape[0]*100,2)))

Number of mislabeled points out of a total 6494 points : 1251
Accuracy = 80.74


<div id="reminder" style="border-radius: 5px; background-color:#f5f5f5; padding: 15px 5px; " >
<p>For additional practice, please see the Workshop notebooks.</p>
</div>