# The Naive Bayes Classifier (NB)

In [44]:
# Import required packages for this chapter
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics


# Personal Loan Acceptance.

We will be using the universalbank.csv again for this assignemnt. 

The file universalbank.csv contains data on 5000 customers of Universal Bank. The data include customer demographic information (age, income, etc.), the customer’s relationship with the bank (mortgage, securities account, etc.), and the customer response to the last personal loan campaign (Personal Loan). Among these 5000 customers, only 480 (= 9.6%) accepted the personal loan that was offered to them in the earlier campaign. In this exercise, we focus on three predictors: age, income, experience, and the outcome Personal Loan.

Partition the data into training (60%) and validation (40%) sets.

In [45]:
# Load the data into band_df dataframe bank_df
# Only keep the columns we need: Income, Experience, Age, Personal Loan.Drop the rest.
bank_df=pd.read_csv("universalbank.csv")

bank_df = bank_df.drop(['ID', 'ZIP Code', 'Family', 'CCAvg', 'CD Account','Online','Education','Securities Account','CreditCard','Mortgage'], axis=1)

# Use critical functions to explore the dataframe using print() to show results

print(bank_df.head())
print(bank_df.shape)
print(bank_df.count())
print(bank_df.describe())

   Age  Experience  Income  Personal Loan
0   25           1      49              0
1   45          19      34              0
2   39          15      11              0
3   35           9     100              0
4   35           8      45              0
(5000, 4)
Age              5000
Experience       5000
Income           5000
Personal Loan    5000
dtype: int64
               Age   Experience       Income  Personal Loan
count  5000.000000  5000.000000  5000.000000    5000.000000
mean     45.338400    20.104600    73.774200       0.096000
std      11.463166    11.467954    46.033729       0.294621
min      23.000000    -3.000000     8.000000       0.000000
25%      35.000000    10.000000    39.000000       0.000000
50%      45.000000    20.000000    64.000000       0.000000
75%      55.000000    30.000000    98.000000       0.000000
max      67.000000    43.000000   224.000000       1.000000


In [46]:
# Does the data needs further cleaning?
# If you think so, write your clearning process here.
#Somehow there's a negative value in experience, making it zero
bank_df.Experience = np.where(bank_df.Experience < 0, 0,bank_df.Experience)


print(bank_df.head())
print(bank_df.shape)
print(bank_df.count())
print(bank_df.describe())

   Age  Experience  Income  Personal Loan
0   25           1      49              0
1   45          19      34              0
2   39          15      11              0
3   35           9     100              0
4   35           8      45              0
(5000, 4)
Age              5000
Experience       5000
Income           5000
Personal Loan    5000
dtype: int64
               Age   Experience       Income  Personal Loan
count  5000.000000  5000.000000  5000.000000    5000.000000
mean     45.338400    20.119600    73.774200       0.096000
std      11.463166    11.440484    46.033729       0.294621
min      23.000000     0.000000     8.000000       0.000000
25%      35.000000    10.000000    39.000000       0.000000
50%      45.000000    20.000000    64.000000       0.000000
75%      55.000000    30.000000    98.000000       0.000000
max      67.000000    43.000000   224.000000       1.000000


In [55]:
# Split the data into training and validation sets
train_df, valid_df=train_test_split(bank_df, test_size = .4)
print("Training Set:", train_df.shape, "Validation Set:", valid_df.shape)


Training Set: (3000, 4) Validation Set: (2000, 4)


In [56]:
# Construct the model the perform analysis
outcome = 'Personal Loan'
X=list(train_df.columns)
X.remove(outcome)

train_X = train_df[X]
valid_X = valid_df[X]
train_y = train_df[outcome]
valid_y = valid_df[outcome]

print(train_X.head(), '\n', valid_X.head())

      Age  Experience  Income
2700   31           5      39
648    50          25      34
294    35           9      55
2446   25           1      70
950    32           6     112 
       Age  Experience  Income
490    34          10      90
1555   59          33      49
4163   54          28     108
2490   52          28     168
398    54          30      23


In [63]:
# Predict the classification for test dataset
# Append your prediction, predicted probability to the testing dataset and print the new dataset out using print()
# You should be able to view your prediction, observed outcome, and predictors for each data point side by side.

mnomial = MultinomialNB()
mnomial.fit(train_X, train_y)

y_predict= mnomial.predict(valid_X)
y_predProb= mnomial.predict_proba(valid_X)

predicted = pd.concat([valid_X, pd.DataFrame(y_predProb, index = valid_X.index)], axis = 1)
predicted = pd.concat([predicted, pd.DataFrame(y_predict, index = predicted.index, columns=["Predicted"])], axis = 1)
predicted = pd.concat([predicted, pd.DataFrame(valid_y, index = predicted.index)], axis = 1)

print(predicted.head(30))

      Age  Experience  Income             0             1  Predicted  \
490    34          10      90  4.716309e-03  9.952837e-01          1   
1555   59          33      49  1.000000e+00  1.222940e-13          0   
4163   54          28     108  9.987967e-01  1.203281e-03          0   
2490   52          28     168  2.570972e-06  9.999974e-01          1   
398    54          30      23  1.000000e+00  1.540208e-15          0   
1909   56          30     101  9.999788e-01  2.121606e-05          0   
3803   42          18      83  9.862212e-01  1.377878e-02          0   
4251   42          16      62  9.999482e-01  5.183422e-05          0   
957    56          32      88  9.999999e-01  1.454823e-07          0   
3573   60          36     165  1.106244e-02  9.889376e-01          1   
144    49          23      70  9.999991e-01  9.260773e-07          0   
4052   43          19      54  9.999993e-01  6.641749e-07          0   
3767   40          16      83  9.175547e-01  8.244529e-02       

In [58]:
# calculate the accuracy of your prediction against the observed outcome.
print("Accuracy:", metrics.accuracy_score(valid_y,y_predict))

# How well do you think the model does?

Accuracy: 0.789


In [59]:
# Interpret your results:
#Although it says the model is 78% accurate it doesn't appear to predict whether or not the individual will pursue a personal loan. 
# Lesson learned from this lab:




# Automobile Accidents

The file accidents.csv contains information on 42,183 actual automobile accidents in 2001 in the United States that involved one of three levels of injury: NO INJURY, INJURY, or FATALITY. For each accident, additional information is recorded, such as day of week, weather conditions, and road type. A firm might be interested in developing a system for quickly classifying the severity of an accident based on three predictors: weather conditions (WEATHER_R), traffic conditions (TRAF_CON_R), and road type (INT_HWY).

Our goal here is to predict whether an accident just reported will involve fatality (MAX_SEV_IR = 2), a non fetal injury (MAX_SEV_IR = 1) or not injury (MAX_SEV_IR = 0).

Partition the data into training (80%) and validation (20%) sets.

In [60]:
# Load the data into band_df dataframe accidents_df
# Only keep the columns we need. Drop the rest.
# Use critical functions to explore the dataframe using print() to show results
accidents_df=pd.read_csv("accidents.csv")
accidents_df = accidents_df.drop([

'HOUR_I_R',
'ALCHL_I',
'ALIGN_I',
'STRATUM_R',
'WRK_ZONE',
'WKDY_I_R',
'LGTCON_I_R',
'MANCOL_I_R',
'PED_ACC_R',
'RELJCT_I_R',
'REL_RWY_R',
'PROFIL_I_R',
'SPD_LIM',
'SUR_COND',
'TRAF_WAY',
'VEH_INVL',
'INJURY_CRASH',
'NO_INJ_I',
'PRPTYDMG_CRASH',
'FATALITIES'
], axis=1)

print(accidents_df.count())





INT_HWY       42183
TRAF_CON_R    42183
WEATHER_R     42183
MAX_SEV_IR    42183
dtype: int64


In [67]:
# Split dataset into training set and test set: 80% training and 20% validation
train_df, valid_df=train_test_split(accidents_df, test_size = .2)
print("Training Set:", train_df.shape, "Validation Set:", valid_df.shape)


Training Set: (33746, 4) Validation Set: (8437, 4)


In [68]:
# Construct the model the perform analysis
outcome = 'MAX_SEV_IR'
X=list(train_df.columns)
X.remove(outcome)

train_X = train_df[X]
valid_X = valid_df[X]
train_y = train_df[outcome]
valid_y = valid_df[outcome]

print(train_X.head(), '\n', valid_X.head())

       INT_HWY  TRAF_CON_R  WEATHER_R
29653        0           2          1
20141        0           0          1
34385        0           0          1
1548         0           2          1
12238        0           0          1 
        INT_HWY  TRAF_CON_R  WEATHER_R
9078         0           0          1
5959         0           1          1
8928         0           0          1
29814        0           0          1
41861        0           0          1


In [69]:
# Predict the classification for test dataset
# Append your prediction, predicted probability to the testing dataset and print the new dataset out using print()
# You should be able to view your prediction, observed outcome, and predictors for each data point side by side.

mnomial = MultinomialNB()
mnomial.fit(train_X, train_y)

y_predict= mnomial.predict(valid_X)
y_predProb= mnomial.predict_proba(valid_X)

predicted = pd.concat([valid_X, pd.DataFrame(y_predProb, index = valid_X.index)], axis = 1)
predicted = pd.concat([predicted, pd.DataFrame(y_predict, index = predicted.index, columns=["Predicted"])], axis = 1)
predicted = pd.concat([predicted, pd.DataFrame(valid_y, index = predicted.index)], axis = 1)

print(predicted)

       INT_HWY  TRAF_CON_R  WEATHER_R         0         1         2  \
9078         0           0          1  0.498756  0.489946  0.011298   
5959         0           1          1  0.476460  0.514482  0.009057   
8928         0           0          1  0.498756  0.489946  0.011298   
29814        0           0          1  0.498756  0.489946  0.011298   
41861        0           0          1  0.498756  0.489946  0.011298   
...        ...         ...        ...       ...       ...       ...   
19514        1           0          1  0.512589  0.470090  0.017321   
36551        0           2          1  0.453950  0.538809  0.007242   
35904        0           0          1  0.498756  0.489946  0.011298   
35885        0           2          1  0.453950  0.538809  0.007242   
39009        0           0          1  0.498756  0.489946  0.011298   

       Predicted  MAX_SEV_IR  
9078           0           0  
5959           1           1  
8928           0           0  
29814          0       

In [65]:
# compute model accuracy of your prediction against observed outcomes.
print("Accuracy:", metrics.accuracy_score(valid_y,y_predict))



# How well do you think the model does?


Accuracy: 0.789


In [70]:
# Extra Credit: Can you improve the accuracy of the model to above 0.08 by finding a different set of the predictors?
# Show you model below:
#'INJURY_CRASH' would certainly increase the prediction accuracy. 


In [15]:
# Interpret your results:
#This model appears to more accurately pick if an accident will be fatal or not, compared to the personal loan predictions which were not as accurate. 

# Lesson learned from this lab:


