In [5]:
import pandas as pd
import numpy as np

# Seaborn / matplotlib for visualization 
import seaborn as sns
sns.set()

import matplotlib.pyplot as plt
%matplotlib inline

# Helper function to split our data
from sklearn.model_selection import train_test_split

# Helper fuctions to evaluate our model.
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report, roc_auc_score, f1_score

# Import our Decision Tree
from sklearn.tree import DecisionTreeClassifier 

# Import our Random Forest 
from sklearn.ensemble import RandomForestClassifier

# Import the trees from sklearn
from sklearn import tree

# Helper functions to visualize our trees
from sklearn.tree import plot_tree, export_text# Import our libraries 

from sklearn.model_selection import GridSearchCV



## Main Steps when building a Machine Learning Model. 
1. Inspect and explore data.
2. Select and engineer features.
3. Build and train model.
4. Evaluate model.

# #1 Inspect and explore data.
* Load titanic data
* Visualize all the data using sns.pairplot
* Check for null values

In [6]:
# Load in the titanic data set.
df = pd.read_csv('data/titanic.csv')
df.head()
df.columns

Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

In [3]:
# Visualize all the data using sns.pairplot
sns.pairplot(df, hue = 'pclass', markers=["o", "s", "D"], palette="husl")
plt.show()
#how different variables are distributed or related within each passenger class.

KeyboardInterrupt: 

In [7]:
# Check for null values
df.isnull().sum()

passengerid      0
survived         0
pclass           0
name             0
sex              0
age            177
sibsp            0
parch            0
ticket           0
fare             0
cabin          687
embarked         2
dtype: int64

# #2 Select and engineer features.
1. Fill age null values with -999
1. Convert to numerical values if need be by using `pd.get_dummies()`
1. Create a list of the features you are going to use.  In this case use as many or as little as you would like.
1. Define our `X` and `y`
1. Split our data into trainig and testing sets.

In [10]:
# Fill age null values with -999
df['age'].fillna(-999, inplace = True)
df.age
df['age'] = df['age'].astype(int)
df.age

0       22
1       38
2       26
3       35
4       35
      ... 
886     27
887     19
888   -999
889     26
890     32
Name: age, Length: 891, dtype: int64

In [11]:
# 1. Convert to numerical values if need be by using `pd.get_dummies()`
df = pd.get_dummies(df,columns=["sex",'embarked'], drop_first= True)
df

Unnamed: 0,passengerid,survived,pclass,name,age,sibsp,parch,ticket,fare,cabin,sex_male,embarked_Q,embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",22,1,0,A/5 21171,7.2500,,1,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38,1,0,PC 17599,71.2833,C85,0,0,0
2,3,1,3,"Heikkinen, Miss. Laina",26,0,0,STON/O2. 3101282,7.9250,,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35,1,0,113803,53.1000,C123,0,0,1
4,5,0,3,"Allen, Mr. William Henry",35,0,0,373450,8.0500,,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",27,0,0,211536,13.0000,,1,0,1
887,888,1,1,"Graham, Miss. Margaret Edith",19,0,0,112053,30.0000,B42,0,0,1
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",-999,1,2,W./C. 6607,23.4500,,0,0,1
889,890,1,1,"Behr, Mr. Karl Howell",26,0,0,111369,30.0000,C148,1,0,0


In [12]:
df.columns

Index(['passengerid', 'survived', 'pclass', 'name', 'age', 'sibsp', 'parch',
       'ticket', 'fare', 'cabin', 'sex_male', 'embarked_Q', 'embarked_S'],
      dtype='object')

In [14]:
# 2. Create a list of the features we are going to use.
selected_features = ['age',"fare",'pclass','sibsp', 'parch',"sex_male",'embarked_Q','embarked_S']

# Set X to be the features we are going to use.
X = df[selected_features]

# Set y to be our target variable. 
y = df["survived"]


In [15]:
# Split our data into trainig and testing sets.

X_train, X_test, y_train, y_test = train_test_split(X, y,  test_size=0.2)

# Print the length and width of our testing data.
print(X_train.shape, X_test.shape)

print('Lenght of our Training data: ' + str(X_train.shape), '\nLength of our Testing data: ' + str(X_test.shape))

(712, 8) (179, 8)
Lenght of our Training data: (712, 8) 
Length of our Testing data: (179, 8)


# #3 Build and train model.
1. For our first pass, initialize our model with `max_depth=2`.
2. Fit our model with our training data. 
3. Make predictions of our testing data. 
4. Evaluate and print our model scores using accuracy, precision, recall, f1 scores, and auc scores. 
    * To calculate auc score you have to get the predicted probabilites for the Survived class using `model.predict_proba(X_test)[:,1]`
5. Visualize our Decision Tree using provided code. 


In [16]:
# For our first pass, initialize our model with `max_depth=2`.

model = DecisionTreeClassifier(max_depth = 2)


In [17]:
# Fit our model with our training data. 
model.fit(X_train, y_train)

In [18]:
# Make predictions of our testing data. 

y_pred = model.predict(X_test)

In [19]:
# 4. Evaluate and print our model scores using accuracy, precision, recall, f1 scores, and auc scores. 
accuracy = accuracy_score(y_test,y_pred)
print("Accuracy Score: %f" % accuracy)

precision = precision_score(y_test,y_pred)
print("Precision Score: %f" % precision)

recall =recall_score( y_test,y_pred)
print("Recall Score: %f" % recall)

f1 = f1_score(y_test,y_pred)
print('F1 Score: %f' % f1)

# Calculate predicted probabilities
y_pred_proba = model.predict_proba(X_test)

# Keep only the proba for True
y_pred_proba = y_pred_proba[:,1]

# Compute auc score
auc = roc_auc_score(y_true=y_test,y_score=y_pred_proba)
print('AUC Score: %f' % auc)

Accuracy Score: 0.793296
Precision Score: 0.709677
Recall Score: 0.698413
F1 Score: 0.704000
AUC Score: 0.832581


# Visualize your tree

# Picking the right parameters...

# Parameter tuning of your Decision Tree using GridSearch or RandomizedSearch

### For assistance on this, look at Steves TA Tips code in `TA-Tips/random_forest_tuning.ipynb`


1. Make a dictionary of at least 3 parameters and a list of 3 values for each for your grid search. 
1. Initalize your GridSearchCV with a DecisionTreeClassifier, your param_grid, and what you are optimizing for.  Choose any of the five optimization strategies; accuracy, precision, recall, f1, or roc_auc.
1. Fit your GridSearchCV with your training data. 
1. Print the parameters of your best model. 
1. Evaluate your best model using accuracy, precision, recall, f1 scores, and auc scores. 
1. Visualize your best tree.
1. Which feature was your most important feature?

```python
tree.DecisionTreeClassifier(
    *,
    criterion='gini',
    splitter='best',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=None,
    random_state=None,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    class_weight=None,
    presort='deprecated',
    ccp_alpha=0.0,
)
```


[Tips on how to customize / set the paramters in the decision tree.](https://scikit-learn.org/stable/modules/tree.html#tips-on-practical-use)

In [20]:
# 1. Make a dictionary of at least 3 parameters and a list of 3 values for each for your grid search.from sklearn.model_selection import GridSearchCV
params = { 
      'criterion':['gini','entropy','log_loss'], 
    'max_depth':[3,5,10],
    'splitter':['best','random'] }

In [21]:
# 1. Initalize your GridSearchCV with a DecisionTreeClassifier, your param_grid, and what you are optimizing for.  Choose any of the five optimization strategies; accuracy, precision, recall, f1, or roc_auc.
grid_search_cv =  GridSearchCV(DecisionTreeClassifier(),params)

In [22]:
# 1. Fit your GridSearchCV with your training data. 
grid_search_cv.fit(X_train,y_train)

In [23]:
# 1. Print the parameters of your best model. 
# Print the best parameters it found
print(grid_search_cv.best_params_)

{'criterion': 'entropy', 'max_depth': 5, 'splitter': 'best'}


In [24]:
# 1. Evaluate your best model using accuracy, precision, recall, f1 scores, and auc scores. 

# This command gives you the best tree
model = DecisionTreeClassifier(criterion='entropy',
                               max_depth=5,
                               splitter='random')
model.fit(X_train,y_train)
# Now lets evaluate our model
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score: %f" % accuracy)

precision = precision_score(y_test, y_pred)
print("Precision Score: %f" % precision)

recall = recall_score(y_test, y_pred)
print("Recall Score: %f" % recall)

f1 = f1_score(y_test, y_pred)
print('F1 Score: %f' % f1)

# Calculate predicted probabilities, keep only probability for when class = 1
y_pred_proba = model.predict_proba(X_test)[:,1]
auc = roc_auc_score(y_test,y_pred_proba)
print('AUC Score: %f' % auc)

Accuracy Score: 0.810056
Precision Score: 0.723077
Recall Score: 0.746032
F1 Score: 0.734375
AUC Score: 0.873495


In [25]:
# 1. Which feature was your most important feature?
# Now lets look at our feature importances
feature_imp = pd.DataFrame.from_dict( {'feature_importance': model.feature_importances_,
                                       'feature':selected_features }).sort_values('feature_importance', ascending=False)
feature_imp

Unnamed: 0,feature_importance,feature
5,0.527807,sex_male
2,0.267125,pclass
0,0.077153,age
7,0.040904,embarked_S
4,0.040029,parch
1,0.024095,fare
3,0.019206,sibsp
6,0.003682,embarked_Q


# Now onto Random Forests...
Were going to do the same with, but this time with a random forest. Remeber... Repetition is the father of learning.

1. Make a dictionary of at least 3 parameters and a list of 3 values for each for your grid search. 
1. Initalize your GridSearchCV with a RandomForestClassifer, your param_grid, and what you are optimizing for.  Choose any of the five optimization strategies; accuracy, precision, recall, f1, or roc_auc.
1. Fit your GridSearchCV with your training data. 
1. Print the parameters of your best model. 
1. Evaluate your best model using accuracy, precision, recall, f1 scores, and auc scores. 
1. Which feature was your most important feature?


# Parameters of the Random Forest Classifier

```python
RandomForestClassifier(
    n_estimators=100,
    *,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='auto',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    bootstrap=True,
    oob_score=False,
    n_jobs=None,
    random_state=None,
    verbose=0,
    warm_start=False,
    class_weight=None,
    ccp_alpha=0.0,
    max_samples=None,
)
```

In [26]:
# 1. Make a dictionary of at least 3 parameters and a list of 3 values for each for your grid search. 
params = {
    'n_estimators':[100,500,1000],
    'criterion':['gini','entropy','log_loss'], 
    'max_depth':[3,5,10]
}

In [27]:
# 1. Initalize your GridSearchCV or RandomizedSearchCV with a RandomForestClassifer, your param_grid, and what you are optimizing for.  Choose any of the five optimization strategies; accuracy, precision, recall, f1, or roc_auc.
model = GridSearchCV(RandomForestClassifier(),params)


In [28]:
# 1. Fit your GridSearchCV with your training data. 
model.fit(X_train,y_train)

In [29]:
# 1. Print the parameters of your best model. 
# Print the best parameters it found
print(model.best_params_)




{'criterion': 'entropy', 'max_depth': 10, 'n_estimators': 1000}


In [30]:
# 1. Evaluate your best model using accuracy, precision, recall, f1 scores, and auc scores. 

# This command gives you tree that has the highest f1-score. 
model = grid_search_cv.best_estimator_


# Now lets evaluate our model
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test,y_pred)
print("Accuracy Score: %f" % accuracy)

precision = precision_score(y_test,y_pred)
print("Precision Score: %f" % precision)

recall = recall_score(y_test,y_pred)
print("Recall Score: %f" % recall)

f1 = f1_score(y_test,y_pred)
print('F1 Score: %f' % f1)

# Calculate predicted probabilities, keep only probability for when class = 1
y_pred_proba = model.predict_proba(X_test)[:,1]

auc = roc_auc_score(y_test,y_pred_proba)
print('AUC Score: %f' % auc)

Accuracy Score: 0.787709
Precision Score: 0.698413
Recall Score: 0.698413
F1 Score: 0.698413
AUC Score: 0.840791


In [31]:
# 1. Which feature was your most important feature?
# Now lets look at our feature importances
feature_imp = pd.Series(model.feature_importances_,index=selected_features).sort_values(ascending=False)
feature_imp

sex_male      0.447835
pclass        0.193589
age           0.155837
fare          0.115127
parch         0.048174
sibsp         0.039439
embarked_Q    0.000000
embarked_S    0.000000
dtype: float64

# Build a random forest using the ny-vs-sf-housing.csv data. 
* Your target variable, aka the column you are trying to predict, aka your `y` variable is `in_sf`. 
* Can you get an accuracy above %88.8889?
* What was your most important feature?


In [32]:
df = pd.read_csv('data/ny-vs-sf-houses.csv')
df.head()

Unnamed: 0,in_sf,beds,bath,price,year_built,sqft,price_per_sqft,elevation
0,0,2.0,1.0,999000,1960,1000,999,10
1,0,2.0,2.0,2750000,2006,1418,1939,0
2,0,2.0,2.0,1350000,1900,2150,628,9
3,0,1.0,1.0,629000,1903,500,1258,9
4,0,0.0,1.0,439000,1930,500,878,10


In [None]:
# BUILD, TRAIN, AND EVAULATE A RANDOM FOREST MODEL BELOW. 



# Awesome difficult extra credit below:
Build a classifier using the adult_income.csv data.  
* The target variable is 'class'
* Start with just using these features `selected_features = ['age', 'fnlwgt', 'capital_gain', 'capital_loss', 'hours_per_week']`
* You have to include the pos_label in your precision, recall, and f1 scores. It just tells the classifier which one is the posotive label.  I provided the proper way below.

* See if you can get above 50% f1 score.  
* See some [super tricks and tips here](https://www.kaggle.com/code/jieyima/income-classification-model)

In [None]:
df = pd.read_csv('data/adult_income.csv')
df.head()