# Acquire Titanic Data and Prep

In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import env
import acquire

In [None]:
def get_connection(db, user=env.user, host=env.host, password=env.password):
    '''
    Build the mysql+pymysql connection URL for the database named db.

    user, host and password default to the values held in the local
    env module; the resulting string is what get_titanic hands to
    pd.read_sql.
    '''
    connection_url = f'mysql+pymysql://{user}:{password}@{host}/{db}'
    return connection_url

def get_titanic():
    '''
    Read every row and column of the passengers table in titanic_db
    and return the result as a dataframe.
    '''
    connection_url = get_connection('titanic_db')
    return pd.read_sql("SELECT * FROM passengers", connection_url)

In [None]:
df = get_titanic()
df.head()

In [None]:
# drop rows where age or embarked is null, drop column 'deck', drop passenger_id
def prep_titanic(df):
    '''
    Take in the titanic dataframe and return a prepared copy:
    keep only the rows where both age and embarked are present,
    drop deck, passenger_id and class,
    and swap sex and embark_town for dummy columns
    (sex_female plus one embark_<town> column per town).
    The input dataframe itself is not modified.
    '''
    # Row filter: age and embarked must both be non-null.
    has_age_and_port = df.age.notna() & df.embarked.notna()
    kept = df[has_age_and_port].drop(columns=['deck', 'passenger_id', 'class'])

    # One-hot encode the two categorical columns. sex_male carries the same
    # information as sex_female, so only sex_female is kept.
    encoded = pd.get_dummies(kept[['sex', 'embark_town']],
                             prefix=['sex', 'embark'])
    encoded = encoded.drop(columns=['sex_male'])

    # Attach the dummies, then discard the text columns they replace.
    combined = pd.concat([kept, encoded], axis=1)
    return combined.drop(columns=['sex', 'embark_town'])

In [None]:
df = prep_titanic(df)
df.head()

In [None]:
# Drop the embarked column: the embark_* dummy columns created by prep_titanic
# already show where each passenger embarked.
# NOTE(review): re-running this cell on the same df will fail because the
# column is already gone.
df = df.drop(columns='embarked')
df.head()

In [None]:
df.info()

In [None]:
def train_validate_test_split(df, target, seed=123):
    '''
    Split a dataframe into train, validate and test samples, stratifying
    both cuts on the column named by target and seeding both with seed.

    Proportions of the original dataset:
      - test     : 20%
      - validate : .30 * .80 = 24%
      - train    : .70 * .80 = 56%

    Returns the three dataframes in the order train, validate, test.
    '''
    # First cut: hold out 20% of all rows as the test sample.
    remainder, test = train_test_split(
        df,
        test_size=0.2,
        random_state=seed,
        stratify=df[target],
    )

    # Second cut: 30% of what is left becomes validate, the rest is train.
    train, validate = train_test_split(
        remainder,
        test_size=0.3,
        random_state=seed,
        stratify=remainder[target],
    )

    return train, validate, test

In [None]:
train, validate, test = train_validate_test_split(df, target='survived')
train.head()
# What are our observations?
# Each row is an observation, and each observation is a person on the titanic. 
# In tidy data, every row is an observation

In [None]:
print(train.shape, validate.shape, test.shape)

# Model Exercises

### 1. What is your baseline prediction? What is your baseline accuracy? Remember: your baseline prediction for a classification problem is predicting the most prevalent class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [None]:
# since survived is our target variable, we will be looking at its value counts to create a baseline. 
df.survived.value_counts()
# 0 is most frequent, so that will be our baseline. 

In [None]:
df.isnull().sum()

In [None]:
df['baseline_prediction'] = 0
df.head(2)


In [None]:
# baseline accuracy
# The baseline always predicts the most prevalent class (the mode of survived)
# in the TRAINING sample, and its accuracy is the share of training rows that
# belong to that class. Validate and test rows are deliberately left out so
# the baseline is measured on the same data the models are fit on.
baseline_class = train.survived.mode()[0]
baseline_accuracy = (train.survived == baseline_class).mean()
print(f'baseline accuracy: {baseline_accuracy:.2%}')

### 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [None]:
test.head()

In [None]:
# For each sample build the pair (X, y): X holds every column except the
# target, y is the survived column on its own.

X_train, y_train = train.drop(columns=['survived']), train.survived

X_validate, y_validate = validate.drop(columns=['survived']), validate.survived

X_test, y_test = test.drop(columns=['survived']), test.survived

### Train Model

In [None]:
clf = DecisionTreeClassifier(max_depth=5, random_state=123)

### Fit The Model

In [None]:
clf = clf.fit(X_train, y_train)

### Visualize a Decision Tree

In [None]:
import graphviz
from graphviz import Graph

# Export the fitted tree clf as DOT source, labelling the split nodes with the
# training feature names, and wrap that source in a graphviz object.
dot_data = export_graphviz(clf, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

# NOTE(review): render(..., view=True) presumably writes the rendered file to
# the working directory and opens it in an external viewer — confirm that is
# wanted when the notebook is run top to bottom.
graph.render('titanic_decision_tree2', view=True)

### Make Predictions

In [None]:
y_pred = clf.predict(X_train)
y_pred[0:5]

In [None]:
### Estimate Probability

In [None]:
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:5]

In [None]:
train.head()

### 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report

### Evaluate the model

In [None]:
# Compute Accuracy
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

In [None]:
# Confusion Matrix
confusion_matrix(y_train, y_pred)

In [None]:
y_train.value_counts()

In [None]:
y_train.head()

In [None]:
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

### 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [None]:
print(classification_report(y_train, y_pred))

In [None]:
# Accuracy of model when run on the validate data
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(X_validate, y_validate)))

In [None]:
# And since accuracy isn't everything

# Produce y_predictions that come from the X_validate
y_pred = clf.predict(X_validate)

# Compare actual y values (from validate) to predicted y_values from the model run on X_validate
print(classification_report(y_validate, y_pred))

### 6. Which model performs better on your in-sample data?



Model with a depth of 5

### 7. Which model performs best on your out-of-sample data, the validate set?



Model with a depth of 5

# Random Forest Exercises

### 1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=10, 
                            random_state=123)

In [None]:
rf.fit(X_train, y_train)

In [None]:
print(rf.feature_importances_)

In [None]:
train.head()

In [None]:
y_pred = rf.predict(X_train)
y_pred[:5]

In [None]:
y_pred_proba = rf.predict_proba(X_train)
y_pred_proba[:5]

### 2. Evaluate your results using the model score, confusion matrix, and classification report.

In [None]:
print('Accuracy of random forest classifier on training set: {:.2f}' .format(rf.score(X_train, y_train)))

In [None]:
print(confusion_matrix(y_train, y_pred))

In [None]:
print(classification_report(y_train, y_pred))

### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [None]:
# Out-of-sample accuracy of the random forest. The validate split is what is
# scored here, so the printed label names it; the test split is not used.
print('Accuracy of random forest classifier on validate set: {:.2f}'
     .format(rf.score(X_validate, y_validate)))

# KNN Exercises

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

1. Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')

In [None]:
knn.fit(X_train, y_train)

In [None]:
y_pred = knn.predict(X_train)
y_pred

In [None]:
y_pred_proba = knn.predict_proba(X_train)

In [None]:
print('Accuracy of KNN classifier on training set: {:.2f}' .format(knn.score(X_train, y_train)))

2. Evaluate your results using the model score, confusion matrix, and classification report.

In [None]:
print(confusion_matrix(y_train, y_pred))

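Reading the matrix the way scikit-learn lays it out (rows = actual class, columns = predicted class) and treating "Not Survived" (0) as the positive class:

- TP = 204 (0, 0) Actual Not Survived, Predicted Not Survived
- FN = 33 (0, 1) Actual Not Survived, Predicted Survived
- FP = 49 (1, 0) Actual Survived, Predicted Not Survived
- TN = 112 (1, 1) Actual Survived, Predicted Survived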

In [None]:
print(classification_report(y_train, y_pred))

In [None]:
# Out-of-sample accuracy: the validate split is scored, so the label names it.
print('Accuracy of KNN classifier on validate set: {:.2f}' .format(knn.score(X_validate, y_validate)))

3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.
    

4. Set K = 10

In [None]:
# K = 10: fit on train, then report in-sample and out-of-sample results.
knn = KNeighborsClassifier(n_neighbors=10, weights='uniform')
knn.fit(X_train, y_train)

# In-sample predictions and class probabilities.
y_pred = knn.predict(X_train)
y_pred_proba = knn.predict_proba(X_train)

# In-sample evaluation.
print('Accuracy of KNN classifier on training set: {:.2f}' .format(knn.score(X_train, y_train)))
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

# Out-of-sample evaluation: the validate split is scored, so the label names it.
print('Accuracy of KNN classifier on validate set: {:.2f}' .format(knn.score(X_validate, y_validate)))

5. Set k = 20

In [None]:
# K = 20: fit on train, then report in-sample and out-of-sample results.
knn = KNeighborsClassifier(n_neighbors=20, weights='uniform')
knn.fit(X_train, y_train)

# In-sample predictions and class probabilities.
y_pred = knn.predict(X_train)
y_pred_proba = knn.predict_proba(X_train)

# In-sample evaluation.
print('Accuracy of KNN classifier on training set: {:.2f}' .format(knn.score(X_train, y_train)))
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

# Out-of-sample evaluation: the validate split is scored, so the label names it.
print('Accuracy of KNN classifier on validate set: {:.2f}' .format(knn.score(X_validate, y_validate)))

6. What are the differences in the evaluation metrics? Which performs better on your in sample data? Why?

It looks like as K increases, accuracy drops. So k=5 is better than k=20. 

7. Which model performs best on our out-of-sample data from validate?

KNN with K=5

# Logistic Regression Exercises

 In these exercises, we'll continue working with the titanic dataset and building logistic regression models. Throughout this exercise, be sure you are training, evaluating, and comparing models on the train and validate datasets. The test dataset should only be used for your final model.

In [1]:
import warnings
warnings.filterwarnings("ignore")

import acquire
import env

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer


In [2]:
def clean_titanic_data(df):
    '''
    Return a cleaned copy of the titanic dataframe.

    Steps, in order:
      1. drop duplicate observations,
      2. drop the columns not needed (deck, embarked, class),
      3. fill missing embark_town values with 'Southampton',
      4. append dummy columns for sex and embark_town (first level of
         each dropped), keeping the original sex and embark_town columns.

    The dataframe passed in is left untouched; every step works on a new
    frame. This also avoids the chained `df.col.fillna(..., inplace=True)`
    pattern, which does not write back to the frame when pandas
    Copy-on-Write is active.
    '''
    cleaned = (
        df.drop_duplicates()
        .drop(columns=['deck', 'embarked', 'class'])
    )
    # assign() on an existing column replaces it in place, so the column
    # order stays the same as before the fill.
    cleaned = cleaned.assign(
        embark_town=cleaned.embark_town.fillna(value='Southampton'))

    dummy_df = pd.get_dummies(cleaned[['sex', 'embark_town']], drop_first=True)
    return pd.concat([cleaned, dummy_df], axis=1)

In [3]:
def new_titanic_data():
    '''
    This function reads the passengers table of titanic_db into a df
    and returns the df. Nothing is written to disk.
    '''
    # Connection url for the titanic database.
    connection_url = get_connection('titanic_db')

    # Pull every column of every passenger row into a DataFrame.
    return pd.read_sql('SELECT * FROM passengers', connection_url)

In [4]:
def get_connection(db, user=env.user, host=env.host, password=env.password):
    '''
    Create the mysql+pymysql connection url for a database.

    db is the name of the database as a string; user, host and password
    default to the values stored in the env file.
    '''
    url = f'mysql+pymysql://{user}:{password}@{host}/{db}'
    return url

In [5]:
df = new_titanic_data()
df.head(2)

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0


In [6]:
df = clean_titanic_data(df)
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,0,3,male,22.0,1,0,7.25,Southampton,0,1,0,1
1,1,1,1,female,38.0,1,0,71.2833,Cherbourg,0,0,0,0
2,2,1,3,female,26.0,0,0,7.925,Southampton,1,0,0,1
3,3,1,1,female,35.0,1,0,53.1,Southampton,0,0,0,1
4,4,0,3,male,35.0,0,0,8.05,Southampton,1,1,0,1


In [7]:
df = df.drop(columns=['passenger_id', 'sex', 'embark_town'])
df.head(2)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,22.0,1,0,7.25,0,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0,0


In [8]:
df.isna().sum()

survived                     0
pclass                       0
age                        177
sibsp                        0
parch                        0
fare                         0
alone                        0
sex_male                     0
embark_town_Queenstown       0
embark_town_Southampton      0
dtype: int64

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   survived                 891 non-null    int64  
 1   pclass                   891 non-null    int64  
 2   age                      714 non-null    float64
 3   sibsp                    891 non-null    int64  
 4   parch                    891 non-null    int64  
 5   fare                     891 non-null    float64
 6   alone                    891 non-null    int64  
 7   sex_male                 891 non-null    uint8  
 8   embark_town_Queenstown   891 non-null    uint8  
 9   embark_town_Southampton  891 non-null    uint8  
dtypes: float64(2), int64(5), uint8(3)
memory usage: 58.3 KB


In [10]:
# Fill the missing age values with the median age of the whole dataframe,
# then re-check the non-null counts.
# NOTE(review): this runs before the train/validate/test split, so the median
# is computed from rows that later land in validate and test — consider
# computing it from the training sample only.
df.age = df.age.fillna(value=df.age.median())
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   survived                 891 non-null    int64  
 1   pclass                   891 non-null    int64  
 2   age                      891 non-null    float64
 3   sibsp                    891 non-null    int64  
 4   parch                    891 non-null    int64  
 5   fare                     891 non-null    float64
 6   alone                    891 non-null    int64  
 7   sex_male                 891 non-null    uint8  
 8   embark_town_Queenstown   891 non-null    uint8  
 9   embark_town_Southampton  891 non-null    uint8  
dtypes: float64(2), int64(5), uint8(3)
memory usage: 58.3 KB


In [11]:
def split_titanic_data(df):
    """
    Split df into train, validate and test samples, stratified on survived.

    20% of the rows are held out as test; 25% of the remaining rows become
    validate and the rest is train. Both cuts use random_state 123.
    Returns train, validate, test in that order.
    """
    train_and_validate, test = train_test_split(
        df,
        test_size = 0.2,
        random_state = 123,
        stratify = df.survived,
    )
    train, validate = train_test_split(
        train_and_validate,
        test_size=.25,
        random_state=123,
        stratify=train_and_validate.survived,
    )

    return train, validate, test

In [12]:
train, validate, test = split_titanic_data(df)

In [13]:
train.shape, validate.shape, test.shape

((534, 10), (178, 10), (179, 10))

In [14]:
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
455,1,3,29.0,0,0,7.8958,1,1,0,0
380,1,1,42.0,0,0,227.525,1,0,0,0
492,0,1,55.0,0,0,30.5,1,1,0,1
55,1,1,28.0,0,0,35.5,1,1,0,1
243,0,3,22.0,0,0,7.125,1,1,0,1


In [15]:
# Separate features from target for every sample:
# X_* keeps all columns but survived, y_* is the survived column alone.
y_train = train.survived
X_train = train.drop(columns=['survived'])

y_validate = validate.survived
X_validate = validate.drop(columns=['survived'])

y_test = test.survived
X_test = test.drop(columns=['survived'])

In [16]:
X_train.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
455,3,29.0,0,0,7.8958,1,1,0,0
380,1,42.0,0,0,227.525,1,0,0,0
492,1,55.0,0,0,30.5,1,1,0,1
55,1,28.0,0,0,35.5,1,1,0,1
243,3,22.0,0,0,7.125,1,1,0,1


# Setup Baseline

In [17]:
train.survived.value_counts()

0    329
1    205
Name: survived, dtype: int64

In [18]:
baseline_accuracy = (train.survived == 0).mean()
round(baseline_accuracy, 3)

0.616

1. Create a model that includes age in addition to fare and pclass. Does this model perform better than your baseline?

In [19]:
logit = LogisticRegression(random_state=123)
logit

LogisticRegression(random_state=123)

In [20]:
# Model 1: logistic regression restricted to three features.
features = ['age', 'fare', 'pclass']
# Fit on the training sample only, then predict that same sample (in-sample).
logit.fit(X_train[features], y_train)
y_pred = logit.predict(X_train[features])

# Report the baseline next to the model's training accuracy for comparison.
print("Baseline =", round(baseline_accuracy, 2))
print("Logistic Regression using age, fare, and pclass features")
print("Accuracy of Logistic Regression classifier on training set: {:.2f}" 
     .format(logit.score(X_train[features], y_train)))
# .score gives me the accuracy

Baseline = 0.62
Logistic Regression using age, fare, and pclass features
Accuracy of Logistic Regression classifier on training set: 0.71
