In [1]:
import numpy as np
import pandas as pd
from io import StringIO
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score

In [2]:
import acquire
import prepare

In [3]:
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Modeling Exercises:

1. What is your baseline prediction? What is your baseline accuracy? Remember: your baseline prediction for a classification problem is predicting the most prevalent class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

5. Run through steps 2-4 using a different max_depth value.

6. Which model performs better on your in-sample data?

7. Which model performs best on your out-of-sample data, the validate set?

____________________________

In [None]:
train, validate, test = prepare.prep_titanic(acquire.get_titanic_data())
train.head()

In [None]:
#get value counts of survived (1) vs did not survive (0)
train.survived.value_counts()

### #1. What is your baseline prediction? What is your baseline accuracy? Remember: your baseline prediction for a classification problem is predicting the most prevalent class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [None]:
#create the baseline: predict the most prevalent class (the mode of the
#training target) for every row, instead of hard-coding a class label
train["baseline"] = train.survived.mode()[0]
#share of training rows whose actual label equals the baseline prediction
baseline_accuracy = (train.survived == train.baseline).mean()
print(f'Baseline accuracy is: {baseline_accuracy:.2%}')

_________________________

### #2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [None]:
#identify columns that you want to use

#only using the following columns
X_cols = ['pclass', 'fare', 'alone', 'Q', 'S']

#only trying to see who survived or died
y_col = 'survived'

In [None]:
#separate the feature columns (X) from the target column (y) for each of the
#already-split train, validate, and test sets

X_train, y_train = train[X_cols], train[y_col]
X_validate, y_validate = validate[X_cols], validate[y_col]
X_test, y_test = test[X_cols], test[y_col]

In [None]:
#identify model 1: a decision tree with no depth limit (max_depth defaults to None)
#random_state is fixed so that re-running the notebook rebuilds the same tree
model1 = DecisionTreeClassifier(random_state=123)

In [None]:
#fit model 1 using train data
model1.fit(X_train, y_train)

In [None]:
#get scores for train and validate, not using test yet
print(f'training score: {model1.score(X_train, y_train):.2%}')
print(f'validate score: {model1.score(X_validate, y_validate):.2%}')

In [None]:
#store model 1's in-sample predictions as a new 'model1' column on train
train['model1'] = model1.predict(X_train)
train.head()

_________________

### #3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [None]:
#model 1 score
# code == model1.score(X_train, y_train)
print(f'model1 score: {model1.score(X_train, y_train):.2%}')

In [None]:
# confusion matrix
pd.crosstab(train.survived, train.model1)

In [None]:
#classification report
print(classification_report(train.survived, train.model1, zero_division =True))

__________________

### #4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [None]:
#to get percentage, normalize=true
pd.crosstab(train.survived, train.model1, normalize=True)

positive - died (0)
<br>
negative - survived (1)

- **True Positive** - 59.96%
- **False Positive** - 12.27% (predict they died, but they lived)
- **True Negative** - 25.96%
- **False Negative** - 1.81% (predict they lived, but they died)

In [None]:
pd.DataFrame(classification_report(train.survived, train.model1, zero_division =True, output_dict=True))

In [None]:
#Precision = TP / (TP+FP), treating died (0) as the positive class
#counts are read from the confusion matrix rather than typed in by hand, so
#they stay correct when the model or the split changes
#rows = actual [0, 1], columns = predicted [0, 1]
(tp, fn), (fp, tn) = confusion_matrix(train.survived, train.model1, labels=[0, 1])
precision = tp / (tp + fp)
precision

In [None]:
#computer is using the (1- survived) as a positive
precision_score(train.survived, train.model1)

In [None]:
#Recall = TP / (TP+FN), treating died (0) as the positive class
#counts are read from the confusion matrix rather than typed in by hand, so
#they stay correct when the model or the split changes
#rows = actual [0, 1], columns = predicted [0, 1]
(tp, fn), (fp, tn) = confusion_matrix(train.survived, train.model1, labels=[0, 1])
recall = tp / (tp + fn)
recall

In [None]:
#computer is using the (1- survived) as a positive
recall_score(train.survived, train.model1)

In [None]:
#Accuracy = (TP+TN) / (TP+TN+FN+FP)
#counts are read from the confusion matrix rather than typed in by hand, so
#they stay correct when the model or the split changes
#rows = actual [0, 1], columns = predicted [0, 1]
(tp, fn), (fp, tn) = confusion_matrix(train.survived, train.model1, labels=[0, 1])
accuracy = (tp + tn) / (tp + tn + fn + fp)
accuracy

In [None]:
accuracy_score(train.survived, train.model1)

In [None]:
#OR make classification_report its own value
cr= pd.DataFrame(classification_report(train.survived, train.model1, zero_division =True, output_dict=True))

In [None]:
#call classification variable- column 1
cr['0']

_________________________

### #5. Run through steps 2-4 using a different max_depth value.

In [None]:
#model 2: a decision stump (single split); random_state fixed for reproducibility
model2 = DecisionTreeClassifier(max_depth=1, random_state=123)
model2.fit(X_train, y_train)

#draw the fitted tree; class_names follow the label order 0 = died, 1 = survived
plt.figure(figsize=(24,12))

plot_tree(model2, feature_names=X_train.columns.tolist(), class_names=['died', 'survived'])
plt.show()

In [None]:
#max_depth = 1 score
# code == model2.score(X_train, y_train)
print(f'model2 score: {model2.score(X_train, y_train):.2%}')

##### Model 2 gives a score of 63.18% accuracy

In [None]:
#model 3: tree limited to three levels; random_state fixed for reproducibility
model3 = DecisionTreeClassifier(max_depth=3, random_state=123)
model3.fit(X_train, y_train)

#draw the fitted tree; class_names follow the label order 0 = died, 1 = survived
plt.figure(figsize=(24,12))

plot_tree(model3, feature_names=X_train.columns.tolist(), class_names=['died', 'survived'])
plt.show()

In [None]:
#max_depth = 3 score
# code == model3.score(X_train, y_train)
print(f'model3 score: {model3.score(X_train, y_train):.2%}')

##### Model 3 gives a score of 69.01% accuracy

____________________

### #6. Which model performs better on your in-sample data?

Model 1 (with default max_depth=None, i.e. no depth limit) did the best of all three models with an accuracy of 85.92%

___________________________

### #7 Which model performs best on your out-of-sample data, the validate set?

In [None]:
#do NOT refit on validate: model1 must stay fit on train only, otherwise the
#validate score stops being an out-of-sample score
print(classification_report(y_validate, model1.predict(X_validate)))

In [None]:
print(f'model1 score: {model1.score(X_validate, y_validate):.2%}')

In [None]:
#do NOT refit on validate: model2 must stay fit on train only, otherwise the
#validate score stops being an out-of-sample score
print(classification_report(y_validate, model2.predict(X_validate)))

In [None]:
print(f'model2 score: {model2.score(X_validate, y_validate):.2%}')

In [None]:
#do NOT refit on validate: model3 must stay fit on train only, otherwise the
#validate score stops being an out-of-sample score
print(classification_report(y_validate, model3.predict(X_validate)))

In [None]:
print(f'model3 score: {model3.score(X_validate, y_validate):.2%}')

#### Model 3 has the highest accuracy in validate set

<hr style="border:2px solid blue"> </hr>

# Random Forest Exercises:

1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.
<br>

2. Evaluate your results using the model score, confusion matrix, and classification report.
<br>

3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.
<br>

4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.
<br>

5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, classification_report

from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score

In [None]:
from pydataset import data
from sklearn.model_selection import train_test_split
import pandas as pd
import acquire
import prepare

In [None]:
train, validate, test = prepare.prep_titanic(acquire.get_titanic_data())

In [None]:
#get to know data
train.shape, validate.shape, test.shape

In [None]:
#only using the following columns
X_cols = ['pclass', 'fare', 'alone', 'Q', 'S']

#only trying to see who survived or died
y_col = 'survived'

X_train, y_train = train[X_cols], train[y_col]
X_validate, y_validate = validate[X_cols], validate[y_col]
X_test, y_test = test[X_cols], test[y_col]

In [None]:
#look to see if we have nulls or columns to drop
train.info()

#data looks to be ready

In [None]:
#take a look at X_train
X_train.head()

In [None]:
y_train.head()

#### #1 Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [None]:
#make our thing
clf= RandomForestClassifier(min_samples_leaf = 1, max_depth = 10, random_state= 123)

In [None]:
#fit the thing (ONLY on train set!!)
clf.fit(X_train, y_train)

In [None]:
#use the thing (on training set)
y_pred = clf.predict(X_train)

In [None]:
#shows an array of y_predictions
#y_pred

#### #2 Evaluate your results using the model score, confusion matrix, and classification report.

In [None]:
#model score
clf.score(X_train, y_train)

In [None]:
#model score for validate set
clf.score(X_validate, y_validate)

In [None]:
train['model5'] = clf.predict(X_train)
train.head()

In [None]:
#confusion matrix- created manually with crosstab
pd.crosstab(train.survived, train.model5)

In [None]:
#get percentage to answer question
pd.crosstab(train.survived, train.model5, normalize=True)

In [None]:
#classification report
pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))

#### #3 Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [None]:
print(f'Random Forest Model 5 Accuracy score is: {clf.score(X_train, y_train):.2%}')
print(f'Random Forest Model 5 Precision score is: {precision_score(train.survived, train.model5):.2%}')
print(f'Random Forest Model 5 Recall score is: {recall_score(train.survived, train.model5):.2%}')
print(f'Random Forest Model 5 F1 score is: {f1_score(train.survived, train.model5):.2%}')


In [None]:
#OR 
rf= pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))
rf['1']

- positive - survived (1)
- negative - died (0)
<br>

- True Positive - 27.57%
- False Positive - 3.82% (predict they lived, but they actually died)
- True Negative - 57.95%
- False Negative - 10.66% (predict they died, but they actually lived)

_________________________

#### #4 Run through steps increasing your min_samples_leaf and decreasing your max_depth

In [None]:
#second RF model: min_samples_leaf raised from 1 to 3
#NOTE(review): the exercise asks for a *decreased* max_depth, but 15 is larger
#than the first model's 10 -- confirm this is intentional
clf= RandomForestClassifier(min_samples_leaf = 3, max_depth = 15, random_state= 123)

In [None]:
#second RF model fit
clf.fit(X_train, y_train)

In [None]:
#second RF model: predict on the training set
y_pred = clf.predict(X_train)

In [None]:
#second RF model train score
clf.score(X_train, y_train)

In [None]:
#second RF model validate score
clf.score(X_validate, y_validate)

In [None]:
#second RF model- insert column
train['model6'] = clf.predict(X_train)
train.head()

In [None]:
#second RF model confusion matrix
pd.crosstab(train.survived, train.model6)

In [None]:
pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))

In [None]:
print(f'Random Forest Model 6 Accuracy score is: {clf.score(X_train, y_train):.2%}')
print(f'Random Forest Model 6 Precision score is: {precision_score(train.survived, train.model6):.2%}')
print(f'Random Forest Model 6 Recall score is: {recall_score(train.survived, train.model6):.2%}')
print(f'Random Forest Model 6 F1 score is: {f1_score(train.survived, train.model6):.2%}')

__________________________________

In [None]:
#third RF model
clf= RandomForestClassifier(min_samples_leaf = 2, max_depth = 20, random_state= 123)

In [None]:
#third RF model fit
clf.fit(X_train, y_train)

In [None]:
#third RF model: predict on the training set
y_pred = clf.predict(X_train)

In [None]:
#third RF model train score
clf.score(X_train, y_train)

In [None]:
#third RF model validate score
clf.score(X_validate, y_validate)

In [None]:
#third RF model- insert column
train['model7'] = clf.predict(X_train)
train.head()

In [None]:
#third RF model confusion matrix
pd.crosstab(train.survived, train.model7)

In [None]:
#third RF model metrics: computed from the model7 predictions stored just
#above (not the second model's model6 column)
print(f'Random Forest Model 7 Accuracy score is: {clf.score(X_train, y_train):.2%}')
print(f'Random Forest Model 7 Precision score is: {precision_score(train.survived, train.model7):.2%}')
print(f'Random Forest Model 7 Recall score is: {recall_score(train.survived, train.model7):.2%}')
print(f'Random Forest Model 7 F1 score is: {f1_score(train.survived, train.model7):.2%}')

In [None]:
#fourth RF model: min_samples_leaf = 1, max_depth = 13
clf= RandomForestClassifier(min_samples_leaf = 1, max_depth = 13, random_state= 123)

In [None]:
#fourth RF model fit (on the train set only)
clf.fit(X_train, y_train)

In [None]:
#fourth RF model: predict on the training set
y_pred = clf.predict(X_train)

In [None]:
#fourth RF model score on the train set
clf.score(X_train, y_train)

In [None]:
#fourth RF model score on the validate set
clf.score(X_validate, y_validate)

In [None]:
#fourth RF model- insert column
#NOTE: this reuses the 'model7' column name, so it overwrites the predictions
#stored there by the earlier RF model
train['model7'] = clf.predict(X_train)
train.head()

In [None]:
#fourth RF model confusion matrix (rows = actual, columns = predicted)
pd.crosstab(train.survived, train.model7)

In [None]:
print(f'Random Forest Model 7 Accuracy score is: {clf.score(X_train, y_train):.2%}')
print(f'Random Forest Model 7 Precision score is: {precision_score(train.survived, train.model7):.2%}')
print(f'Random Forest Model 7 Recall score is: {recall_score(train.survived, train.model7):.2%}')
print(f'Random Forest Model 7 F1 score is: {f1_score(train.survived, train.model7):.2%}')

________________________

#### #5 What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

- Having run through multiple max_depth and min_sample_leaf:

    - it appears that max_depth=13 is the optimal level, any max_depth above that gives SAME score.
    - min_samples_leaf=1 (default) gives the optimal level; any min_samples_leaf above that will give a lower score.
    
<br>

- clf= RandomForestClassifier(min_samples_leaf = 1, max_depth = 13, random_state= 123) <--- this gives highest score of **85.92%** accuracy in train set

- I would use Recall because we do not want to miss any positive cases

<hr style="border:2px solid blue"> </hr>

# KNN Exercise (May 24)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
import model_fun_cindy
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import numpy as np
import pandas as pd
from io import StringIO
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score

import acquire
import prepare

import warnings
warnings.filterwarnings('ignore')

Continue working in your model file with the titanic dataset.

1. Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

2. Evaluate your results using the model score, confusion matrix, and classification report.

3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

4. Run through steps 2-4 setting k to 10

5. Run through steps 2-4 setting k to 20

6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

7. Which model performs best on our out-of-sample data from validate?

### #1 Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [None]:
train, validate, test = prepare.prep_titanic(acquire.get_titanic_data())
train.head()

In [None]:
#only using the following columns
X_cols = ['pclass', 'fare', 'alone', 'age']

#only trying to see who survived or died
y_col = 'survived'

X_train, y_train = train[X_cols], train[y_col]
X_validate, y_validate = validate[X_cols], validate[y_col]
X_test, y_test = test[X_cols], test[y_col]

### #2 Evaluate your results using the model score, confusion matrix, and classification report.

In [None]:
#create the thing
knn = KNeighborsClassifier(n_neighbors=1)

#fit the thing
knn.fit(X_train, y_train)

#score the thing
knn.score(X_train, y_train)

In [None]:
#compare in-sample (train) and out-of-sample (validate) accuracy for k = 1..14
#the test set is deliberately NOT scored here: it is reserved for the final
#evaluation of the one chosen model, so it must not influence the choice of k
for k in range(1, 15):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    print(f'Model:{k}')
    print(f'training score: {knn.score(X_train, y_train):.2%}')
    print(f'validate score: {knn.score(X_validate, y_validate):.2%}')
    print ('________________________')

In [None]:
#compare k values by VALIDATE accuracy to choose k; selecting k on the test
#set would leak test information into the model choice
#NOTE(review): k=13 used in the next section was picked from a test-set run --
#re-confirm it against these validate scores
for k in range(1, 20):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    accuracy = knn.score(X_validate, y_validate)
    print(f'{k:2d}: {accuracy:.2%}')

### #3 Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [None]:
#create the thing
knn = KNeighborsClassifier(n_neighbors=13)

#fit the thing
knn.fit(X_train, y_train)

#score the thing
knn.score(X_train, y_train)

In [None]:
model_fun_cindy.model_performs(X_train, y_train, knn)

### #4 Run through steps 2-4 setting k to 10

In [None]:
#create the thing
knn2 = KNeighborsClassifier(n_neighbors=10)

#fit the thing
knn2.fit(X_train, y_train)

#score the thing
knn2.score(X_train, y_train)

In [None]:
model_fun_cindy.model_performs(X_validate, y_validate, knn2)

### #5 Run through steps 2-4 setting k to 20

In [None]:
#create the thing
knn3 = KNeighborsClassifier(n_neighbors=20)

#fit the thing
knn3.fit(X_train, y_train)

#score the thing
knn3.score(X_train, y_train)

In [None]:
model_fun_cindy.model_performs(X_validate, y_validate, knn3)

### #6 What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

In [None]:
import matplotlib.pyplot as plt

#plot validate accuracy for k = 1..19
#the sweep is scored on validate, not test, so the test set stays untouched
#until the final model is chosen
k_range = range(1, 20)
scores = []
for k in k_range:
    #use a loop-local name so the chosen `knn` model fitted earlier is not
    #overwritten by the last model of the sweep (k=19)
    knn_k = KNeighborsClassifier(n_neighbors = k)
    knn_k.fit(X_train, y_train)
    scores.append(knn_k.score(X_validate, y_validate))
plt.figure()
plt.title('KNN validate accuracy by k')
plt.xlabel('k')
plt.ylabel('accuracy')
plt.scatter(k_range, scores)
plt.xticks([0,5,10,15,20])
plt.show()

### #7 Which model performs best on our out-of-sample data from validate?

In [None]:
model_fun_cindy.model_performs(X_validate, y_validate, knn)

___________

<hr style="border:2px solid blue"> </hr>

# Logistic Regression Exercises

1. Create a model that includes age in addition to fare and pclass. Does this model perform better than your baseline?

2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

3. Try out other combinations of features and models.

4. Use your best 3 models to predict and evaluate on your validate sample.

5. Choose your best model from the validation performance, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import logistic_regression_util
import acquire
import prepare

In [9]:
df= acquire.get_titanic_data()
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [10]:
#fill missing ages with the mean age
avg_age = df.age.mean()
df['age'] = df.age.fillna(avg_age)

#encode gender column as a 0/1 integer (1 = female) rather than a boolean
df['is_female'] = (df.sex == 'female').astype(int)

In [12]:
df = df.drop(columns=['passenger_id', 'deck', 'class', 'embarked', 'sex', 'embark_town'])

In [16]:
train, validate, test = prepare.prep_titanic(acquire.get_titanic_data())
train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['age'] = imputer.transform(test[['age']])


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,class,embark_town,alone,Q,S
583,0,1,male,36.0,0,0,40.125,First,Cherbourg,1,0,0
337,1,1,female,41.0,0,0,134.5,First,Cherbourg,1,0,0
50,0,3,male,7.0,4,1,39.6875,Third,Southampton,0,0,1
218,1,1,female,32.0,0,0,76.2917,First,Cherbourg,1,0,0
31,1,1,female,29.916875,1,0,146.5208,First,Cherbourg,0,0,0


In [17]:
#split each data set into X (all columns except the target) and y (the target)

#train set
X_train = train.drop(columns=['survived'])
y_train = train.survived

#validate set
X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

#test set (the features belong in X_test; previously they were assigned to
#y_test and then immediately overwritten, so X_test was never created)
X_test = test.drop(columns=['survived'])
y_test = test.survived

### ##1 

In [25]:
#create the thing
logit = LogisticRegression(random_state = 123)
features= ['age', 'pclass', 'fare']

In [26]:
#fit the thing
logit.fit(X_train[features], y_train)

LogisticRegression(random_state=123)

In [27]:
y_pred =logit.predict(X_train[features])

In [28]:
print('Logistic Regression using age, pclass, and fare features')
print('accuracy of Logistic Regression classifier on training set: {:.2f}'.format(logit.score(X_train[features], y_train)))

Logistic Regression using age, pclass, and fare features
accuracy of Logistic Regression classifier on training set: 0.72


### #2


In [22]:
#create the thing
logit1 = LogisticRegression(random_state = 123)

features= ['age', 'pclass', 'fare', 'is_female']

In [29]:
#fit the thing
#the prep_titanic output shown above has a 'sex' column but no 'is_female',
#so encode it as a 0/1 integer on X_train before fitting (safe to re-run)
X_train = X_train.assign(is_female=(X_train.sex == 'female').astype(int))
logit1.fit(X_train[features], y_train)

In [None]:
print('Logistic Regression using age, pclass, fare and gender features')
print('accuracy of Logistic Regression classifier on training set: {:.2f}'.format(logit1.score(X_train[features], y_train)))

In [39]:
# Model 4: age as the sole feature, with balanced class weights
features = ['age']
logit4 = LogisticRegression(class_weight='balanced', random_state=123)
logit4 = logit4.fit(X_train[features], y_train)

# in-sample predictions and accuracy
y_pred = logit4.predict(X_train[features])
accuracy = logit4.score(X_train[features], y_train)

print('Logistic Regression using only age feature')
print('accuracy of Logistic Regression classifier on training set: {:.2f}'.format(accuracy))

Logistic Regression using only age feature
accuracy of Logistic Regression classifier on training set: 0.52


________________________________________

### once you've used SEVERAL models and picked your top 3

- y_pred = logit1.predict(X_validate[features])
<br>

- print('Logit1 model is using "???" features, class_weight "???", and all hyperparameters as default')
<br>

- print(classification_report(y_validate, y_pred))

### do NOT fit again!