# In these exercises, we'll continue working with the titanic dataset and building logistic regression models. Throughout this exercise, be sure you are training, evaluation, and comparing models on the train and validate datasets. The test dataset should only be used for your final model.

In [1]:
import pydataset
from env import get_db_url

import pandas as pd
import numpy as np

from prepare import prep_titanic
from prepare import titanic_split

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

from acquire import get_titanic_data

In [2]:
df = prep_titanic()
df['baseline_prediction'] = 0
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,alone,sex_male,embarked_Q,embarked_S,baseline_prediction
0,0,0,3,male,22.0,1,0,7.25,S,0,1,0,1,0
1,1,1,1,female,38.0,1,0,71.2833,C,0,0,0,0,0
2,2,1,3,female,26.0,0,0,7.925,S,1,0,0,1,0
3,3,1,1,female,35.0,1,0,53.1,S,0,0,0,1,0
4,4,0,3,male,35.0,0,0,8.05,S,1,1,0,1,0


In [3]:
og_df = get_titanic_data()
og_df.head(100)
og_df.age[df['age'].notnull()].median()

28.0

In [4]:
df['age'] = df['age'].fillna(28.00)

In [5]:
df.survived.value_counts()

0    549
1    342
Name: survived, dtype: int64

## Baseline Accuracy

In [6]:
baseline_accuracy = (df.survived == df.baseline_prediction).mean()
baseline_accuracy

0.6161616161616161

In [7]:
df['fare'] = df['fare'].astype('float64')

In [8]:
# df.pclass.get_dummies()
# Assuming df is your DataFrame
dummy_df = pd.get_dummies(df['pclass'], prefix='pclass')

# Concatenate the dummy variables with the original DataFrame
df = pd.concat([df, dummy_df], axis=1)

In [9]:
df.columns

Index(['passenger_id', 'survived', 'pclass', 'sex', 'age', 'sibsp', 'parch',
       'fare', 'embarked', 'alone', 'sex_male', 'embarked_Q', 'embarked_S',
       'baseline_prediction', 'pclass_1', 'pclass_2', 'pclass_3'],
      dtype='object')

## Train, Validate, Test

In [10]:
df, train, validate, test = titanic_split(df)

In [11]:
# inspect
train.info(), validate.info(), test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 534 entries, 455 to 496
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   passenger_id         534 non-null    int64  
 1   survived             534 non-null    int64  
 2   pclass               534 non-null    int64  
 3   sex                  534 non-null    object 
 4   age                  534 non-null    float64
 5   sibsp                534 non-null    int64  
 6   parch                534 non-null    int64  
 7   fare                 534 non-null    float64
 8   embarked             534 non-null    object 
 9   alone                534 non-null    int64  
 10  sex_male             534 non-null    uint8  
 11  embarked_Q           534 non-null    uint8  
 12  embarked_S           534 non-null    uint8  
 13  baseline_prediction  534 non-null    int64  
 14  pclass_1             534 non-null    uint8  
 15  pclass_2             534 non-null    u

(None, None, None)

## * For all of the models you create, choose a threshold that optimizes for accuracy.

# Create a new notebook, logistic_regression, use it to answer the following questions:

## 1. Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?

## Removing the lesser columns

In [12]:
# create X & y version of train/validate/test
# where X contains the features we want to use and y is a series with just the target variable

X_train = train.drop(columns=['passenger_id', 'survived', 'sex', 'sibsp', 'parch', 'embarked', 'alone', 'sex_male', 'embarked_Q', 
        'embarked_S', 'baseline_prediction', 'pclass_1', 'pclass_2', 'pclass_3'])
y_train = train.survived
X_validate = validate.drop(columns=['passenger_id', 'survived', 'sex', 'sibsp', 'parch', 'embarked', 'alone', 'sex_male', 'embarked_Q', 
        'embarked_S', 'baseline_prediction', 'pclass_1', 'pclass_2', 'pclass_3'])
y_validate = validate.survived
X_test = test.drop(columns=['passenger_id', 'survived', 'sex',  'sibsp', 'parch', 'embarked', 'alone', 'sex_male', 'embarked_Q', 
        'embarked_S', 'baseline_prediction', 'pclass_1', 'pclass_2', 'pclass_3'])
y_test = test.survived

In [13]:
def Log_Regression(c, k, X, y):
    # c=C
    # k=sclaing
    logit = LogisticRegression(C=c, 
                            class_weight={0:1, 1:99}, 
                            random_state=123, 
                            intercept_scaling=k, 
                            solver='lbfgs')

    
    logit.fit(X, y)
    
    y_trn_pred = logit.predict(X)
    
    y_val_pred = logit.predict(X_validate)
    
    y_trn_proba = logit.predict_proba(X)
    
    y_val_proba = logit.predict_proba(X_validate)
    
    
    return y_trn_pred, y_val_pred, y_trn_proba, c, logit

In [14]:
y_trn_pred, y_val_pred, y_trn_proba, c, logit = Log_Regression(1, 1, X_train, y_train)

## X_train Accuracy VS Baseline Accuracy

In [15]:
X_train.columns

Index(['pclass', 'age', 'fare'], dtype='object')

In [16]:
trn_score = logit.score(X_train, y_train)
trn_score, baseline_accuracy

(0.3838951310861423, 0.6161616161616161)

### * conclusion: This model preforms less than baseline.

## 2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [17]:
X_train['sex_male'] = df['sex_male']
X_validate['sex_male'] = df['sex_male']
X_test['sex_male'] = df['sex_male']

In [18]:
X_train.columns, X_validate.columns, X_test.columns

(Index(['pclass', 'age', 'fare', 'sex_male'], dtype='object'),
 Index(['pclass', 'age', 'fare', 'sex_male'], dtype='object'),
 Index(['pclass', 'age', 'fare', 'sex_male'], dtype='object'))

## X_train Accuracy VS Baseline Accuracy

In [20]:
y_trn_pred, y_val_pred, y_trn_proba, c, logit = Log_Regression(1, 1, X_train, y_train)

In [21]:
trn_score = logit.score(X_train, y_train)
trn_score, baseline_accuracy

(0.3838951310861423, 0.6161616161616161)

## 3. Try out other combinations of features and models.

### * Make all possible combnations

In [29]:
c = [.001, .01, .1, 1, 10, 100, 1000]


# storing my values to use them to create dictiojnaries containg 
# all the collected data for head to head comaprison
coefficient_list = []
train_score_list = [] 
val_score_list = []
trn_report_list = []
val_report_list = []
diff_list = []

for c in c:
        
    y_trn_pred, y_val_pred, y_trn_proba, c, logit = Log_Regression(1, 1, X_train, y_train)
#         trn_score, trn_cm, trn_cmdf, trn_report = decision_metrics(X_train, y_train, y_pred, k, l)
#         val_score, val_cm, val_cmdf, val_report = decision_metrics(X_validate, y_validate, y_pred, k, l)
    # Compute score
    trn_score = logit.score(X_train, y_train)
    val_score = logit.score(X_validate, y_validate)
    trn_report = classification_report(y_train, y_trn_pred)
    val_report = classification_report(y_validate, y_val_pred)
#         trn_report_list.append(trn_report)
    val_report_list.append(val_report)
    train_score_list.append(trn_score)
    val_score_list.append(val_score)
    coefficient_list.append(c)


    print(f'Training Dataset Model Coefficient of {c},')

    # Print the model's accuracy and other information
    print(f"Model's Accuracy: {trn_score}")
    print(f"Difference between Model and Basleine Accuracy: {trn_score - baseline_accuracy}")
    print('Train Classification Report')
    print(trn_report)
    print()
    print('           VS             ')
    print()
    print(f'Validation Dataset Model Coefficient of {c},')
    print(f"Model's Accuracy: {val_score}")
    print(f"Difference between Model and Basleine Accuracy: {val_score - baseline_accuracy}")
    print('Validate Classification Report')
    print(val_report)
    print()
    print(f'Difference bewtween Training and Validate:{trn_score-val_score}')
    #print(val_report)
    print()
    print('----------------------------------------------------')
    print()

    # Increment 'c' for the next iteration
    c *= 10

    diff = trn_score-val_score
    diff_list.append(diff)

# create dictionaries for my data collection
train_dict = {x : y for x, y in zip(coefficient_list,  train_score_list)}
val_dict = {x : y for x, y in zip(coefficient_list, val_score_list)}
diff_dict = {x : y for x, y in zip(coefficient_list, diff_list)}

model_number = 1
for key, value in diff_dict.items():
    if value == min(diff_dict.values()):
        print()
        print(f'Top Model #{model_number}')
        print('*************FINAL ANALYSIS*************')
        print()
        print(f'''Top performing Training Model:
        Max Depth:{key}
        Accuracy:{max(train_dict.values())}
        
Top performing Validation Model:

        Max Depth:{key}
        Accuracy:{max(val_dict.values())}
        Difference:{min(diff_dict.values())}''')
        model_number+=1

Training Dataset Model Coefficient of 1,
Model's Accuracy: 0.3838951310861423
Difference between Model and Basleine Accuracy: -0.23226648507547382
Train Classification Report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       329
           1       0.38      1.00      0.55       205

    accuracy                           0.38       534
   macro avg       0.19      0.50      0.28       534
weighted avg       0.15      0.38      0.21       534


           VS             

Validation Dataset Model Coefficient of 1,
Model's Accuracy: 0.38202247191011235
Difference between Model and Basleine Accuracy: -0.23413914425150378
Validate Classification Report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       110
           1       0.38      1.00      0.55        68

    accuracy                           0.38       178
   macro avg       0.19      0.50      0.28       178
weighted avg    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

## 4. Use you best 3 models to predict and evaluate on your validate sample.

## 5. Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?