In [1]:
## download our dataset
#!kaggle datasets download -d blastchar/telco-customer-churn

In [2]:
## lets unzip the file
#!unzip telco-customer-churn.zip 

### Initial data preparation 

In [3]:
## import libraries
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline 

In [4]:
## read the dataset
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [5]:
## lets check the number of rows in the dataset
len(df)


7043

In [6]:
## lets examine a couple of rows 
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [7]:
## lets transpose the dataframe to view all columns
df.head().T

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [8]:
## lets check the column types of the dataframe
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [9]:
## convert TotalCharges from text to numbers; entries that cannot be
## parsed become NaN and are then replaced with 0

total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges.fillna(0)

In [10]:
## normalize the column names of the dataframe:
## lowercase every name and replace spaces with underscores

df.columns = [name.lower().replace(' ', '_') for name in df.columns]
df.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [11]:
## normalize the values of every string (object) column:
## lowercase them and replace spaces with underscores

string_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']

for name in string_columns:
    lowered = df[name].str.lower()
    df[name] = lowered.str.replace(' ', '_')

df.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-vhveg,female,0,yes,no,1,no,no_phone_service,dsl,no,...,no,no,no,no,month-to-month,yes,electronic_check,29.85,29.85,no
1,5575-gnvde,male,0,no,no,34,yes,no,dsl,yes,...,yes,no,no,no,one_year,no,mailed_check,56.95,1889.5,no
2,3668-qpybk,male,0,no,no,2,yes,no,dsl,yes,...,no,no,no,no,month-to-month,yes,mailed_check,53.85,108.15,yes
3,7795-cfocw,male,0,no,no,45,no,no_phone_service,dsl,yes,...,yes,yes,no,no,one_year,no,bank_transfer_(automatic),42.3,1840.75,no
4,9237-hqitu,female,0,no,no,2,yes,no,fiber_optic,no,...,no,no,no,no,month-to-month,yes,electronic_check,70.7,151.65,yes


In [12]:
df.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges        float64
churn                object
dtype: object

In [13]:
## encode the target as an integer: 1 where churn is 'yes', 0 otherwise

df['churn'] = df['churn'].eq('yes').astype(int)
df['churn'].head()

0    0
1    0
2    1
3    0
4    1
Name: churn, dtype: int64

In [14]:
## lets divide our dataset for the purposes of training into three parts:
## - validation set: 20% of all rows (the first hold-out below)
## - test set: 33% of the remaining 80%, i.e. about 26% of all rows
## - training set: the rest, about 54% of all rows
## NOTE(review): this is not an exact 60/20/20 split; a second test_size of
## 0.25 would give 60/20/20 - confirm which proportions are wanted
from sklearn.model_selection import train_test_split

df_train_full, df_valid = train_test_split(df, test_size=0.2, random_state=1)

df_train, df_test = train_test_split(df_train_full, test_size=0.33, random_state=11)



In [15]:
## report how many rows ended up in each split
for label, part in (('Training', df_train), ('Testing', df_test), ('Validation', df_valid)):
    print(f'{label} dataset: {len(part)}')


Training dataset: 3774
Testing dataset: 1860
Validation dataset: 1409


In [16]:
## pull the target values out of each split as NumPy arrays
y_train, y_valid, y_test = (
    part.churn.values for part in (df_train, df_valid, df_test)
)

In [17]:
## remove the target column from every split so that it
## cannot be picked up as an input feature
for split in (df_train, df_valid, df_test):
    del split['churn']

### Exploratory data analysis

In [18]:
## lets check for missing values in the dataset
df_train_full.isnull().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [19]:
## lets check the distribution of the values in the target variables
df_train_full.churn.value_counts()

0    4113
1    1521
Name: churn, dtype: int64

In [20]:
## check the proportion of churned users against all customers:
## 1521 churned customers out of 4113 + 1521 = 5634, the counts
## shown by value_counts() in the previous cell
1521 / 5634

0.26996805111821087

In [21]:
## the mean of a 0/1 column is the share of ones, i.e. the churn rate
global_churn_rate = df_train_full['churn'].mean()
round(global_churn_rate, 2)

0.27

In [22]:
## names of the categorical and numerical variables in the dataset

categorical_var = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]

numerical_var = [
    'tenure',
    'totalcharges',
    'monthlycharges',
]

In [23]:
## let check how many unique values each categorical variables has
df_train_full[categorical_var].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

## Feature Engineering 

### One-hot encoding for categorical variables
- The goal is to convert our categorical variables into matrix form, i.e. to encode them.
- We are going to use a DictVectorizer that takes dictionary and vectorizes it (creates vectors)
- We first begin by converting our dataframe to a list of dictionaries. 
- Next we use the DictVectorizer, by creating an instance of it and fitting it to the list of dictionaries. 
- Lastly, we use the transform method to convert the dictionaries to matrix form

In [27]:
## turn every training row into a {feature: value} dictionary,
## keeping only the categorical and numerical model features
model_features = categorical_var + numerical_var
train_dict = df_train[model_features].to_dict(orient='records')

In [28]:
## import DictVectorizer
from sklearn.feature_extraction import DictVectorizer

## one-hot encode the training records: learn the feature-to-column
## mapping and build the dense training matrix in a single step
dict_vec = DictVectorizer(sparse=False)
X_train = dict_vec.fit_transform(train_dict)

In [29]:
## get the feature names of all columns
#dict_vec.get_feature_names()

## Machine learning for classification

### Logistic regression
- Logistic regression is a classification model: it is used to predict a class rather than an arbitrary number. 
- It performs binary classification on the target variable.
- The output of a logistic regression is a probability: the probability that the observation is positive (1) rather than negative (0). <br>


### Training a logistic regression

In [30]:
## import the logistic regression model
from sklearn.linear_model import LogisticRegression

## create the classifier with the liblinear solver and a fixed random_state
clf = LogisticRegression(solver='liblinear', random_state=1)
## fit it to the training matrix and the training labels
clf.fit(X_train, y_train)

LogisticRegression(random_state=1, solver='liblinear')

### Lets check how well our model performed. 
To check how well our model performed, we use the validation data to obtain the probability of churn for each customer in the validation dataset. 

In [31]:
## encode the validation rows with the vectorizer fitted on the training data

## select the model features and turn each row into a dictionary
valid_features = df_valid[categorical_var + numerical_var]
valid_dict = valid_features.to_dict(orient='records')
## transform only (no refit) with the already-fitted vectorizer
X_valid = dict_vec.transform(valid_dict)

In [32]:
## lets make our predictions
y_pred = clf.predict_proba(X_valid)[:,1]

In [33]:
## lets convert our predictions (soft) into a hard predictions
## by setting a threshold value for churn

churn = y_pred >= 0.5 


In [34]:
## lets cal the accuracy of our model 
## by comparing it predictions to our actual value

(y_valid == churn).mean()

0.8069552874378992

### Using the model 

In [35]:
## lets get the customer data
customer = {
'customerid': '8879-zkjof',
'gender': 'female',
'seniorcitizen': 0,
'partner': 'no',
'dependents': 'no',
'tenure': 41,
'phoneservice': 'yes',
'multiplelines': 'no',
'internetservice': 'dsl',
'onlinesecurity': 'yes',
'onlinebackup': 'no',
'deviceprotection': 'yes',
'techsupport': 'yes',
'streamingtv': 'yes',
'streamingmovies': 'yes',
'contract': 'one_year',
'paperlessbilling': 'yes',
'paymentmethod': 'bank_transfer_(automatic)',
'monthlycharges': 79.85,
'totalcharges': 3320.75,
}

In [36]:
## encode the single customer as a one-row matrix and read the
## value in column 1 of predict_proba for that row
## NOTE(review): the 'customerid' key is not among the features dict_vec was
## fitted on; presumably DictVectorizer ignores it - confirm in its docs
X_test = dict_vec.transform([customer])
clf.predict_proba(X_test)[0, 1]

0.07332111084949638

In [37]:
customer = {
'gender': 'female',
'seniorcitizen': 1,
'partner': 'no',
'dependents': 'no',
'phoneservice': 'yes',
'multiplelines': 'yes',
'internetservice': 'fiber_optic',
'onlinesecurity': 'no',
'onlinebackup': 'no',
'deviceprotection': 'no',
'techsupport': 'no',
'streamingtv': 'yes',
'streamingmovies': 'no',
'contract': 'month-to-month',
'paperlessbilling': 'yes',
'paymentmethod': 'electronic_check',
'tenure': 1,
'monthlycharges': 85.7,
'totalcharges': 85.7
}


In [38]:
X_test = dict_vec.transform([customer])
clf.predict_proba(X_test)[0,1]

0.8321656556055403

### Target variable analysis

### Classification accuracy
Accuracy is defined as the percentage of correct predictions a model makes. 

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
thresholds = np.linspace(0, 1, 11)

In [None]:
## accuracy of the model at each threshold, printed as "threshold accuracy"
for t in thresholds:
    ## score the thresholded predictions directly instead of storing them in
    ## `churn`, which holds the 0.5-threshold predictions from an earlier cell
    acc = accuracy_score(y_valid, y_pred >= t)
    print('%0.2f %0.3f' % (t, acc))

In [None]:
## accuracy at each threshold, collected for the accuracy-vs-threshold plot
thresholds = np.linspace(0, 1, 11)
accuracies = []

for t in thresholds:
    ## score the thresholded predictions directly instead of storing them in
    ## `churn`, which holds the 0.5-threshold predictions from an earlier cell
    acc = accuracy_score(y_valid, y_pred >= t)
    accuracies.append(acc)

In [None]:
## accuracy as a function of the decision threshold
plt.plot(thresholds, accuracies)
plt.xlabel('Threshold')
plt.ylabel('Accuracy')
plt.title('Accuracy vs threshold')
plt.show()



### Dummy baseline 

### Confusion table 
A confusion table refers to a table that concisely represents 
every possible outcome for our model predictions. <br>

TRUE POSITIVE: We predicted a true outcome, and it turned out to be true. <br>
FALSE POSITIVE: We predicted a true outcome, and it turned out to be false. <br>
TRUE NEGATIVE: We predicted a false outcome, and it turned out to be false. <br>
FALSE NEGATIVE: We predicted a false outcome, and it turned out to be true. <br>

### Calculating the confusion table with NumPy


In [None]:
## make hard predictions at threshold 0.6
t = 0.6 
predicted_churn = (y_pred >= t)
predicted_no_churn = (y_pred < t)

## boolean masks for the actual target values
actual_churn = (y_valid == 1)
actual_no_churn = (y_valid == 0)

## true positives: predicted churn, actually churned
true_positive = (predicted_churn & actual_churn).sum()
## false positives: predicted churn, actually did not churn
false_positive = (predicted_churn & actual_no_churn).sum()

## true negatives: predicted no churn, actually did not churn
true_negative = (predicted_no_churn & actual_no_churn).sum()

## false negatives: predicted no churn, actually churned
false_negative = (predicted_no_churn & actual_churn).sum()

In [None]:
## arrange the four counts in the layout used by scikit-learn's
## confusion_matrix: rows are the actual class, columns the predicted class
##
##                    predicted no churn   predicted churn
## actual no churn            TN                  FP
## actual churn               FN                  TP
confusion_table = np.array([
    [true_negative, false_positive],
    [false_negative, true_positive]
])

print(confusion_table)

In [None]:
confusion_table / confusion_table.sum()

### Precision and Recall
- Precision <br>
The precision of a model tells us how many of the positive predictions turned out to be correct. <br>
It is the fraction of correctly predicted positive examples among all examples predicted as positive. <br>
P = TP / (TP + FP) <br>

- Recall <br>
Recall is the fraction of correctly classified positive examples among all positive examples. <br>
R = TP / (TP + FN)

In [None]:
## precision: share of the customers predicted to churn that actually churned
precision = true_positive / (true_positive + false_positive)
## use a percent format specifier: round(x, 2) * 100 can print
## floating-point noise such as 56.99999999999999
print(f'Precision Score: {precision:.0%}')

In [None]:
## recall: share of the customers who actually churned that the model caught
recall = true_positive / (true_positive + false_negative)
## use a percent format specifier: round(x, 2) * 100 can print
## floating-point noise such as 56.99999999999999
print(f'Recall Score: {recall:.0%}')

## ROC curve and AUC score
The receiver operating characteristic (ROC) curve is used to show how well a model separates two classes, positive and negative. <br>
To do this, ROC needs two metrics, which are the true positive rate (TPR) and the false positive rate (FPR). <br>

### True positive rate and false positive rate 

- TPR: is the fraction of true positive among all positive examples. 
TPR = TP / (TP + FN) 
- FPR: is the fraction of false positive among all negative examples. 

FPR = FP / (FP + TN)

In [None]:
## true positive rate: share of the actual churners predicted as churn
true_positive_rate = true_positive / (true_positive + false_negative)
print(f'TPR: {true_positive_rate:.0%}')
## false positive rate: share of the actual non-churners predicted as churn
false_positive_rate = false_positive / (false_positive + true_negative)
## this value is the FPR, so it must not be printed under the TPR label
print(f'FPR: {false_positive_rate:.0%}')

### Evaluating a model at multiple thresholds

In [None]:
## evaluate the confusion-table counts over a grid of 101 thresholds
thresholds = np.linspace(0, 1, 101)

## the actual classes do not depend on the threshold, so mask them once
actual_pos = (y_valid == 1)
actual_neg = (y_valid == 0)

scores = []
for t in thresholds:
    predicted_pos = (y_pred >= t)
    predicted_neg = (y_pred < t)

    tp = (predicted_pos & actual_pos).sum()
    fp = (predicted_pos & actual_neg).sum()
    fn = (predicted_neg & actual_pos).sum()
    tn = (predicted_neg & actual_neg).sum()

    ## one (threshold, tp, fp, fn, tn) record per threshold
    scores.append((t, tp, fp, fn, tn))

In [None]:
## turn the list of tuples into a dataframe with named columns
df_scores = pd.DataFrame(scores, columns=['threshold', 'tp', 'fp', 'fn', 'tn'])

In [None]:
df_scores[::10]

In [None]:
## rates derived from the counts:
## TPR = TP / (TP + FN) and FPR = FP / (FP + TN)
actual_positives = df_scores['tp'] + df_scores['fn']
actual_negatives = df_scores['fp'] + df_scores['tn']
df_scores['tpr'] = df_scores['tp'] / actual_positives
df_scores['fpr'] = df_scores['fp'] / actual_negatives

In [None]:
df_scores[::10]

In [None]:
## TPR and FPR of the model as functions of the decision threshold
plt.plot(df_scores.threshold, df_scores.tpr, label='TPR')
plt.plot(df_scores.threshold, df_scores.fpr, label='FPR')
plt.xlabel('Threshold')
plt.title('TPR and FPR: model')
plt.legend()

### Random baseline model 
A random model outputs a random score between 0 and 1, regardless of the input. 

In [None]:
np.random.seed(1)
##generates an array with random numbers b/n 0 and 1
y_rand = np.random.uniform(0, 1, size=len(y_valid))

In [None]:
##function for calculating TPR and FPR at different thresholds 
def tpr_fpr_dataframe(y_val, y_pred):
    """Tabulate confusion-table counts, TPR and FPR over 101 thresholds.

    Parameters
    ----------
    y_val : array-like of 0/1
        Actual labels; 1 marks the positive class, 0 the negative class.
    y_pred : array-like of float
        Predicted scores, aligned element-wise with ``y_val``.

    Returns
    -------
    pandas.DataFrame
        One row per threshold in ``np.linspace(0, 1, 101)`` with the columns
        ``threshold``, ``tp``, ``fp``, ``fn``, ``tn``, ``tpr`` and ``fpr``.
        ``tpr`` (``fpr``) is NaN when ``y_val`` holds no positive (negative)
        examples, because the denominator is then zero.
    """
    ## the labels must come from the `y_val` argument; reading the
    ## notebook-level `y_valid` here would ignore what the caller passed in
    y_val = np.asarray(y_val)
    y_pred = np.asarray(y_pred)

    ## the actual classes do not depend on the threshold, so mask them once
    actual_pos = (y_val == 1)
    actual_neg = (y_val == 0)

    scores = []
    for t in np.linspace(0, 1, 101):
        ## confusion-table counts for the predictions made at this threshold
        predicted_pos = (y_pred >= t)
        predicted_neg = (y_pred < t)

        tp = (predicted_pos & actual_pos).sum()
        fp = (predicted_pos & actual_neg).sum()
        fn = (predicted_neg & actual_pos).sum()
        tn = (predicted_neg & actual_neg).sum()

        scores.append((t, tp, fp, fn, tn))

    ## turn the list of tuples into a dataframe with named columns
    df_scores = pd.DataFrame(scores, columns=['threshold', 'tp', 'fp', 'fn', 'tn'])

    ## TPR = TP / (TP + FN) and FPR = FP / (FP + TN)
    df_scores['tpr'] = df_scores.tp / (df_scores.tp + df_scores.fn)
    df_scores['fpr'] = df_scores.fp / (df_scores.fp + df_scores.tn)

    return df_scores
    

In [None]:
df_rand = tpr_fpr_dataframe(y_valid, y_rand)

In [None]:
## repeats the seed, the random draw and the tpr_fpr_dataframe call from the
## two cells above, then shows every 10th row of the result
np.random.seed(1)
y_rand = np.random.uniform(0, 1, size=len(y_valid))
df_rand = tpr_fpr_dataframe(y_valid, y_rand)
df_rand[::10]

In [None]:
## TPR and FPR of the random baseline as functions of the decision threshold
plt.plot(df_rand.threshold, df_rand.tpr, label='TPR')
plt.plot(df_rand.threshold, df_rand.fpr, label='FPR')
plt.xlabel('Threshold')
plt.title('TPR and FPR: random baseline')
plt.legend()

### The ideal model 
The ideal model always makes the correct decision.

In [None]:
## build a perfectly ordered set of labels and scores: all negatives come
## first and the scores increase steadily from 0 to 1
num_pos = (y_valid == 1).sum()
num_neg = (y_valid == 0).sum()

y_ideal = np.concatenate([
    np.zeros(num_neg, dtype=int),
    np.ones(num_pos, dtype=int),
])
y_pred_ideal = np.linspace(0, 1, len(y_ideal))

df_ideal = tpr_fpr_dataframe(y_ideal, y_pred_ideal)

In [None]:
## TPR and FPR of the ideal model as functions of the decision threshold
plt.plot(df_ideal.threshold, df_ideal.tpr, label='TPR')
plt.plot(df_ideal.threshold, df_ideal.fpr, label='FPR')
plt.xlabel('Threshold')
plt.title('TPR and FPR: ideal model')
plt.legend()

### ROC Curve 
The goal is to have our model between these two curves (ideal and random model). <br>
We want our model to be as close to the ideal curve as possible and as far as possible <br>
from the random curve. 

We create an ROC curve by plotting the FPR and TPR against each other. 
For comparison, we also add the ideal and random models to the plot: <br>


In [None]:
plt.figure(figsize=(5, 5))

plt.plot(df_scores.fpr, df_scores.tpr, label='Model', color='black')
plt.plot(df_rand.fpr, df_rand.tpr, label='Random', linestyle='dashed' )
plt.plot(df_ideal.fpr, df_ideal.tpr, label='Ideal', linestyle='solid')

plt.xlabel('False positive rate')
plt.ylabel('True positive rate')


plt.legend()

In [None]:
#help(plt.xlabel)
#help(plt.plot)

In [None]:
plt.figure(figsize=(5, 5))
plt.plot(df_scores.fpr, df_scores.tpr, label='Model')
plt.plot([0,1], [0, 1])

plt.xlabel('False positive rate')
plt.ylabel('True positive rate')

plt.title('ROC Curve')

plt.show()

### ROC Curve Plotting Using Scikit-learn 

### Area under the ROC curve (AUC)
When evaluating our model using the ROC curve, we want the model to be as close to the ideal spot as possible,<br>
and as far from the random baseline as possible. <br>
We can measure the area under the ROC curve to quantify this "closeness".<br>
AUC is also a metric for evaluating the performance of a binary classification model. <br> 
Note:  An AUC of 0.9 is indicative of a reasonably good model; 0.8 is <br>
okay, 0.7 is not very performant, and 0.6 indicates quite poor performance. <br>
ROC curves and AUC scores tell us how well the model separates positive and negative examples. <br>
What is more, AUC has a nice probabilistic interpretation: it tells us what <br> 
the probability is that a randomly selected positive example will have a score higher<br>
than a randomly selected negative example.<br>

In [None]:
from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_valid, y_pred)

In [None]:
## ROC curve from scikit-learn's roc_curve output
plt.figure(figsize=(5,5))
plt.plot(fpr, tpr, label='Model')
## diagonal reference line from (0, 0) to (1, 1)
plt.plot([0,1], [0, 1], label='Random', linestyle='dashed')

## fpr is plotted on the x-axis and tpr on the y-axis
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')

## the title belongs in plt.title; plt.show does not take a title
plt.title('ROC Curve')
plt.legend()

plt.show()

### Using ROC Curve To Compare Multiple Models

In [None]:
fpr_large, tpr_large, _ =  roc_curve(y_valid, y_pred)

#fpr_small, tpr_small, _ = roc_curve(y_valid, y_pred_small)

In [None]:
## ROC curve of the large model against the diagonal reference line
plt.figure(figsize=(5,5))

plt.plot(fpr_large, tpr_large, label='Large')
plt.plot([0, 1], [0, 1], label='Random', linestyle='dashed')

## fpr_large is plotted on the x-axis and tpr_large on the y-axis
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

plt.title('ROC Curve')
## without a legend the curve labels are never shown
plt.legend()
plt.show()

In [None]:
## Method 1: area under the curve through the hand-computed ROC points
## (fpr as the x values, tpr as the y values)
from sklearn.metrics import auc
auc(df_scores.fpr, df_scores.tpr)

In [None]:
## Method 2: compute the AUC directly from the validation labels
## and the predicted scores
from sklearn.metrics import roc_auc_score

roc_auc_score(y_valid, y_pred)

In [None]:
## Parameter tuning 


In [None]:
### K-fold cross-validation

In [None]:
## model training function:
## converts the data into its one-hot encoding representation,
## then trains a logistic regression on the encoded matrix

def train(df, y, C=1.0):
    """Fit a DictVectorizer and a logistic regression on ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Customer rows; must contain every column listed in
        ``categorical_var`` and ``numerical_var``. Other columns are not used.
    y : array-like
        Target values aligned row by row with ``df``.
    C : float, default 1.0
        Value passed to ``LogisticRegression`` as ``C``. The default equals
        scikit-learn's own default, so ``train(df, y)`` behaves as before.

    Returns
    -------
    tuple
        ``(dv, log_reg_model)``: the fitted vectorizer and the fitted model.
    """
    records = df[categorical_var + numerical_var].to_dict(orient='records')

    ## learn the feature-to-column mapping and encode the rows in one step
    dv = DictVectorizer(sparse=False)
    X = dv.fit_transform(records)

    ## same solver and random_state as the standalone classifier `clf`
    log_reg_model = LogisticRegression(solver='liblinear', C=C, random_state=1)
    log_reg_model.fit(X, y)

    return dv, log_reg_model

In [None]:
## prediction function: takes a dataframe with customers, a vectorizer
## that was fitted previously and a fitted model; it applies the vectorizer
## to the dataframe to get a matrix and then applies the model to that
## matrix to get the predictions
def predict(df, dv, log_reg_model):
    """Return column 1 of ``predict_proba`` for every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Customer rows; must contain every column listed in
        ``categorical_var`` and ``numerical_var``.
    dv : object with a ``transform`` method
        Vectorizer applied to the list of row dictionaries.
    log_reg_model : object with a ``predict_proba`` method
        Model applied to the matrix produced by ``dv``.

    Returns
    -------
    array
        ``log_reg_model.predict_proba(X)[:, 1]``, one value per row.
    """
    feature_names = categorical_var + numerical_var
    records = df[feature_names].to_dict(orient='records')

    class_probabilities = log_reg_model.predict_proba(dv.transform(records))
    return class_probabilities[:, 1]

In [None]:
## implement K-fold cross-validation
## import kfolds
from sklearn.model_selection import KFold 

## split the data into 10 shuffled parts with a fixed random_state
kfold = KFold(n_splits=10, shuffle=True, random_state=1)

## list for storing the validation AUC of every fold
aucs = []
## iterate over the 10 different splits of the data
for train_idx, valid_idx in kfold.split(df_train_full):

    ## split the data into train and validation sets for this fold
    ## NOTE: these names overwrite the notebook-level df_train / df_valid /
    ## y_train / y_valid / y_pred created by the earlier cells
    df_train = df_train_full.iloc[train_idx]
    df_valid = df_train_full.iloc[valid_idx]

    y_train = df_train.churn.values
    y_valid = df_valid.churn.values

    ## train the model on the fold and predict on its validation part
    dv, log_reg_model = train(df_train, y_train)
    y_pred = predict(df_valid, dv, log_reg_model)

    ## keep the fold score in `fold_auc`: binding it to `auc` would shadow
    ## the `auc` function imported from sklearn.metrics in an earlier cell
    fold_auc = roc_auc_score(y_valid, y_pred)
    ## save the fold's AUC to the list with the results
    aucs.append(fold_auc)
    