# Import packages and verify versions

In [None]:
import pandas as pd
import sklearn
# Report the installed library versions so the run can be reproduced later.
print(f'The pandas version is {pd.__version__}.')
print(f'The scikit-learn version is {sklearn.__version__}.')

# Load data

### Import necessary IBM CoS libraries and insert customers.csv into the data frame.

This can be accomplished within Watson Studio: on the right-hand menu, find the applicable file and choose Pandas Data Frame

*Default is to insert into a data frame named df_data_1. When using a templated notebook, make sure that the cells below reference the correct data frame name OR change this cell to name the data frame in the same name used below*

### Insert the transactions.csv into a data frame

The same note regarding the default data frame name applies here. The libraries do not need to be re-imported (though the insert to code from the right hand menu may still add the import statement.)

### Print the number of rows for the customer data frame & transaction data frame
This is to confirm that the two data frames will join (same # of rows)
(hint: use the same key column to ensure the same number of rows)

# Join Data

Join the Customer and Transaction Data Frames together. Identify which key should be used and the type of join (inner, outer) to be used. *Do not forget to drop the index*

Display the number of rows joined and examine the newly joined data set. 
*Number of rows joined should be 788*

# Analyze Data

Look for column outliers or other quality issues, such as data type, that will affect the end model

*Should be clear to see that one of the columns contains data outliers*

### Check your data frame's data types

Are the types correct? You can either check the entire data frame, or specific columns that may have been identified from the df.head() command run earlier following the join

# Data Cleansing

### Let's start with dropping rows that have invalid data

We do this so that we don't use resources cleansing rows that we will ultimately discard due to invalid data

*We may also want to save the rows with invalid data for correction and then re adding them. For the purpose of this lab, the data will just be dropped*

Show rows with invalid data

Drop the rows with invalid data

Display the remaining number of rows in the data set. *Should be 761*

### Now we want to correct the data type of the column identified above.

Convert String Data to Numeric data using the .to_numeric function of pandas.

In [None]:
# Force the Frequency_score column to a numeric data type as it should be


### Check the new data type of the column we changed 

We want to make sure that the new data type is in line with similar columns

In [None]:
#Frequency_score is float data type, but should be integer


### Cast the column to the correct data type 
*hint - should be the same data type as a similarly named column*

In [None]:
# Cast Frequency_score as integer


### Now we want to remove the outliers from the column identified above in the .describe() command
We can see that 75% of the rows fall within the 1-4 number range. We want to capture as much of the data as applicable, so let's keep only the rows with a value less than 5 (i.e. drop the outlier rows with a value of 5 or more)

In [None]:
# Remove the rows with outliers in Monetary_score that we previously identified


### Let's spot check the new data frame and the specific column we removed the outliers from

# Now let's drop columns not being used as features
*The columns not being used are CustomerID, Invest, Educ, MARTIAL, TimeYears, Lasttrans, current, Monetary_score*

Once dropped, we will want to check the new data frame

# Convert numeric data to integer (some numeric columns were inferred as float64)
First, let's check the data type of the column(s).

In [None]:
# Retire column was inferred as a float data type


Let's create a list of column names based on numeric data types (*hint: float64 & int64*). We also want to create a list of the correct data type to correspond with the list of column names

Create a dictionary that will be used to set the numeric columns to integer type & Print the dictionary

Convert the numeric columns to integer

Check the data types to ensure all numeric data is int64

# Encode the string data

In [None]:
# Install the sklearn-pandas package that will be used to encode the categorical features.
# The leading "!" runs the line as a shell command from the notebook.
!pip install sklearn-pandas
# LabelEncoder ships with scikit-learn; DataFrameMapper comes from the
# sklearn-pandas package installed above.
from sklearn.preprocessing import LabelEncoder
from sklearn_pandas import DataFrameMapper

Encode the Churn label calling the new column CHURN and drop the original Churn column

Apply the LabelEncoder to encode the categorical features

# Prepare the data for machine learning

1. Split the label column out from the features dataframe
2. Sample the indexed DataFrame
3. Create a separate DataFrame from the label column and sample

In [None]:
# Split the label column out from the features dataframe


# Sample the indexed DataFrame


# Create training and test datasets

Split X and y Data Frames into training and testing sets - accomplished by importing train_test_split from the sklearn.model_selection library (code already included)

In [None]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split


Show the number of rows in each data set

# Train the model

The eval_metric parameter specifies the evaluation metrics for validation data
We will be using a Binary classification error rate. It is calculated as # of wrong cases divided by # of all cases. *(#wrong/#all)*
For the predictions, the evaluation will regard the instances with prediction value larger than 0.5 as positive instances and
the others as negative instances. 

In [None]:
import sklearn.pipeline
from xgboost import XGBClassifier

# Set the classifier, the steps and the pipeline


#Train the model


### Show model training parameters 

# Check model accuracy

1. import the required modules from the scikit-learn metrics package *(provided)*
2. make predictions for the test data
3. convert the numpy array returned from the predict function to a list
4. evaluate the predictions

In [None]:
#import required modules from the scikit-learn metrics package
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score

In [None]:
# make predictions for test data

# Convert numpy array to list


In [None]:
# evaluate predictions

# Show the accuracy


# Analyze Model - Feature Importance and Trees

In [None]:
# Show the feature importances - will result in a list of decimals


In [None]:
# Print the importance based on columns


In [None]:
# Import the helper used for the feature-importance plot
from xgboost import plot_importance
# NOTE(review): XGBClassifier is the class imported in the "Train the model"
# cell, not a trained model. plot_importance expects a fitted model instance
# (see the xgboost plotting API docs), so this call is expected to fail as
# written -- replace the argument with the variable holding your fitted
# classifier before running. TODO confirm the variable name used above.
plot_importance(XGBClassifier)

### Install graphviz for visual analysis & import matplotlib.pyplot *(provided)*

In [None]:
# Install the graphviz package from pip (run as a shell command via "!").
# NOTE(review): presumably this installs only the Python package; the Graphviz
# system binaries may also be needed for rendering -- confirm in your environment.
!pip install graphviz
# pyplot is used as `plt` by the plotting cells that follow.
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Investigate model

In [None]:
# Plot and display the performance evaluation


# Set the figure for matplotlib figures (provided)
fig, ax = plt.subplots(1, 1, sharex=True, figsize=(8, 6))

# Each recorded error value is turned into 1 - error and drawn as one curve
# per evaluation set; `eval` and `eval_steps` must be defined earlier in this cell.
for series_key, series_label in (('validation_0', 'Train'), ('validation_1', 'Test')):
    accuracy_per_step = [1 - err for err in eval[series_key]['error']]
    ax.plot(eval_steps, accuracy_per_step, label=series_label)

ax.set_xlabel('Number of iterations')
ax.set_title('Accuracy')
ax.legend()

# Avoid Overfitting By Limiting Number of Trees

In [None]:
# ntree_limit limits the number of trees used in the prediction; it defaults to 0 (use all trees), we want to set the limit to 10


In [None]:
# Check the accuracy of the trained model


# Avoid Overfitting By Early Stopping

In [None]:
# Validation error needs to decrease at least every <early_stopping_rounds> round(s) to continue training
# Returns the model from the last iteration (not the best one) 


In [None]:
# Show best score


In [None]:
# Show best number of trees


In [None]:
# Check the accuracy of the trained model with early stopping


# Plot Model Performance

In [None]:
# Print the confusion matrix


In [None]:
# Plot the confusion matrix
# `cm` is not defined in this cell; presumably it is the confusion matrix
# computed in the "Print the confusion matrix" cell above -- confirm.
plt.matshow(cm)
plt.title('Confusion matrix')
# Colour scale for the matrix drawn by matshow.
plt.colorbar()
# Axis labels as written: y axis = true label, x axis = predicted label.
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
# Plot the ROC curve
# NOTE(review): roc_curve is most informative when given scores/probabilities.
# If y_pred holds hard 0/1 class labels the curve has only one interior point --
# confirm what y_pred contains (TODO).
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
plt.plot(fpr, tpr)
# Both rates are fractions, so pin the axes to the unit square.
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
# The model trained in this notebook is an XGBoost classifier, not a logistic
# regression, so the title names the right model.
plt.title('ROC curve (XGBoost)')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid(True)
# Explicit show(), consistent with the confusion-matrix cell.
plt.show()

In [None]:
# Print out AUC, the percentage of the ROC plot that is underneath the curve
