In [None]:
# Import packages for data analysis

# Check the versions of key python libraries
# scipy
import scipy
print('scipy: %s' % scipy.__version__)
# numpy
import numpy as np
print('numpy: %s' % np.__version__)
# matplotlib
import matplotlib
print('matplotlib: %s' % matplotlib.__version__)
# pandas
import pandas as pd
print('pandas: %s' % pd.__version__)
# statsmodels
import statsmodels
print('statsmodels: %s' % statsmodels.__version__)
# scikit-learn
import sklearn
from sklearn.metrics import f1_score, fbeta_score, accuracy_score, mean_squared_error 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
print('sklearn: %s' % sklearn.__version__)
import datetime
from time import time

In [None]:
# Import packages for data visualization and exploration

###########################################
# Suppress warnings (originally added to silence matplotlib user warnings
# emitted by newer matplotlib versions).
# NOTE(review): filterwarnings("ignore") is called without a category or
# module filter, so it hides ALL Python warnings for the rest of the session,
# including deprecation warnings raised by other libraries.
import warnings
warnings.filterwarnings("ignore")
#
# Display inline matplotlib plots with IPython
# (programmatic form of the `%matplotlib inline` magic)
from IPython import get_ipython
get_ipython().run_line_magic('matplotlib', 'inline')
###########################################

# Import libraries for data visualization
import itertools
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.patches as mpatches
import seaborn as sns
plt.style.use('fivethirtyeight')
sns.set()

# Pretty display for notebooks
%matplotlib inline

# Optional global parameter tuning for data visualization 

#from pylab import rcParams
#matplotlib.rcParams['axes.labelsize'] = 14
#matplotlib.rcParams['xtick.labelsize'] = 12
#matplotlib.rcParams['ytick.labelsize'] = 12
#matplotlib.rcParams['text.color'] = 'k'
#rcParams['figure.figsize'] = 18, 8

# Package for fast EDA
import pandas_profiling

## Feature Set Exploration and Data Cleaning

In [None]:
# Load and display the first five rows of the data set
# TODO: replace 'file_name.csv' with the real path and, if the file needs it,
# pass header=<row number or None> and index_col=<column label or position>.
# The former `squeeze=` option no longer exists in pandas 2.x; call
# `.squeeze("columns")` on the result to turn a one-column frame into a Series.
df = pd.read_csv('file_name.csv')
df.head()

In [None]:
# Explore the data types and the number of non-null observations in the columns of the dataset.
# df.info() prints its summary itself, so the cell shows text output rather than a table.
df.info()

In [None]:
# Explore the basic statistical characteristics of the features.
# `include` restricts the summary to object (string), integer and float columns;
# columns of any other dtype are left out of the table.
df.describe(include=['object','int','float'])

In [None]:
# Basic data exploration: build a pandas-profiling report for the whole frame.
# The report is the last expression of the cell, so the notebook renders it as output.
pandas_profiling.ProfileReport(df)

### Summary of the feature set:

**BALANCE:** Numeric. Float. Total amount owed to the company. No missing values. Minimum value: 0, maximum value: 19043.13856. Mean: 1564.474828. Right-skewed. 0.9\% of the values are zero.

**BALANCE_FREQ:** Numeric. Float. Conjecture: The percentage of time there is a positive balance in the account. No missing values. Min: 0, Max: 1, Mean: 0.85. Left-skewed. 69.4\% of the values are 1 and 0.9\% are 0.

### Missing Data:

In [None]:
# SimpleImputer replaces missing values by the mean, median, most frequent value or a constant.
# Here every NaN entry is replaced by the mean of its column (strategy='mean').
# NOTE(review): `X` is not defined in the visible part of the notebook - presumably
# the numeric feature matrix to impute; confirm before running.
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
# The imputed array is only displayed here; it is not stored in a variable.
imp_mean.fit_transform(X)

In [None]:
# Iterative imputer to replace missing data.
# At each step, a feature column is designated as output y and the other feature columns are treated as inputs X.
# A regressor is fit on (X, y) for known y. Then, the regressor is used to predict the missing values of y.
# This is done for each feature in an iterative fashion, and then is repeated for max_iter imputation rounds.
# The results of the final imputation round are returned.
# The enable_iterative_imputer import must run first: it is what makes IterativeImputer importable.
from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer
imp = IterativeImputer(max_iter=10, random_state=0)
# NOTE(review): `X` is not defined in the visible part of the notebook; the result is displayed, not stored.
imp.fit_transform(X)

In [None]:
# Fit a first-degree polynomial (a straight line) of one variable to the known values
# of another variable, so that the line can be used to fill in its missing values.
# NOTE(review): X_ynotnull / y_notnull are not defined in the visible notebook - presumably the
# predictor and the target restricted to the rows where the target is not null; confirm.

fitting_log=np.polynomial.polynomial.Polynomial.fit(X_ynotnull,y_notnull,
                                         deg=1)
# See the intercept and slope of the linear fit.
# convert() maps the fitted coefficients back to the unscaled x domain; they are ordered
# from the lowest degree upwards, i.e. [intercept, slope].
intercept, slope=fitting_log.convert().coef
print('intercept:',intercept,'slope:',slope)

In [None]:
# Estimate the null values of the target (minimum payments) as slope*BALANCE+intercept
y_null=slope*X_ynull+intercept    

# Show the number of missing MINIMUM_PAYMENTS left
# NOTE(review): y_null is computed above but is not written back into `y` in this cell,
# so this count stays unchanged unless the assignment happens somewhere else.
y.isnull().sum()

## Exploratory Data Analysis and Visualizations

### Analysis on Numeric Features

In [1]:
# Draw a heatmap of Pearson correlations between all numeric features
def correlation_matrix(dataset):
    """
    Plot an annotated heatmap of the pairwise correlations between the
    int/float columns of `dataset`. Nothing is returned.
    """
    numeric_names = list(dataset.select_dtypes(include=['int', 'float']).columns)
    correlations = dataset[numeric_names].corr()
    sns.heatmap(correlations, cmap='viridis_r', annot=True)

In [2]:
# Fancy
def analyze_numeric_features(dataset,target):
    """
    Draw two horizontal histograms for every numeric feature.

    For each int/float column of `dataset` the left panel shows the number of
    observations per bin and the right panel shows the mean of `target` over
    the observations that fall in that bin. Bins have width (max - min) / 14
    and start at the column minimum. Rows with a missing feature value are
    left out of that feature's panels.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature set; only its int/float columns are plotted.
    target : array-like
        One numeric value per row of `dataset`. Values are matched to rows by
        position, so a pandas index on `target` is ignored.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If `target` does not hold exactly one value per row of `dataset`.
    """
    # Define the color for the plots below
    base_color = sns.color_palette()[0]
    # Identify numeric features in the data set
    numeric_columns = dataset.select_dtypes(include=['int','float']).columns.values.tolist()
    numeric_column_number = len(numeric_columns) # Number of numeric features
    if numeric_column_number == 0:
        return  # nothing to plot; plt.subplots(0, 2) would raise
    # Work on a plain array so that the division below is positional and never
    # index-aligned against the bin labels.
    target_values = np.asarray(target, dtype=float).ravel()
    if target_values.size != len(dataset):
        raise ValueError("target must hold exactly one value per row of dataset")
    axis_length = 5 # Base axis length for each graph
    total_y_axis_length = axis_length*numeric_column_number
    # squeeze=False keeps `ax` two-dimensional even for a single numeric column
    f, ax = plt.subplots(numeric_column_number, 2,
                         figsize=(2.5*axis_length, total_y_axis_length), squeeze=False)
    for i, label in enumerate(numeric_columns):
        ax[i,0].set_xlabel('Count')
        ax[i,0].set_ylabel(label)
        ax[i,1].set_xlabel('Mean Target')
        observed = dataset[label].notna().to_numpy()
        values = dataset[label].to_numpy(dtype=float)[observed]
        if values.size == 0:
            continue  # the column holds only missing values
        row_targets = target_values[observed]
        low, high = values.min(), values.max()
        extra = (high-low)/14
        if extra > 0:
            bin_edges = np.arange(low, np.ceil(high)+extra, extra)
            # guard against rounding leaving the maximum just outside the last bin
            bin_edges[-1] = max(bin_edges[-1], high)
        else:
            # constant column: a zero step would break np.arange, use one unit-wide bin
            bin_edges = np.array([low-0.5, low+0.5])
        # Bin membership with the same convention the histogram uses:
        # [left, right) for every bin, with the last bin closed on the right.
        bin_idxs = np.clip(np.digitize(values, bin_edges)-1, 0, len(bin_edges)-2)
        pts_per_bin = np.bincount(bin_idxs, minlength=len(bin_edges)-1)
        # Weighting each row by target / (rows in its bin) makes every bar the bin's mean target
        num_var_weights = row_targets/pts_per_bin[bin_idxs]
        ax[i,0].hist(values, bins=bin_edges, orientation='horizontal', color=base_color)
        ax[i,1].hist(values, bins=bin_edges, weights=num_var_weights,
                     orientation='horizontal', color=base_color)
    f.tight_layout()
    plt.show()
    return


def numeric_scatterplots(dataset):
    """
    Draw scatter plots of all combinations of numeric features

    One panel (scatter plus fitted regression line) is drawn for every
    unordered pair of int/float columns of `dataset`, stacked vertically.
    Nothing is drawn when there are fewer than two numeric columns.

    NOTE(review): a function with this same name is defined in another cell of
    this notebook as well; whichever cell runs last wins. Consider keeping a
    single definition.
    """
    from itertools import combinations
    base_color = sns.color_palette()[0]
    numeric_columns = dataset.select_dtypes(include=['int','float']).columns.values.tolist()
    pairs = list(combinations(numeric_columns, 2))
    if not pairs:
        return  # plt.subplots(0, 1) would raise
    axis_length = 5 # Base axis length for each graph
    total_y_axis_length = axis_length*len(pairs)
    # squeeze=False keeps `ax` indexable even when there is only one pair
    f, ax = plt.subplots(len(pairs), 1, figsize=(1.5*axis_length, total_y_axis_length), squeeze=False)
    for i, (x, y) in enumerate(pairs):
        # x and y are passed by keyword: recent seaborn releases reject positional data vectors
        sns.regplot(x=dataset[x], y=dataset[y], color=base_color, scatter_kws={'alpha':0.1}, ax=ax[i, 0])

In [None]:
#Simpler

def analyze_features(dataset):
    """
    Draw a horizontal count histogram for every column of `dataset`.

    Every column is plotted, so all columns must hold numeric data. Missing
    values are ignored. Each column is split into bins of width
    (max - min) / 14 starting at the column minimum; a constant column gets a
    single unit-wide bin.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are all numeric.

    Returns
    -------
    None
    """
    # Define the color for the plots below
    base_color = sns.color_palette()[0]
    columns = list(dataset.columns)
    if not columns:
        return  # nothing to plot; plt.subplots(0, 1) would raise
    axis_length = 8 # Base axis length for each graph
    total_y_axis_length = axis_length*len(columns)
    # squeeze=False keeps `ax` two-dimensional even when there is a single column
    f, ax = plt.subplots(len(columns), 1, figsize=(axis_length, total_y_axis_length), squeeze=False)
    for i, label in enumerate(columns):
        panel = ax[i, 0]
        panel.set_xlabel('Count')
        panel.set_ylabel(label)
        values = dataset[label].dropna().to_numpy(dtype=float)
        if values.size == 0:
            continue  # the column holds only missing values
        low, high = values.min(), values.max()
        extra = (high-low)/14
        if extra > 0:
            bin_edges = np.arange(low, np.ceil(high)+extra, extra)
            # guard against rounding leaving the maximum just outside the last bin
            bin_edges[-1] = max(bin_edges[-1], high)
        else:
            # a zero step would break np.arange
            bin_edges = np.array([low-0.5, low+0.5])
        panel.hist(values, bins=bin_edges, orientation='horizontal', color=base_color)
    f.tight_layout()
    plt.show()
    return

def numeric_scatterplots(dataset):
    """
    Draw scatter plots of all combinations of numeric features

    One panel (scatter plus fitted regression line) is drawn for every
    unordered pair of int/float columns of `dataset`, stacked vertically.
    Nothing is drawn when there are fewer than two numeric columns.

    NOTE(review): a function with this same name is defined in another cell of
    this notebook as well; whichever cell runs last wins. Consider keeping a
    single definition.
    """
    from itertools import combinations
    base_color = sns.color_palette()[0]
    numeric_columns = dataset.select_dtypes(include=['int','float']).columns.values.tolist()
    pairs = list(combinations(numeric_columns, 2))
    if not pairs:
        return  # plt.subplots(0, 1) would raise
    axis_length = 5 # Base axis length for each graph
    total_y_axis_length = axis_length*len(pairs)
    # squeeze=False keeps `ax` indexable even when there is only one pair
    f, ax = plt.subplots(len(pairs), 1, figsize=(1.5*axis_length, total_y_axis_length), squeeze=False)
    for i, (x, y) in enumerate(pairs):
        # x and y are passed by keyword: recent seaborn releases reject positional data vectors
        sns.regplot(x=dataset[x], y=dataset[y], color=base_color, scatter_kws={'alpha':0.1}, ax=ax[i, 0])

In [None]:
# Set the parameters for a detailed barchart which shows the categories of a variable sorted by mean target variable
# NOTE(review): `country_income` is not defined in the visible notebook - presumably a frame
# with a 'native-country' column and a 0/1 'income' column; confirm.
base_color = sns.color_palette()[0]
# Average only the target column: taking .mean() of the whole grouped frame
# fails in recent pandas when the frame also holds non-numeric columns.
group_means = country_income.groupby(['native-country'])[['income']].mean()
order = group_means.sort_values(['income'], ascending=False).index

# Draw the barchart (7 wide by 12 high), categories ordered from highest to lowest mean target
fig, ax = plt.subplots(figsize=(7, 12))
sns.barplot(x=country_income['income'], y=country_income['native-country'],
            order=order, color=base_color, ax=ax)
ax.set_xlabel('Mean Target Variable (Income=1 if >=50K, 0 otherwise)')

In [3]:
# Visualize skewed continuous features of original and transformed data
from math import ceil
def distribution(data, features,transformed=False):
    """
    Visualization code for displaying skewed distributions of features

    Draws one 25-bin histogram per entry of `features` on a two-column grid.
    The y-axis of every panel is capped at 2000 records and the top tick is
    labelled ">2000". The grid has eight rows for up to 16 features (the
    original fixed layout) and grows, together with the figure height, when
    more features are requested.

    Parameters
    ----------
    data : pandas.DataFrame or mapping
        Indexed by feature name to obtain the values to plot.
    features : iterable of str
        Names of the features to plot, in plotting order.
    transformed : bool, default False
        Selects the figure title: log-transformed (True) or skewed (False).

    Returns
    -------
    None
    """

    # Define the color for the plot
    base_color = sns.color_palette()[0]

    features = list(features)
    n_cols = 2
    n_rows = max(8, ceil(len(features)/n_cols))

    # Create figure; 35 is the height used for the eight-row layout
    fig = plt.figure(figsize=(11, 35*n_rows/8))

    # Skewed feature plotting
    for i, feature in enumerate(features):
        ax = fig.add_subplot(n_rows, n_cols, i+1)
        ax.hist(data[feature], bins = 25, color = base_color)
        ax.set_title("%s Distribution"%(feature), fontsize = 14)
        ax.set_xlabel("Value")
        ax.set_ylabel("Number of Records")
        ax.set_ylim((0, 2000))
        ax.set_yticks([0, 500, 1000, 1500, 2000])
        ax.set_yticklabels([0, 500, 1000, 1500, ">2000"])

    # Plot aesthetics
    if transformed:
        title = "Log-transformed Distributions of Features"
    else:
        title = "Skewed Distributions of Features"
    fig.suptitle(title, fontsize = 12, y = 1.03)

    fig.tight_layout()
    # plt.show() instead of fig.show(): Figure.show() is meant for GUI backends
    plt.show()

In [None]:
# Log-transform the skewed features
# np.log1p(x) computes log(1 + x) and stays accurate for values close to zero.
# NOTE(review): `credit_card_cleaned` is not defined in the visible notebook - presumably an
# all-numeric frame with values greater than -1; confirm.
features_log_transformed = credit_card_cleaned.apply(np.log1p)

# Visualize the new log distributions
distribution(features_log_transformed, features_log_transformed.columns,transformed = True)

In [None]:
# Normalize (MinMaxScaler) or standardize (StandardScaler) numerical features
from sklearn.preprocessing import MinMaxScaler

# Initialize a scaler, then apply it to the features
scaler = MinMaxScaler() # default=(0, 1)
# Alternative: StandardScaler(), which also lives in sklearn.preprocessing
# NOTE(review): the numeric column names are taken from `features_raw`, which is not defined
# in the visible notebook; they must also exist in `features_log_transformed`.
numerical = features_raw.select_dtypes(include=['int','float']).columns.values.tolist()

# Work on an explicit copy: wrapping the frame in pd.DataFrame(data=...) does not
# guarantee independent data, so the assignment below could otherwise overwrite
# the log-transformed frame as well and make the cell non-repeatable.
features_log_minmax_transform = features_log_transformed.copy()
features_log_minmax_transform[numerical] = scaler.fit_transform(features_log_transformed[numerical])

# Show 5 examples of records with scaling applied
display(features_log_minmax_transform.head(n = 5))

### Analysis on Categorical Features

In [None]:
def analyze_categorical_features(dataset,target):
    """
    Draw two bar charts for every categorical (object) column of `dataset`.

    The left panel shows the relative frequency of each category (bars ordered
    from most to least frequent, each annotated with its percentage of all
    rows); the right panel shows the mean of `target` per category in the same
    order.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature set; only its object-dtype columns are plotted.
    target : array-like
        Numeric target with one value per row of `dataset`; passed to
        seaborn's barplot as the x variable.

    Returns
    -------
    None
    """
    # Define the color for the plot
    base_color = sns.color_palette()[0]
    # Identify categorical features in the data set
    cat_columns = dataset.select_dtypes(include=['object']).columns.values.tolist()
    cat_column_number = len(cat_columns) # Number of categorical features
    if cat_column_number == 0:
        return  # nothing to plot; plt.subplots(0, 2) would raise
    axis_length = 5 # Base axis length for each graph
    total_y_axis_length = axis_length*cat_column_number
    # squeeze=False keeps `ax` two-dimensional even for a single categorical column
    f, ax = plt.subplots(cat_column_number, 2,
                         figsize=(2.5*axis_length, total_y_axis_length), squeeze=False)
    n_points = dataset.shape[0]
    for i, label in enumerate(cat_columns):
        count_ax, mean_ax = ax[i, 0], ax[i, 1]
        # Get the frequency order from high to low frequency for nominal variables
        cat_counts = dataset[label].value_counts()
        order = cat_counts.index
        sns.countplot(data=dataset, y=label, order=order, color=base_color, ax=count_ax)
        sns.barplot(x=target, y=dataset[label], order=order, color=base_color, ax=mean_ax)
        mean_ax.set_xlabel('Mean Income (Income=1 if >=50K, 0 otherwise)')

        # Change the absolute frequency bar chart to relative frequency
        limit = count_ax.get_xlim()[1]/n_points
        tick_props = np.arange(0, limit+0.3, 0.2)
        tick_names = ['{:.1f}'.format(v) for v in tick_props]
        count_ax.set_xticks(tick_props*n_points)
        count_ax.set_xticklabels(tick_names)
        count_ax.set_xlabel('Frequency')

        # Annotate every bar with its share of all rows. Counts are looked up
        # by the category itself (bars are drawn in `order`), not by the tick
        # label text, so non-string categories work as well.
        for loc, category in zip(count_ax.get_yticks(), order):
            count = cat_counts.loc[category]
            pct_string = '{:0.1f}%'.format(100*count/n_points)
            # Print the annotation to the right of the end of the bar
            count_ax.text(count+(limit+0.25)*n_points*0.1, loc, pct_string, ha = 'center', color = 'black')

    f.tight_layout()
    plt.show()
    return

In [None]:
# One-hot encode the categorical columns of 'features_log_minmax_transform' with pandas.get_dummies()
features_final = pd.get_dummies(features_log_minmax_transform)

# Turn the raw income labels into a 0/1 target: 1 for '>50K', 0 for anything else
income = income_raw.eq('>50K').astype(int)

# Report how many features exist after one-hot encoding, then list their names
encoded = features_final.columns.values.tolist()
n_encoded = len(encoded)
print("{} total features after one-hot encoding.".format(n_encoded))

print(encoded)

## Train-Test Split

In [None]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Hold out 20% of the encoded features and of the income target for testing;
# the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features_final,
    income,
    test_size=0.2,
    random_state=0,
)

# Report the size of both parts
n_train_rows, n_test_rows = X_train.shape[0], X_test.shape[0]
print("Training set has {} samples.".format(n_train_rows))
print("Testing set has {} samples.".format(n_test_rows))

## Evaluate Model Performance

In [4]:
# Baseline: a naive predictor that labels every record as positive.
# 'income' is the 0/1 encoding of 'income_raw' produced in the preprocessing step.
TN = 0 # the naive predictor never predicts a negative
FN = 0 # the naive predictor never predicts a negative
TP = np.sum(income) # every actual positive is predicted positive
FP = income.count() - TP # every actual negative is predicted positive as well

# Precision, recall and accuracy from the confusion-matrix counts
precision = TP/(TP+FP)
recall = TP/(TP+FN)
accuracy = (TP+TN)/(TP+FP+TN+FN)

# F-beta score; beta = 0.5 puts more weight on precision than on recall
beta=0.5
fscore =(1+beta**2)*precision*recall/((beta**2*precision)+recall) 

# Print the results 
print("Naive Predictor: [Accuracy score: {:.4f}, F-score: {:.4f}]".format(accuracy, fscore))

NameError: name 'np' is not defined

In [5]:
# Training and prediction pipeline
def train_predict(learner, sample_size, X_train, y_train, X_test, y_test,
                  train_eval_size=300, beta=0.5):
    '''
    Fit `learner` on the first `sample_size` training rows, then time and score
    its predictions on the test set and on the head of the training set.

    Relies on `time` (from the time module) and on `accuracy_score` /
    `fbeta_score` (from sklearn.metrics) being imported at notebook level.

    inputs:
       - learner: the learning algorithm to be trained and predicted on
       - sample_size: the size of samples (number) to be drawn from training set
       - X_train: features training set
       - y_train: income training set
       - X_test: features testing set
       - y_test: income testing set
       - train_eval_size: number of leading training rows used for the
         training-side scores (default 300). These rows are scored even when
         `sample_size` is smaller, i.e. when some of them were not trained on.
       - beta: beta of the F-beta score (default 0.5)

    returns:
       dict with keys 'train_time', 'pred_time' (seconds), 'acc_train',
       'acc_test', 'f_train' and 'f_test'
    '''

    results = {}

    # Fit the learner on the first 'sample_size' rows and time the fit
    start = time() # Get start time
    learner.fit(X_train[:sample_size], y_train[:sample_size])
    end = time() # Get end time
    results['train_time'] = end-start

    # Head of the training set used for the training-side scores
    X_eval = X_train[:train_eval_size]
    y_eval = y_train[:train_eval_size]

    # Predict on the test set and on the training head; time both together
    start = time() # Get start time
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_eval)
    end = time() # Get end time
    results['pred_time'] = end-start

    # Accuracy on the training head and on the test set
    results['acc_train'] = accuracy_score(y_eval, predictions_train)
    results['acc_test'] = accuracy_score(y_test, predictions_test)

    # F-beta score on the training head and on the test set
    results['f_train'] = fbeta_score(y_eval, predictions_train, beta=beta)
    results['f_test'] = fbeta_score(y_test, predictions_test, beta=beta)

    # Success
    print("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))

    # Return the results
    return results

In [6]:
# Run the models

# Import the three supervised learning models (one from xgboost, two from sklearn)
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Initialize the three models with a fixed seed
clf_A = xgb.XGBClassifier(random_state=0)
clf_B = RandomForestClassifier(random_state=0)
clf_C = LogisticRegression(random_state=0)

# Number of samples for 100%, 10% and 1% of the training data
samples_100 = len(y_train)
samples_10 = samples_100 // 10
samples_1 = samples_100 // 100

# Collect results on the learners: results[<class name>][0|1|2] holds the
# metrics for the 1%, 10% and 100% subsets respectively
results = {}
for clf in [clf_A, clf_B, clf_C]:
    clf_name = type(clf).__name__
    results[clf_name] = {}
    for i, samples in enumerate([samples_1, samples_10, samples_100]):
        results[clf_name][i] = train_predict(clf, samples, X_train, y_train, X_test, y_test)

NameError: name 'y_train' is not defined

In [7]:
# Visualization function for comparing the evaluation metrics of three different models

def evaluate(results, accuracy, f1):
    """
    Visualization code to display results of various learners.

    Draws a 2x3 grid of grouped bar charts: training time, training accuracy
    and training F-score on the top row; prediction time, test accuracy and
    test F-score on the bottom row. Each group is one training-set size
    (1%, 10%, 100%) and each bar within a group is one learner. The bar width
    and tick positions are laid out for three learners.

    Side effect: updates global matplotlib font settings (rcParams / plt.rc).

    inputs:
      - results: dict mapping a learner name to a dict keyed 0, 1, 2 (the
        three sample sizes) whose values are the metric dictionaries returned
        by 'train_predict()'
      - accuracy: The accuracy score of the naive predictor (dashed line)
      - f1: The F-score of the naive predictor (dashed line)
    """

    # Define the colors and font sizes for the plot below
    palette = sns.color_palette()
    matplotlib.rcParams.update({'font.size': 12})
    SMALL_SIZE = 10
    BIGGER_SIZE = 14

    plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
    plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
    plt.rc('axes', labelsize=SMALL_SIZE)     # fontsize of the x and y labels
    plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
    plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
    plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
    plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

    # Create figure
    fig, ax = plt.subplots(2, 3, figsize = (11,7))

    # Constants
    bar_width = 0.3
    learners = list(results.keys())
    # Cycle through the palette so that more than three learners do not raise
    colors = [palette[k % len(palette)] for k in range(len(learners))]
    metrics = ['train_time', 'acc_train', 'f_train', 'pred_time', 'acc_test', 'f_test']

    # One panel per metric; the panel's ticks and limits are set once
    for j, metric in enumerate(metrics):
        panel = ax[j//3, j%3]
        for k, learner in enumerate(learners):
            for i in range(3):
                panel.bar(i+k*bar_width, results[learner][i][metric], width = bar_width, color = colors[k])
        panel.set_xticks([0.45, 1.45, 2.45])
        panel.set_xticklabels(["Sample: 1%", "10%", "100%"])
        panel.set_xlim((-0.1, 3.0))

    # Add unique y-labels
    ax[0, 0].set_ylabel("Time (in seconds)")
    ax[0, 1].set_ylabel("Accuracy Score")
    ax[0, 2].set_ylabel("F-score")
    ax[1, 0].set_ylabel("Time (in seconds)")
    ax[1, 1].set_ylabel("Accuracy Score")
    ax[1, 2].set_ylabel("F-score")

    # Add horizontal lines for naive predictors. axhline's xmin/xmax are
    # fractions of the axes width, so the defaults already span the full panel.
    for row in (0, 1):
        ax[row, 1].axhline(y = accuracy, linewidth = 1, color = 'k', linestyle = 'dashed')
        ax[row, 2].axhline(y = f1, linewidth = 1, color = 'k', linestyle = 'dashed')

    # Set y-limits for score panels
    ax[0, 1].set_ylim((0, 1))
    ax[0, 2].set_ylim((0, 1))
    ax[1, 1].set_ylim((0, 1))
    ax[1, 2].set_ylim((0, 1))

    # Add titles
    ax[0, 0].set_title("Model Training")
    ax[0, 1].set_title("Accuracy Score on Training Subset")
    ax[0, 2].set_title("F-score on Training Subset")
    ax[1, 0].set_title("Model Predicting")
    ax[1, 1].set_title("Accuracy Score on Testing Set")
    ax[1, 2].set_title("F-score on Testing Set")

    # Create patches for the legend
    patches = [mpatches.Patch(color = colors[k], label = learner)
               for k, learner in enumerate(learners)]
    plt.legend(handles = patches, bbox_to_anchor = (-.80, 2.53), \
               loc = 'upper center', borderaxespad = 0., ncol = 3, fontsize = 'x-large')

    # Aesthetics
    plt.suptitle("Performance Metrics for Three Supervised Learning Models", fontsize = 16, y = 1.10)
    plt.tight_layout()
    plt.show()

In [None]:
# Hyperparameter-tuning for the best model

# Untuned baseline classifier; also the estimator handed to the grid search
clf = xgb.XGBClassifier(random_state=0)

# Grid of hyperparameter values to try (3 x 3 x 3 combinations)
parameters = {
    'max_depth': [1,2,3],
    'learning_rate': [0.1,0.3,1],
    'n_estimators':[300, 400, 500],
}

# Score every candidate with the F-beta score (beta = 0.5)
scorer = make_scorer(fbeta_score, beta=0.5)

# 3-fold cross-validated grid search using 'scorer' as the scoring method
grid_obj = GridSearchCV(clf, parameters, scoring=scorer,cv=3)

# Fit the grid search object to the training data to find the optimal parameters
grid_fit = grid_obj.fit(X_train, y_train)

# Estimator refit with the best parameter combination
best_clf = grid_fit.best_estimator_

# Predictions of the unoptimized model and of the optimized model
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
best_predictions = best_clf.predict(X_test)

# Report the before-and-after scores
print("Unoptimized model\n------")
print("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, predictions)))
print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, predictions, beta = 0.5)))
print("\nOptimized Model\n------")
print("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
print("Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta = 0.5)))

### Feature Importance

In [None]:
# Define a function to show the feature importances of features given a model and feature importances of that model
def feature_plot(model, importances, X, num_features):
    """
    Draw a horizontal bar chart of the `num_features` most important features.

    Bars are ordered so that the most important feature is at the top.

    Parameters
    ----------
    model : object
        Fitted model; only its class name is used, in the plot title.
    importances : array-like
        One importance value per column of `X`, in column order.
    X : pandas.DataFrame
        Frame whose column names label the bars.
    num_features : int
        Number of features to show; clamped to the number of importances.

    Returns
    -------
    None
    """
    importances = np.asarray(importances)
    feature_names = np.asarray(X.columns)
    num_features = max(0, min(int(num_features), len(importances)))

    # argsort is ascending, so the LAST num_features entries are the largest.
    # They are kept in ascending order because barh draws the first bar at the
    # bottom, which leaves the most important feature at the top.
    indices = np.argsort(importances)[len(importances)-num_features:]
    columns = feature_names[indices]
    values = importances[indices]

    # Define the colors for the plot
    base_color1 = sns.color_palette()[0]

    # Create the plot
    positions = np.arange(num_features)
    plt.figure(figsize = (8,6))
    plt.title("Feature Importances ({})".format(model.__class__.__name__), fontsize = 16)
    plt.barh(positions, values, align="center", color = base_color1)
    plt.yticks(positions, columns, fontsize = 14)
    plt.xlabel("Normalized Weight", fontsize = 14, fontweight='bold')
    plt.ylabel("Features", fontsize = 14, fontweight='bold')
    plt.tight_layout()
    plt.show()

In [None]:
# Compare the most important features of three supervised learning models that expose
# 'feature_importances_': the tuned XGBClassifier, a random forest and AdaBoost.
# xgboost and RandomForestClassifier were already imported in an earlier cell.
# Import adaboost classifier
from sklearn.ensemble import AdaBoostClassifier

# Import functionality for cloning a model
from sklearn.base import clone

# Train the supervised models on the training set using .fit(X_train, y_train)
model1 = clone(best_clf).fit(X_train, y_train) # Unfitted copy of the tuned xgboost classifier, refit here
model2 = RandomForestClassifier(random_state=0)
model2.fit(X_train, y_train)
model3 = AdaBoostClassifier(random_state=0)
model3.fit(X_train, y_train)

for model in [model1, model2, model3]:
    # Extract the feature importances using .feature_importances_
    importances = model.feature_importances_
    # Plot feature importances for all features. The models were fit on X_train,
    # so its columns are the ones the importances refer to.
    feature_plot(model, importances, X_train, num_features=X_train.shape[1])