In [None]:
#Correlation Matrix in Plotly

import plotly.plotly as py
import plotly.graph_objs as go

def make_correlation_matrix(df = df_churn):
    '''
    Build a Plotly heatmap of the pairwise column correlations of a dataframe.

    INPUT: dataframe (defaults to df_churn, which must already exist when
    this function is defined).

    OUTPUT: whatever py.iplot returns for the assembled figure.

    NOTE: you need to store the output of this function in a variable,
    and then put that variable on a new line to see the plot,
    like so:

    out = make_correlation_matrix(df_churn_scaled)
    out
    '''
    corr_frame = df.corr()
    axis_labels = corr_frame.columns.tolist()
    corr_values = np.array(corr_frame)

    # The same labels are used on both axes because the matrix is column-by-column.
    colorbar_spec = {"title": "Pearson correlation coefficient",
                     "titleside": "right"}
    heat = go.Heatmap(z=corr_values,
                      x=axis_labels,
                      y=axis_labels,
                      colorscale="Jet",
                      colorbar=colorbar_spec)

    # Left/bottom margins are much larger than top/right, presumably to fit
    # long column names next to the axes.
    layout_spec = {
        "title": "Correlation Matrix",
        "height": 600,
        "width": 800,
        "margin": {"r": 0, "l": 200, "t": 25, "b": 200},
        "yaxis": {"tickfont": {"size": 10}},
        "xaxis": {"tickfont": {"size": 10}},
    }
    layout = go.Layout(layout_spec)

    fig = go.Figure(data=[heat], layout=layout)
    return py.iplot(fig, filename='correlation-matrix')

In [None]:
# optimizing threshold
# Sweep 101 evenly spaced cut-offs in [0, 1] and keep the one whose confusion
# matrix gives the highest total payoff under the per-outcome values below.
probabilities = rfc_smote.predict_proba(X_test_st)[:,1]
reals = y_test

# Payoff per outcome. These do not depend on the threshold, so they are
# defined once instead of being re-assigned on every iteration.
tp_c = 2
fp_c = -6
tn_c = 0
fn_c = -1

# "cost" is maximised here (higher is better); start from a very low value.
best_cost = -10**10
best_thresh = 0

is_positive = (reals == 1)
is_negative = (reals == 0)

for threshold in np.linspace(0,1,101):
    predictions = probabilities >= threshold
    # Vectorised .sum() instead of the builtin sum(), which walks the boolean
    # array element by element in Python.
    tp = (is_positive & predictions).sum()
    fp = (is_negative & predictions).sum()
    tn = (is_negative & ~predictions).sum()
    fn = (is_positive & ~predictions).sum()
    cost = tp*tp_c + fp*fp_c + tn*tn_c + fn*fn_c
    # Strict '>' keeps the lowest threshold when several tie for best.
    if cost > best_cost:
        best_cost = cost
        best_thresh = threshold

print(f'Optimal threshold is: {best_thresh}')

In [None]:
# Run CV with 5 folds (knn)

# The grid holds a single candidate (k = 501), so the search below only
# produces a 5-fold cross-validated ROC-AUC for that one setting.
ks = [501]
param_grid = [{'n_neighbors': ks}]

knn = KNeighborsClassifier()
knn_grid = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,      # use every available core
    verbose=30,
)
knn_grid.fit(X_train_under, y_train_under)

In [None]:
#R formula function


def ols_formula(df, dependent_var, *excluded_cols):
    '''
    Generates the R style formula for statsmodels (patsy) given
    the dataframe, dependent variable and optional excluded columns
    as strings.

    INPUT:
        df             -- dataframe whose column names become the formula terms
        dependent_var  -- name of the column placed on the left of '~'
        *excluded_cols -- column names to leave out of the right-hand side;
                          repeating a name (or naming the dependent variable
                          again) is harmless

    OUTPUT: string such as 'y ~ a + b', with predictors in dataframe
    column order.

    RAISES: ValueError if dependent_var or any excluded column is not a
    column of df.
    '''
    columns = list(df.columns)

    # Validate up front so the caller gets a message naming the offending
    # column instead of the bare "list.remove(x): x not in list".
    if dependent_var not in columns:
        raise ValueError(f'dependent variable {dependent_var!r} is not a column of df')
    missing = [col for col in excluded_cols if col not in columns]
    if missing:
        raise ValueError(f'excluded columns not found in df: {missing}')

    # A set makes duplicate exclusions a no-op rather than an error.
    dropped = {dependent_var, *excluded_cols}
    predictors = [col for col in columns if col not in dropped]
    return dependent_var + ' ~ ' + ' + '.join(predictors)

In [None]:
# Keep each validation fold untouched during cross validation: oversample/undersample
# only the training folds inside every iteration, so the model is always scored on
# real (non-resampled) data.
random_seed = 123
# shuffle=True is needed for random_state to have any effect; without it the seed is
# ignored by older scikit-learn and rejected with a ValueError by recent releases.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_seed)
cross_val_f1_score_lst = []
cross_val_accuracy_lst = []
cross_val_recall_lst = []
cross_val_precision_lst = []

for train_index_ls, validation_index_ls in kf.split(x_train, y_train):
    # keeping validation set apart and oversampling in each iteration using smote
    train, validation = x_train.iloc[train_index_ls], x_train.iloc[validation_index_ls]
    target_train, target_val = y_train.iloc[train_index_ls], y_train.iloc[validation_index_ls]
    sm = SMOTE(random_state=random_seed)
    # fit_resample is the current name of the resampling call; the older
    # fit_sample alias no longer exists in recent imbalanced-learn releases.
    X_train_res, y_train_res = sm.fit_resample(train, target_train)
    print (X_train_res.shape, y_train_res.shape)

    # training the model on oversampled 4 folds of training set
    rf = RandomForestClassifier(n_estimators=5, random_state=random_seed)
    rf.fit(X_train_res, y_train_res)

    # testing on 1 fold of validation set
    validation_preds = rf.predict(validation)
    cross_val_recall_lst.append(recall_score(target_val, validation_preds))
    cross_val_accuracy_lst.append(accuracy_score(target_val, validation_preds))
    cross_val_precision_lst.append(precision_score(target_val, validation_preds))
    cross_val_f1_score_lst.append(f1_score(target_val, validation_preds))

# Report the mean of each metric over the 5 folds.
print ('Cross validated accuracy: {}'.format(np.mean(cross_val_accuracy_lst)))
print ('Cross validated recall score: {}'.format(np.mean(cross_val_recall_lst)))
print ('Cross validated precision score: {}'.format(np.mean(cross_val_precision_lst)))
print ('Cross validated f1_score: {}'.format(np.mean(cross_val_f1_score_lst)))
    

In [None]:



///
# Preprocess the training features in two independent branches (categorical and
# numeric) and join them column-wise into X_train_processed.

# --- Categorical branch: object-dtype columns -> most-frequent imputation -> one-hot ---
X_cat_train = X_train.select_dtypes(include='object')
# NOTE(review): depending on the pandas version, casting to 'str' may turn NaN into
# the literal string 'nan'; the imputer below would then find nothing to fill — confirm.
X_cat_train = pd.DataFrame(X_cat_train,dtype ='str')
imp_freq_train = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
X_cat_imputed_train = imp_freq_train.fit_transform(X_cat_train)
# Rebuilt without column names or index, so columns become 0..n-1 here.
X_cat_imputed_train = pd.DataFrame(data = X_cat_imputed_train)

# handle_unknown='ignore': categories not seen during fit do not raise at transform time.
OE = OneHotEncoder(handle_unknown='ignore')
OE.fit(X_cat_imputed_train)
X_cat_imputed_train = OE.transform(X_cat_imputed_train)
# NOTE(review): get_feature_names() is not available on newer scikit-learn releases
# (get_feature_names_out() replaces it) — check against the installed version.
names= OE.get_feature_names()[:]
names = names.tolist()
# .toarray() densifies the encoder output before wrapping it in a DataFrame.
X_cat_imputed_train = pd.DataFrame(data = X_cat_imputed_train.toarray(), columns = names)




# --- Numeric branch: float64 columns only -> median imputation -> standard scaling ---
X_num_train = X_train.select_dtypes(include='float64')
# imp_freq_train is rebound here, so the fitted most-frequent imputer from the
# categorical branch is no longer reachable under that name after this line.
imp_freq_train = SimpleImputer(missing_values=np.nan, strategy='median')
X_num_imputed_train = imp_freq_train.fit_transform(X_num_train)
X_num_imputed_train = pd.DataFrame(data= X_num_imputed_train, columns = X_num_train.columns)

scale = StandardScaler()
scale.fit(X_num_imputed_train)
X_num_imputed_train = scale.transform(X_num_imputed_train)
# Re-wrap the scaled values so the original numeric column names are kept.
X_num_imputed_train = pd.DataFrame(data= X_num_imputed_train, columns = X_num_train.columns)


# Both frames were rebuilt without an explicit index, so they presumably share a fresh
# default index (not X_train's) and line up row-by-row here — verify if X_train's
# index matters downstream.
X_train_processed = pd.concat([X_cat_imputed_train,X_num_imputed_train],axis =1)

# Cast the target to integer dtype.
y_train =pd.Series(y_train, dtype ='int')
///

In [None]:
#Plotting K-Means inertia stuff

# image_cluster is defined before the code that calls it, so the cell also
# runs on a fresh kernel (previously the loop hit a NameError on first run).
def image_cluster(img,k):
    '''
    Quantise an image to k colours with K-Means and show the result.

    INPUT:
        img -- image array of shape (height, width, 3)
        k   -- number of clusters (colours) to fit

    OUTPUT: (img2, inertia) where img2 has the same shape as img with every
    pixel replaced by its cluster centre, and inertia is the fitted model's
    inertia_.

    Side effect: draws img2 on the current matplotlib axes with the axis
    hidden.
    '''
    # One row per pixel, one column per colour channel.
    img_flat = img.reshape(img.shape[0]*img.shape[1],3)
    kmeans = KMeans(n_clusters=k, random_state=0).fit(img_flat)
    img_flat2 = img_flat.copy()
    # Overwrite every pixel with the centre of the cluster it was assigned to.
    for label in np.unique(kmeans.labels_):
        img_flat2[kmeans.labels_ == label, :] = kmeans.cluster_centers_[label]
    img2 = img_flat2.reshape(img.shape)
    plt.imshow(img2)
    plt.axis('off');
    return img2, kmeans.inertia_


# Elbow plot for the feature matrix X. The cluster counts are passed as x
# values so the axis reads 1..9 rather than the list positions 0..8.
cluster_counts = range(1,10)
plt.figure()
plt.plot(cluster_counts, [KMeans(n_clusters=i).fit(X).inertia_ for i in cluster_counts])
plt.xlabel('Number of Clusters')
plt.ylabel('K-means Inertia')

# Inertia of the colour-quantised image for k = 2..19.
inertia_list = list()
img_list = list()
# image_cluster draws on the current axes and switches the axis off, so the
# previews get their own figure instead of landing on the elbow plot above.
plt.figure()
for i in range(2,20):
    im2,km_inertia = image_cluster(img,i)
    inertia_list.append(km_inertia)
    img_list.append(im2)

# Fresh figure again so the inertia curve keeps visible axes.
plt.figure()
plt.plot(range(2,20),inertia_list)
plt.xlabel('Number of Clusters')
plt.ylabel('K-means Inertia')

In [None]:
#making a decision tree graph:

# Fit a depth-3 gini tree on the undersampled training data
# (fit returns the estimator itself, so the calls can be chained).
clf = tree.DecisionTreeClassifier(max_depth=3, criterion="gini").fit(
    X_train_under, y_train_under
)

# First export: bare DOT source, rendered to disk under the name "Fraud".
dot_data = tree.export_graphviz(clf, out_file=None)
graph = graphviz.Source(dot_data)
graph.render("Fraud")

# Second export: same tree with feature/class names and styling; left as the
# last expression so the notebook displays it inline.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=features,
    class_names=['NotFraud','isFraud'],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
graph

In [None]:
#ROC curve comparison graph


def plot_comparison_roc(model_dict, test_x = val_X , test_y = val_Y, ):

    '''
    Plots overlaid ROC curves for several trained models, for comparison
    purposes.

    INPUT:
        model_dict -- dict mapping a legend label to a fitted model that
                      exposes predict_proba
        test_x     -- features the models are scored on (defaults to val_X)
        test_y     -- true labels matching test_x (defaults to val_Y)

    OUTPUT: None; the figure is shown with plt.show().

    The positive-class score is taken from column 1 of predict_proba.
    Curve colours cycle through a fixed palette when there are more models
    than colours.
    '''

    colors = [
    'blue',
    'green',
    'red',
    'cyan',
    'magenta',
    'yellow',
    'black',
    'violet',
    ]

    plt.figure(figsize=(14,14))

    for idx, (key, model) in enumerate(model_dict.items()):
        # Wrap around the palette instead of running off its end.
        color = colors[idx % len(colors)]
        probabilities = model.predict_proba(test_x)
        fpr, tpr, _ = roc_curve(test_y, probabilities[:,1])
        plt.plot(fpr, tpr, lw = 2, label= key, c = color)

    # Chance-level diagonal in a fixed neutral colour: it no longer borrows the
    # last model's colour, and no longer fails when model_dict is empty.
    plt.plot([0,1],[0,1], c = 'gray', ls='--')

    # Axis limits are the same for every curve, so they are set once.
    plt.xlim([-0.05,1.05])
    plt.ylim([-0.05,1.05])
    plt.xlabel('False Positive Rate',fontsize = 20)
    plt.ylabel('True Positive Rate', fontsize = 20)
    plt.title('ROC Curves', fontsize = 20)
    plt.legend(loc="lower right", fontsize = 20)
    plt.tight_layout()

    plt.show()

In [None]:
#interpret my Logit coefficients.

# Put your beta here
beta = 0.5

# Don't change this
# Log-odds contribution of the feature at values 0..9.
my_list = [step * beta for step in range(10)]

# Calculating odds
# Exponentiating each log-odds value gives the corresponding odds multiplier.
odds = [np.exp(log_odds) for log_odds in my_list]

# Plotting odds
plt.plot(odds)

# Interpretation
# Ratio of consecutive odds minus one, expressed as a percentage.
pct_increase = (odds[1] / odds[0] - 1) * 100
print('For every 1 unit change in my feature, the odds go up by %.2f' % pct_increase, '%')