# Goal of this notebook

In this notebook, we use techniques for dimensionality reduction to arrive at a smaller feature set.


## Import packages and read data

In [1]:
#%reset -fs

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#import folium
#from folium.plugins import MarkerCluster



from sklearn.model_selection import train_test_split
from sklearn.preprocessing import power_transform
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.metrics import fbeta_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import f1_score

from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate

#import plotly.graph_object as go

%matplotlib inline 
plt.style.use("ggplot")

from sklearn.decomposition import PCA

In [2]:
df = pd.read_csv("f_chtr_churn_traintable_nf_v2.csv")
df.shape

(209043, 171)

# Data cleaning

## Make cut-off for number of subscriptions per subscriber

The original data set contains many subscribers with very high numbers of subscriptions (up to 7000 subscriptions per subscriber). We limit ourselves to subscribers with at most four subscriptions (corresponding to cnt_abo <4, because cnt_abo counts additional subscriptions):

In [3]:
df2=df[df.cnt_abo < 4]
df2.shape

(175130, 171)

## Missing values


The following variables have missing values:
    
    * ort
    * email_am_kunden
    * kuendigungs_eingangs_datum



In [4]:
# Checking for missing values
missing = pd.DataFrame(df2.isnull().sum(), columns=["Amount"])
missing['Percentage'] = round((missing['Amount']/df2.shape[0])*100, 2)
missing[missing['Amount'] != 0]

Unnamed: 0,Amount,Percentage
ort,85,0.05
email_am_kunden,12,0.01
kuendigungs_eingangs_datum,122166,69.76


Only kuendigungs_eingangs_datum is irrelevant for model building, because it would tell us who would churn. 
kuendigungs_eingangs_datum might be interesting for EDA. In any case, there is no need to replace missing 
values for that variable as we will later drop it for model building. ort and email_am_kunden have very few missing values, so we can simply drop the 
corresponding rows.    

In [5]:
df2 = df2.dropna(subset=['ort','email_am_kunden'])
df2=df2.reset_index()
df2.head()

Unnamed: 0.1,index,Unnamed: 0,auftrag_new_id,liefer_beginn_evt,kanal,objekt_name,aboform_name,zahlung_rhythmus_name,lesedauer,rechnungsmonat,...,openrate_zeitbrief_1w,clickrate_zeitbrief_1w,openrate_zeitbrief_1m,clickrate_zeitbrief_1m,openrate_zeitbrief_3m,clickrate_zeitbrief_3m,training_set,kuendigungs_eingangs_datum,churn,date_x
0,3,3,25B535B7-D3F9-4804-9FFE-A8C813C8A593,2013-10-17,andere,ZEIT Digital,Festabo,jährlich,72,0,...,1.0,0.0,0.67,0.0,0.36,0.0,1,,0,2019-10-02 00:00:00
1,4,4,C348C873-8229-4F4F-8B50-5B4C06B2E872,2013-09-12,andere,ZEIT Digital,Festabo,vierteljährlich,70,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,,0,2019-07-30 00:00:00
2,5,5,15FA7CDA-FDCE-4AB9-A427-0FF8B2D9940F,2013-11-14,andere,ZEIT Digital,Probeabo,jährlich,75,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,,0,2020-02-26 00:00:00
3,6,6,894FAFB7-6D8C-43CC-82DB-246E0AD1F662,2013-11-14,andere,ZEIT Digital,Probeabo,jährlich,78,0,...,1.0,0.0,0.75,0.0,0.69,0.0,1,,0,2020-05-18 00:00:00
4,7,7,1862C64E-3133-4AD0-B144-E7D8ED69772C,2014-01-02,andere,ZEIT Digital,Probeabo,jährlich,69,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,,0,2019-10-31 00:00:00


Check that there are no more missing values except for kuendigungs_eingangs_datum:

In [6]:
df2.loc[:,df2.isnull().any()].columns

Index(['kuendigungs_eingangs_datum'], dtype='object')

The old index now appears as another irrelevant feature (index), which we will now drop together with a few other irrelevant or useless features.

## Dropping trivial or irrelevant variables

We drop the following variables:
    
    * index (no useful information)
    * Unnamed: 0 (no useful information)
    * auftrag_new_id (no useful information)
    * kuendigungs_eingangs_datum (would tell us who would churn, because those who don't churn have nan here)
    * training_set (no useful information)
    * date_x 
                                  

                                  
                                  


In [7]:
drop=['index', 'Unnamed: 0', 'auftrag_new_id',
      'training_set','kuendigungs_eingangs_datum', 'date_x']
df2=df2.drop(drop, axis=1)

## Transforming temporal variables

For the EDA, we have to convert the remaining temporal variables that are not yet numbers to suitable numbers:

    * liefer_beginn_evt
    * abo_registrierung_min
    * nl_registrierung_min



In [8]:
df2['liefer_beginn_num']= df2['liefer_beginn_evt'].str[:4].astype('int')  +df2['liefer_beginn_evt'].str[5:7].astype('float')/12 #+ df2['liefer_beginn_evt'].str[8:10].astype('float')/365 
df2['abo_registrierung_min_year']= df2['abo_registrierung_min'].str[:4].astype('int')  
df2['nl_registrierung_min_year']= df2['nl_registrierung_min'].str[:4].astype('int') 

df2=df2.drop(['liefer_beginn_evt','abo_registrierung_min','nl_registrierung_min'
             ], axis=1)
df2[['liefer_beginn_num', 'abo_registrierung_min_year', 'nl_registrierung_min_year'
    ]].head()


Unnamed: 0,liefer_beginn_num,abo_registrierung_min_year,nl_registrierung_min_year
0,2013.833333,2013,2013
1,2013.75,1900,2013
2,2013.916667,2013,2013
3,2013.916667,2013,2013
4,2014.083333,2013,2008


# Exploratory data analysis

In the following, we will plot the churn probability by value of selected features. To this end, 
we introduce a dummy variable for the two possible churn values 0 and 1 to facilitate the counting. As we will use this only for EDA, we will
define a new data frame df_a. df2 will later be used for model building again. 

In [9]:

dummy0 = pd.get_dummies(df2['churn'],prefix='churn')

In [10]:

df_a = pd.concat([df2,dummy0], axis =1)

We first determine the overall churn probability in the entire data set:

In [11]:
df_a['churn_1'].sum()/(df_a['churn_1'].sum() + df_a['churn_0'].sum())

0.30223656773972407

The total churn probability in the entire dataset is thus:

**30.2%**

# Data preprocessing

## Binning of abo_registrierung_min_year and nl_registrierung_min_year

The variables abo_registrierung_min_year and nl_registrierung_min_year should be turned into  dummy variables as 
they contain the artificial year 1900. abo_registrierung_min_year, however, contains so many years that it should 
be binned. We choose 13 bins such that the classes have somewhat comparable size: 

In [12]:
df2['abo_registrierung_min_year_binned']= df2['abo_registrierung_min_year'].apply(
    lambda x: 0 if x==1900 else x).apply(
    lambda x:1 if 1900 < x <=1996 else x).apply(
    lambda x: 2 if 1996 < x <= 2002 else x).apply(
    lambda x: 3 if 2002 < x <= 2007 else x).apply(
    lambda x: 4 if 2007 < x <=2010 else x).apply(
    lambda x: 5 if 2010 < x <= 2012 else x ).apply(
    lambda x : x-2007 if x>2012 else x)

In [13]:
df2['abo_registrierung_min_year_binned'].value_counts()

0     23630
11    22273
10    18574
9     14753
6     13488
8     12783
7     12322
5     11193
4     10638
3     10388
1      9148
12     8838
2      7017
Name: abo_registrierung_min_year_binned, dtype: int64

For nl_registrierung_min_year, we choose 12 bins:

In [14]:
df2['nl_registrierung_min_year_binned']= df2['nl_registrierung_min_year'].apply(
    lambda x: 0 if x==1900 else x).apply(
    lambda x:1 if 1900 < x <=2005 else x).apply(
    lambda x: 2 if 2005 < x <= 2009 else x).apply(
    lambda x: 3 if 2009 < x <= 2011 else x).apply(
    lambda x : x-2008 if x>=2012 else x)

In [15]:
df2['nl_registrierung_min_year_binned'].value_counts()

9     21312
10    19653
2     17433
3     16022
8     14774
1     14030
7     14028
5     13859
6     13684
0     11808
4     10174
11     7567
12      701
Name: nl_registrierung_min_year_binned, dtype: int64

We will drop the unbinned variables abo_registrierung_min_year and nl_registrierung_min_year in the next subsection.

## Dropping ort, abo_registrierung_min_year, nl_registrierung_min_year


Before we introduce dummy variables for the categorical features, we drop

    * ort 
    
    and the unbinned variables
    
    * abo_registrierung_min_year
    * nl_registrierung_min_year

In [16]:
df2=df2.drop(['ort','abo_registrierung_min_year', 'nl_registrierung_min_year'], axis=1)

## Categorical and numerical features

**Categorical features**

Categorical features are the ones of type 'object' 
and several variables that have numerical values, but are nevertheless categorical:

In [19]:
# Creating list for categorical predictors/features 
cat_features = list(df2.columns[df2.dtypes==object])
# Extend the list by those categorical features that are already given in terms of numbers
cat_features.extend(['rechnungsmonat','metropole','studentenabo','unterbrechung','email_am_kunden',
       'zon_che_opt_in', 'zon_sit_opt_in', 'zon_zp_grey', 'zon_premium',
       'zon_boa', 'zon_kommentar', 'zon_sonstige', 'zon_zp_red',
       'zon_rawr', 'zon_community', 'zon_app_sonstige', 'zon_schach',
       'zon_blog_kommentare', 'zon_quiz','nl_zeitbrief', 'nl_zeitshop', 'nl_zeitverlag_hamburg',
       'nl_fdz_organisch','boa_reg', 'che_reg', 'sit_reg', 'sso_reg','abo_registrierung_min_year_binned', 
                     'nl_registrierung_min_year_binned',
                    ])
cat_features

['kanal',
 'objekt_name',
 'aboform_name',
 'zahlung_rhythmus_name',
 'zahlung_weg_name',
 'plz_1',
 'plz_2',
 'plz_3',
 'land_iso_code',
 'anrede',
 'titel',
 'rechnungsmonat',
 'metropole',
 'studentenabo',
 'unterbrechung',
 'email_am_kunden',
 'zon_che_opt_in',
 'zon_sit_opt_in',
 'zon_zp_grey',
 'zon_premium',
 'zon_boa',
 'zon_kommentar',
 'zon_sonstige',
 'zon_zp_red',
 'zon_rawr',
 'zon_community',
 'zon_app_sonstige',
 'zon_schach',
 'zon_blog_kommentare',
 'zon_quiz',
 'nl_zeitbrief',
 'nl_zeitshop',
 'nl_zeitverlag_hamburg',
 'nl_fdz_organisch',
 'boa_reg',
 'che_reg',
 'sit_reg',
 'sso_reg',
 'abo_registrierung_min_year_binned',
 'nl_registrierung_min_year_binned']

In [20]:
# Creating list for numerical features
num_features = list(df2.columns)
for i in cat_features:
    num_features.remove(i)
num_features.remove('churn')    

num_features

['lesedauer',
 'shop_kauf',
 'avg_churn',
 'cnt_abo',
 'cnt_abo_diezeit',
 'cnt_abo_diezeit_digital',
 'cnt_abo_magazin',
 'cnt_umwandlungsstatus2_dkey',
 'nl_blacklist_sum',
 'nl_bounced_sum',
 'nl_aktivitaet',
 'nl_sperrliste_sum',
 'nl_opt_in_sum',
 'received_anzahl_1w',
 'received_anzahl_1m',
 'received_anzahl_3m',
 'received_anzahl_6m',
 'opened_anzahl_1w',
 'opened_anzahl_1m',
 'opened_anzahl_3m',
 'openedanzahl_6m',
 'clicked_anzahl_1w',
 'clicked_anzahl_1m',
 'clicked_anzahl_3m',
 'clicked_anzahl_6m',
 'unsubscribed_anzahl_1w',
 'unsubscribed_anzahl_1m',
 'unsubscribed_anzahl_3m',
 'unsubscribed_anzahl_6m',
 'openrate_1w',
 'clickrate_1w',
 'openrate_1m',
 'clickrate_1m',
 'openrate_3m',
 'clickrate_3m',
 'received_anzahl_bestandskunden_1w',
 'received_anzahl_bestandskunden_1m',
 'received_anzahl_bestandskunden_3m',
 'received_anzahl_bestandskunden_6m',
 'opened_anzahl_bestandskunden_1w',
 'opened_anzahl_bestandskunden_1m',
 'opened_anzahl_bestandskunden_3m',
 'openedanzahl_bes

We do not the pipeline method for creating the dummies, as we would like to maintain maximum flexibility 
regarding the selection of features:

In [21]:
df2_d = df2
df2_d=pd.get_dummies(df2_d,columns=cat_features, drop_first=True)
df2_d.head()

Unnamed: 0,lesedauer,shop_kauf,avg_churn,cnt_abo,cnt_abo_diezeit,cnt_abo_diezeit_digital,cnt_abo_magazin,cnt_umwandlungsstatus2_dkey,nl_blacklist_sum,nl_bounced_sum,...,nl_registrierung_min_year_binned_3,nl_registrierung_min_year_binned_4,nl_registrierung_min_year_binned_5,nl_registrierung_min_year_binned_6,nl_registrierung_min_year_binned_7,nl_registrierung_min_year_binned_8,nl_registrierung_min_year_binned_9,nl_registrierung_min_year_binned_10,nl_registrierung_min_year_binned_11,nl_registrierung_min_year_binned_12
0,72,0,0.172959,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,70,0,0.309526,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,75,0,0.127118,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,78,0,0.11484,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,69,0,0.172727,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Splitting into target and predictor variables and test, validation and training set

In [22]:
# Define predictors and target variable
X = df2_d.drop('churn', axis=1)
y = df2_d['churn']
print(X.shape)
print(y.shape)

(175045, 1025)
(175045,)


In [37]:
# Split into train/cross-validation and test set
X_train_cv, X_test, y_train_cv, y_test = train_test_split(X, y, stratify = y, test_size=0.2, random_state=7)

print(X_train_cv.shape)
print(y_train_cv.shape)
print(X_test.shape)
print(y_test.shape)

(140036, 1025)
(140036,)
(35009, 1025)
(35009,)


In [38]:
# Split into train/cross-validation and test set
X_train, X_val, y_train, y_val = train_test_split(X_train_cv, y_train_cv, stratify = y_train_cv, 
                                                  test_size=0.2, random_state=7)

print(X_train.shape)
print(y_train.shape)
print(X_val.shape)
print(y_val.shape)

(112028, 1025)
(112028,)
(28008, 1025)
(28008,)


## Scaling of numerical features

In [39]:
#scaler = StandardScaler()
scaler = MinMaxScaler()
scaler.fit(X_train[num_features])
X_train_scaled = X_train
X_train_scaled[num_features] = scaler.transform(X_train_scaled[num_features])
X_val_scaled = X_val
X_val_scaled[num_features] = scaler.transform(X_val_scaled[num_features])
X_test_scaled = X_test
X_test_scaled[num_features] = scaler.transform(X_test_scaled[num_features])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#in

# Dimensionality reduction

## PCA

Let us check PCA with 20, 30 and eight components for KNN on the test set.

In [25]:
from sklearn.decomposition import PCA

pca = PCA(n_components=20)

X_train_scaled_pca = pca.fit_transform(X_train_scaled)



In [27]:
X_train_scaled_pca

array([[-0.53128076,  0.27885545,  0.3604438 , ...,  0.24729976,
         0.4571993 , -0.33728104],
       [ 0.37088207, -0.37923026,  0.1179255 , ..., -0.00800846,
         0.14317601,  0.06699873],
       [ 0.0733114 , -0.95018005,  0.96094674, ...,  0.59764105,
        -0.51729062, -0.13995804],
       ...,
       [-0.19722697, -0.47027309,  0.44747692, ..., -0.0351068 ,
         0.33689426,  0.25160384],
       [ 0.7692886 , -0.64262257, -0.52493326, ..., -0.76997481,
         0.20511885, -0.39238877],
       [-0.78312083,  0.29267118, -0.04455118, ...,  1.25715337,
         0.16043062, -0.45047302]])

In [28]:
X_test_scaled_pca=pca.transform(X_test_scaled)

In [29]:
knn_clf = KNeighborsClassifier(metric='minkowski', n_neighbors= 32, p= 1, weights= 'distance')
#XGBClassifier(random_state=7, scale_pos_weight=0.9, max_depth=19, min_child_weight=2, gamma=0, 
     #                subsample=1, colsample_bytree=0.5, reg_alpha= 0.0001)
knn_clf.fit(X_train_scaled_pca,y_train)
y_pred= knn_clf.predict(X_test_scaled_pca)
y_prob = knn_clf.predict_proba(X_test_scaled_pca)[:,1]


cm = metrics.confusion_matrix(y_test,y_pred)
print(cm)
print(f"Accuracy: {accuracy_score(y_test, y_pred) }")
print(f"Precision: {precision_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"ROC_AUC: {roc_auc_score(y_test, y_prob)}")
print(f"AP: {average_precision_score(y_test,y_prob)}")
print(f"f1: {f1_score(y_test,y_pred)}")
print(f"fbeta: {fbeta_score(y_test,y_pred,beta=0.5)}")


[[22567  1861]
 [ 5096  5485]]
Accuracy: 0.8012796709417578
Precision: 0.7466648516199292
Recall: 0.5183820054815235
ROC_AUC: 0.8427579739301487
AP: 0.7475548562409314
f1: 0.6119261449210688
fbeta: 0.6862254472663581


In [31]:
pca30 = PCA(n_components=30)

X_train_scaled_pca_30 = pca30.fit_transform(X_train_scaled)

In [32]:
X_test_scaled_pca_30=pca30.transform(X_test_scaled)

In [33]:
knn_clf30 = KNeighborsClassifier(metric='minkowski', n_neighbors= 32, p= 1, weights= 'distance')
#XGBClassifier(random_state=7, scale_pos_weight=0.9, max_depth=19, min_child_weight=2, gamma=0, 
     #                subsample=1, colsample_bytree=0.5, reg_alpha= 0.0001)
knn_clf30.fit(X_train_scaled_pca_30,y_train)
y_pred= knn_clf30.predict(X_test_scaled_pca_30)
y_prob = knn_clf30.predict_proba(X_test_scaled_pca_30)[:,1]


cm = metrics.confusion_matrix(y_test,y_pred)
print(cm)
print(f"Accuracy: {accuracy_score(y_test, y_pred) }")
print(f"Precision: {precision_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"ROC_AUC: {roc_auc_score(y_test, y_prob)}")
print(f"AP: {average_precision_score(y_test,y_prob)}")
print(f"f1: {f1_score(y_test,y_pred)}")
print(f"fbeta: {fbeta_score(y_test,y_pred,beta=0.5)}")


[[22602  1826]
 [ 5116  5465]]
Accuracy: 0.8017081321945785
Precision: 0.7495542449595392
Recall: 0.5164918249692846
ROC_AUC: 0.8467889668705706
AP: 0.7534061380662732
f1: 0.6115711727842434
fbeta: 0.6875078626242295


In [34]:
pca8 = PCA(n_components=8)

X_train_scaled_pca_8 = pca8.fit_transform(X_train_scaled)

In [35]:
X_test_scaled_pca_8=pca8.transform(X_test_scaled)

In [36]:
knn_clf8 = KNeighborsClassifier(metric='minkowski', n_neighbors= 32, p= 1, weights= 'distance')
#XGBClassifier(random_state=7, scale_pos_weight=0.9, max_depth=19, min_child_weight=2, gamma=0, 
     #                subsample=1, colsample_bytree=0.5, reg_alpha= 0.0001)
knn_clf8.fit(X_train_scaled_pca_8,y_train)
y_pred= knn_clf8.predict(X_test_scaled_pca_8)
y_prob = knn_clf8.predict_proba(X_test_scaled_pca_8)[:,1]


cm = metrics.confusion_matrix(y_test,y_pred)
print(cm)
print(f"Accuracy: {accuracy_score(y_test, y_pred) }")
print(f"Precision: {precision_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"ROC_AUC: {roc_auc_score(y_test, y_prob)}")
print(f"AP: {average_precision_score(y_test,y_prob)}")
print(f"f1: {f1_score(y_test,y_pred)}")
print(f"fbeta: {fbeta_score(y_test,y_pred,beta=0.5)}")

[[22572  1856]
 [ 5019  5562]]
Accuracy: 0.8036219257905111
Precision: 0.7497977891614991
Recall: 0.5256592004536433
ROC_AUC: 0.8486316007694863
AP: 0.7575440526164218
f1: 0.6180343352408467
fbeta: 0.6908801828435148


The values are comparable to or even slightly better than in our standard analysis in Capstone_main. In particular, 
the values for eight components are surprisingly good. 
So it is worth to study PCA more systematically in connection with our best models KNN, RFC and XGB. We will do this on the 
validation set.

**K nearest neighbors optimized for accuracy and the Fbeta score**

In [112]:
components = {1:[1,2,3,4,5,6,7], 2:[1,2,3,4,5,6,7],3:[1,2,3,4,5,6,7],4:[1,2,3,4,5,6,7],5:[1,2,3,4,5,6,7],
              6:[1,2,3,4,5,6,7],8:[1,2,3,4,5,6,7],10:[1,2,3,4,5,6,7],12:[1,2,3,4,5,6,7],15:[1,2,3,4,5,6,7],
              18:[1,2,3,4,5,6,7],
             20:[1,2,3,4,5,6,7],25:[1,2,3,4,5,6,7],30:[1,2,3,4,5,6,7],40:[1,2,3,4,5,6,7],47:[1,2,3,4,5,6,7]}

scorer_names=['Accuracy','Precision','Recall', 
        'ROC AUC', 
        'AP', 'F1 score', 'Fbeta score']
df_KNN_PCA = pd.DataFrame(components, index=scorer_names)


df_KNN_PCA

Unnamed: 0,1,2,3,4,5,6,8,10,12,15,18,20,25,30,40,47
Accuracy,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Precision,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
Recall,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
ROC AUC,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
AP,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
F1 score,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
Fbeta score,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7


In [123]:
knn_clf = KNeighborsClassifier(metric='minkowski', n_neighbors= 32, p= 1, weights= 'distance')
for i in [1,2,3,4,5,6,8,10,12,15,18,20,25,30,40,47
         ]:
    pca = PCA(n_components =i,random_state=7)
    X_train_scaled_pca = pca.fit_transform(X_train_scaled)
    X_val_scaled_pca = pca.transform(X_val_scaled)
    knn_clf.fit(X_train_scaled_pca,y_train)
    
    
    y_pred= knn_clf.predict(X_val_scaled_pca)
    y_prob = knn_clf.predict_proba(X_val_scaled_pca)[:,1]

    
    df_KNN_PCA.loc['Accuracy',i]=(round(accuracy_score(y_val, y_pred),3))*100
    df_KNN_PCA.loc['Precision',i]=(round(precision_score(y_val, y_pred),3))*100
    
    
    df_KNN_PCA.loc['Recall',i]=(round(recall_score(y_val, y_pred),3))*100
    df_KNN_PCA.loc['ROC AUC',i]=(round(roc_auc_score(y_val, y_prob),3))*100
    df_KNN_PCA.loc['AP',i]=(round(average_precision_score(y_val, y_prob),3))*100
    df_KNN_PCA.loc['F1 score',i]=(round(f1_score(y_val, y_pred),3))*100
    df_KNN_PCA.loc['Fbeta score',i]=(round(fbeta_score(y_val, y_pred, beta=0.5),3))*100

    
    
    
    print(f"\n n_components: {i}")
    cm = metrics.confusion_matrix(y_val,y_pred)
    print(cm)
    print(f"Accuracy: {accuracy_score(y_val, y_pred) }")
    print(f"Precision: {precision_score(y_val, y_pred)}")
    print(f"Recall: {recall_score(y_val, y_pred)}")
    print(f"ROC_AUC: {roc_auc_score(y_val, y_prob)}")
    print(f"AP: {average_precision_score(y_val,y_prob)}")
    print(f"f1: {f1_score(y_val,y_pred)}")
    print(f"fbeta: {fbeta_score(y_val,y_pred,beta=0.5)}")
    


 n_components: 1
[[16458  3085]
 [ 5666  2799]]
Accuracy: 0.6875535561268209
Precision: 0.4756968048946295
Recall: 0.3306556408741878
ROC_AUC: 0.6599356912055954
AP: 0.442916538727753
f1: 0.39013171649592304
fbeta: 0.4373300834348926

 n_components: 2
[[17812  1731]
 [ 4675  3790]]
Accuracy: 0.7712796343901742
Precision: 0.6864698424198514
Recall: 0.4477259303012404
ROC_AUC: 0.8015614862212301
AP: 0.692981127977704
f1: 0.541970541970542
fbeta: 0.6203149039248421

 n_components: 3
[[18057  1486]
 [ 4562  3903]]
Accuracy: 0.7840616966580977
Precision: 0.7242531081833364
Recall: 0.46107501476668633
ROC_AUC: 0.8189852391770986
AP: 0.7163063438479678
f1: 0.5634473798181031
fbeta: 0.6500449685220345

 n_components: 4
[[17978  1565]
 [ 4447  4018]]
Accuracy: 0.7853470437017995
Precision: 0.7196847572989432
Recall: 0.4746603662138216
ROC_AUC: 0.8279134604931182
AP: 0.7267594234813957
f1: 0.5720387243735763
fbeta: 0.6523362665194661

 n_components: 5
[[18066  1477]
 [ 4393  4072]]
Accuracy: 0.

In [126]:
df_KNN_PCA

Unnamed: 0,1,2,3,4,5,6,8,10,12,15,18,20,25,30,40,47
Accuracy,68.8,77.1,78.4,78.5,79.0,79.1,79.3,79.5,79.3,79.3,79.3,79.2,79.4,79.5,79.6,79.7
Precision,47.6,68.6,72.4,72.0,73.4,73.3,73.2,73.5,73.2,73.3,73.3,73.5,73.7,74.3,74.5,74.6
Recall,33.1,44.8,46.1,47.5,48.1,48.7,49.9,50.2,49.4,49.3,49.5,49.0,49.4,49.4,49.7,49.8
ROC AUC,66.0,80.2,81.9,82.8,83.1,83.3,83.7,83.8,83.6,83.4,83.0,82.7,83.0,83.5,83.8,83.8
AP,44.3,69.3,71.6,72.7,73.1,73.4,73.9,74.0,73.6,73.3,72.5,71.9,72.8,73.6,74.2,74.2
F1 score,39.0,54.2,56.3,57.2,58.1,58.5,59.3,59.7,59.0,58.9,59.1,58.8,59.2,59.3,59.6,59.7
Fbeta score,43.7,62.0,65.0,65.2,66.4,66.5,66.9,67.3,66.8,66.8,66.9,66.8,67.1,67.5,67.7,67.8


For KNN, PCA gives very good values at n_components = 10. Then the values become slightly worse, before they start becoming better again for 
n_components >20. All in all, the ROC AUC score and the average precision are a bit better than in the standard approach in 
Capstone_main.

**Random forest optimized for the Fbeta score**

In [125]:

df_RFC_PCA = pd.DataFrame(components, index=scorer_names)


df_RFC_PCA

Unnamed: 0,1,2,3,4,5,6,8,10,12,15,18,20,25,30,40,47
Accuracy,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Precision,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
Recall,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
ROC AUC,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
AP,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
F1 score,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
Fbeta score,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7


In [127]:
#knn_clf = KNeighborsClassifier(metric='minkowski', n_neighbors= 32, p= 1, weights= 'distance')
rfc_clf=     RandomForestClassifier(random_state=7, max_features='auto',       criterion='gini', max_depth=23, 
       class_weight={0:0.45,1:0.55},   n_estimators=250)
for i in [1,2,3,4,5,6,8,10,12,15#,18,20,25,30,40,47
         ]:
    pca = PCA(n_components =i,random_state=7)
    X_train_scaled_pca = pca.fit_transform(X_train_scaled)
    X_val_scaled_pca = pca.transform(X_val_scaled)
    rfc_clf.fit(X_train_scaled_pca,y_train)
    
    
    y_pred= rfc_clf.predict(X_val_scaled_pca)
    y_prob = rfc_clf.predict_proba(X_val_scaled_pca)[:,1]

    
    df_RFC_PCA.loc['Accuracy',i]=(round(accuracy_score(y_val, y_pred),3))*100
    df_RFC_PCA.loc['Precision',i]=(round(precision_score(y_val, y_pred),3))*100
    
    
    df_RFC_PCA.loc['Recall',i]=(round(recall_score(y_val, y_pred),3))*100
    df_RFC_PCA.loc['ROC AUC',i]=(round(roc_auc_score(y_val, y_prob),3))*100
    df_RFC_PCA.loc['AP',i]=(round(average_precision_score(y_val, y_prob),3))*100
    df_RFC_PCA.loc['F1 score',i]=(round(f1_score(y_val, y_pred),3))*100
    df_RFC_PCA.loc['Fbeta score',i]=(round(fbeta_score(y_val, y_pred, beta=0.5),3))*100

    
    
    
    print(f"\n n_components: {i}")
    cm = metrics.confusion_matrix(y_val,y_pred)
    print(cm)
    print(f"Accuracy: {accuracy_score(y_val, y_pred) }")
    print(f"Precision: {precision_score(y_val, y_pred)}")
    print(f"Recall: {recall_score(y_val, y_pred)}")
    print(f"ROC_AUC: {roc_auc_score(y_val, y_prob)}")
    print(f"AP: {average_precision_score(y_val,y_prob)}")
    print(f"f1: {f1_score(y_val,y_pred)}")
    print(f"fbeta: {fbeta_score(y_val,y_pred,beta=0.5)}")
    


 n_components: 1
[[17523  2020]
 [ 6466  1999]]
Accuracy: 0.697015138531848
Precision: 0.4973874098034337
Recall: 0.23614884819846427
ROC_AUC: 0.6712438553493093
AP: 0.44849019194854356
f1: 0.32024991989746876
fbeta: 0.40727761704902005

 n_components: 2
[[17761  1782]
 [ 5229  3236]]
Accuracy: 0.7496786632390745
Precision: 0.6448784376245517
Recall: 0.38227997637330186
ROC_AUC: 0.7562070088286392
AP: 0.5779723457541125
f1: 0.4800118667952236
fbeta: 0.5669832147738024

 n_components: 3
[[17998  1545]
 [ 4961  3504]]
Accuracy: 0.767709225935447
Precision: 0.6939988116458705
Recall: 0.41393975191966925
ROC_AUC: 0.7854471242008663
AP: 0.6333015954205452
f1: 0.518573331360071
fbeta: 0.6112836258330135

 n_components: 4
[[17900  1643]
 [ 4530  3935]]
Accuracy: 0.7795986860896886
Precision: 0.7054499820724274
Recall: 0.4648552864737153
ROC_AUC: 0.8013134379278868
AP: 0.6633676793538357
f1: 0.5604215623442285
fbeta: 0.6392760827890959

 n_components: 5
[[17980  1563]
 [ 4515  3950]]
Accuracy

In [128]:
df_RFC_PCA

Unnamed: 0,1,2,3,4,5,6,8,10,12,15,18,20,25,30,40,47
Accuracy,69.7,75.0,76.8,78.0,78.3,78.7,79.1,79.1,79.1,79.0,1,1,1,1,1,1
Precision,49.7,64.5,69.4,70.5,71.6,72.9,73.2,72.4,72.7,73.0,2,2,2,2,2,2
Recall,23.6,38.2,41.4,46.5,46.7,47.1,48.4,50.1,49.4,48.4,3,3,3,3,3,3
ROC AUC,67.1,75.6,78.5,80.1,81.0,81.5,82.4,82.5,82.6,82.8,4,4,4,4,4,4
AP,44.8,57.8,63.3,66.3,67.5,68.2,69.6,69.9,70.0,70.2,5,5,5,5,5,5
F1 score,32.0,48.0,51.9,56.0,56.5,57.2,58.3,59.2,58.8,58.2,6,6,6,6,6,6
Fbeta score,40.7,56.7,61.1,63.9,64.7,65.7,66.4,66.5,66.4,66.2,7,7,7,7,7,7


In [129]:
#knn_clf = KNeighborsClassifier(metric='minkowski', n_neighbors= 32, p= 1, weights= 'distance')
rfc_clf=     RandomForestClassifier(random_state=7, max_features='auto',       criterion='gini', max_depth=23, 
       class_weight={0:0.45,1:0.55},   n_estimators=250)
for i in [#1,2,3,4,5,6,8,10,12,15#,
          18,20,25,30,40,47
         ]:
    pca = PCA(n_components =i,random_state=7)
    X_train_scaled_pca = pca.fit_transform(X_train_scaled)
    X_val_scaled_pca = pca.transform(X_val_scaled)
    rfc_clf.fit(X_train_scaled_pca,y_train)
    
    
    y_pred= rfc_clf.predict(X_val_scaled_pca)
    y_prob = rfc_clf.predict_proba(X_val_scaled_pca)[:,1]

    
    df_RFC_PCA.loc['Accuracy',i]=(round(accuracy_score(y_val, y_pred),3))*100
    df_RFC_PCA.loc['Precision',i]=(round(precision_score(y_val, y_pred),3))*100
    
    
    df_RFC_PCA.loc['Recall',i]=(round(recall_score(y_val, y_pred),3))*100
    df_RFC_PCA.loc['ROC AUC',i]=(round(roc_auc_score(y_val, y_prob),3))*100
    df_RFC_PCA.loc['AP',i]=(round(average_precision_score(y_val, y_prob),3))*100
    df_RFC_PCA.loc['F1 score',i]=(round(f1_score(y_val, y_pred),3))*100
    df_RFC_PCA.loc['Fbeta score',i]=(round(fbeta_score(y_val, y_pred, beta=0.5),3))*100

    
    
    
    print(f"\n n_components: {i}")
    cm = metrics.confusion_matrix(y_val,y_pred)
    print(cm)
    print(f"Accuracy: {accuracy_score(y_val, y_pred) }")
    print(f"Precision: {precision_score(y_val, y_pred)}")
    print(f"Recall: {recall_score(y_val, y_pred)}")
    print(f"ROC_AUC: {roc_auc_score(y_val, y_prob)}")
    print(f"AP: {average_precision_score(y_val,y_prob)}")
    print(f"f1: {f1_score(y_val,y_pred)}")
    print(f"fbeta: {fbeta_score(y_val,y_pred,beta=0.5)}")
    


 n_components: 18
[[18096  1447]
 [ 4399  4066]]
Accuracy: 0.7912739217366467
Precision: 0.7375294757845093
Recall: 0.480330773774365
ROC_AUC: 0.8294707365124157
AP: 0.7050210449754184
f1: 0.5817713549864072
fbeta: 0.6661860602287251

 n_components: 20
[[18129  1414]
 [ 4423  4042]]
Accuracy: 0.7915952584975722
Precision: 0.7408357771260997
Recall: 0.4774955699940933
ROC_AUC: 0.829859308229065
AP: 0.705977491050411
f1: 0.5807054090941742
fbeta: 0.6672389316253424

 n_components: 25
[[18208  1335]
 [ 4431  4034]]
Accuracy: 0.7941302485004285
Precision: 0.7513503445706835
Recall: 0.47655050206733607
ROC_AUC: 0.8312681995650225
AP: 0.7112002435898108
f1: 0.5832008095995372
fbeta: 0.6736581944490831

 n_components: 30
[[18266  1277]
 [ 4486  3979]]
Accuracy: 0.7942373607540703
Precision: 0.7570395738203958
Recall: 0.4700531600708801
ROC_AUC: 0.8350536758432848
AP: 0.7165820767763721
f1: 0.5799868814226369
fbeta: 0.6746583471803046

 n_components: 40
[[18383  1160]
 [ 4487  3978]]
Accuracy

In [130]:
df_RFC_PCA

Unnamed: 0,1,2,3,4,5,6,8,10,12,15,18,20,25,30,40,47
Accuracy,69.7,75.0,76.8,78.0,78.3,78.7,79.1,79.1,79.1,79.0,79.1,79.2,79.4,79.4,79.8,79.8
Precision,49.7,64.5,69.4,70.5,71.6,72.9,73.2,72.4,72.7,73.0,73.8,74.1,75.1,75.7,77.4,78.2
Recall,23.6,38.2,41.4,46.5,46.7,47.1,48.4,50.1,49.4,48.4,48.0,47.7,47.7,47.0,47.0,45.8
ROC AUC,67.1,75.6,78.5,80.1,81.0,81.5,82.4,82.5,82.6,82.8,82.9,83.0,83.1,83.5,84.0,84.1
AP,44.8,57.8,63.3,66.3,67.5,68.2,69.6,69.9,70.0,70.2,70.5,70.6,71.1,71.7,72.7,72.9
F1 score,32.0,48.0,51.9,56.0,56.5,57.2,58.3,59.2,58.8,58.2,58.2,58.1,58.3,58.0,58.5,57.8
Fbeta score,40.7,56.7,61.1,63.9,64.7,65.7,66.4,66.5,66.4,66.2,66.6,66.7,67.4,67.5,68.5,68.5


In contrast to K nearest neighbors, the scores do not have a strongly pronounced local maximum at n_components =10, 
but continue to rise for larger n_components. At n_components = 20 or n_components = 30, the values are comparable 
to the ones of the standard approach. 

**XGBoost optimized for accuracy**

In [131]:
df_XGB_accuracy_PCA = pd.DataFrame(components, index=scorer_names)


df_XGB_accuracy_PCA

Unnamed: 0,1,2,3,4,5,6,8,10,12,15,18,20,25,30,40,47
Accuracy,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Precision,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
Recall,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
ROC AUC,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
AP,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
F1 score,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
Fbeta score,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7


In [132]:
#knn_clf = KNeighborsClassifier(metric='minkowski', n_neighbors= 32, p= 1, weights= 'distance')
#rfc_clf=     RandomForestClassifier(random_state=7, max_features='auto',       criterion='gini', max_depth=23, 
#       class_weight={0:0.45,1:0.55},   n_estimators=250)


#XGBClassifier(random_state=7, scale_pos_weight=1, 
#                     max_depth=19, min_child_weight=1, 
#                     gamma=0,
#                     colsample_bytree=0.7,reg_alpha= 0.0001, subsample=1, )

from xgboost import XGBClassifier
xgb_clf_accuracy = XGBClassifier(random_state=7, scale_pos_weight=1, 
                     max_depth=19, min_child_weight=1, 
                     gamma=0,
                     colsample_bytree=0.7,reg_alpha= 0.0001, subsample=1, )

for i in [1,2,3,4,5,6,8,10,12,15#,
          #18,20,25,30,40,47
         ]:
    pca = PCA(n_components =i,random_state=7)
    X_train_scaled_pca = pca.fit_transform(X_train_scaled)
    X_val_scaled_pca = pca.transform(X_val_scaled)
    xgb_clf_accuracy.fit(X_train_scaled_pca,y_train)
    
    
    y_pred= xgb_clf_accuracy.predict(X_val_scaled_pca)
    y_prob = xgb_clf_accuracy.predict_proba(X_val_scaled_pca)[:,1]

    
    df_XGB_accuracy_PCA.loc['Accuracy',i]=(round(accuracy_score(y_val, y_pred),3))*100
    df_XGB_accuracy_PCA.loc['Precision',i]=(round(precision_score(y_val, y_pred),3))*100
    
    
    df_XGB_accuracy_PCA.loc['Recall',i]=(round(recall_score(y_val, y_pred),3))*100
    df_XGB_accuracy_PCA.loc['ROC AUC',i]=(round(roc_auc_score(y_val, y_prob),3))*100
    df_XGB_accuracy_PCA.loc['AP',i]=(round(average_precision_score(y_val, y_prob),3))*100
    df_XGB_accuracy_PCA.loc['F1 score',i]=(round(f1_score(y_val, y_pred),3))*100
    df_XGB_accuracy_PCA.loc['Fbeta score',i]=(round(fbeta_score(y_val, y_pred, beta=0.5),3))*100

    
    
    
    print(f"\n n_components: {i}")
    cm = metrics.confusion_matrix(y_val,y_pred)
    print(cm)
    print(f"Accuracy: {accuracy_score(y_val, y_pred) }")
    print(f"Precision: {precision_score(y_val, y_pred)}")
    print(f"Recall: {recall_score(y_val, y_pred)}")
    print(f"ROC_AUC: {roc_auc_score(y_val, y_prob)}")
    print(f"AP: {average_precision_score(y_val,y_prob)}")
    print(f"f1: {f1_score(y_val,y_pred)}")
    print(f"fbeta: {fbeta_score(y_val,y_pred,beta=0.5)}")
    


 n_components: 1
[[18503  1040]
 [ 7292  1173]]
Accuracy: 0.702513567552128
Precision: 0.5300497062810664
Recall: 0.13857058476077969
ROC_AUC: 0.6695945775017025
AP: 0.44751701909264496
f1: 0.21970406443154153
fbeta: 0.3386845296529422

 n_components: 2
[[18070  1473]
 [ 6652  1813]]
Accuracy: 0.7099043130534133
Precision: 0.5517346317711503
Recall: 0.21417601890135854
ROC_AUC: 0.6844862763284585
AP: 0.472711335528793
f1: 0.3085694834482172
fbeta: 0.41950113378684806

 n_components: 3
[[18047  1496]
 [ 5640  2825]]
Accuracy: 0.7452156526706655
Precision: 0.6537838463318676
Recall: 0.3337271116361488
ROC_AUC: 0.7523311084143923
AP: 0.5741191087969879
f1: 0.441889566713593
fbeta: 0.5485649928152548

 n_components: 4
[[18098  1445]
 [ 5385  3080]]
Accuracy: 0.7561411025421308
Precision: 0.6806629834254143
Recall: 0.3638511518015357
ROC_AUC: 0.765706348721566
AP: 0.6044434732903861
f1: 0.4742109314857582
fbeta: 0.5797101449275361

 n_components: 5
[[18032  1511]
 [ 4994  3471]]
Accuracy: 

In [133]:
#knn_clf = KNeighborsClassifier(metric='minkowski', n_neighbors= 32, p= 1, weights= 'distance')
#rfc_clf=     RandomForestClassifier(random_state=7, max_features='auto',       criterion='gini', max_depth=23, 
#       class_weight={0:0.45,1:0.55},   n_estimators=250)


#XGBClassifier(random_state=7, scale_pos_weight=1, 
#                     max_depth=19, min_child_weight=1, 
#                     gamma=0,
#                     colsample_bytree=0.7,reg_alpha= 0.0001, subsample=1, )

from xgboost import XGBClassifier
xgb_clf_accuracy = XGBClassifier(random_state=7, scale_pos_weight=1, 
                     max_depth=19, min_child_weight=1, 
                     gamma=0,
                     colsample_bytree=0.7,reg_alpha= 0.0001, subsample=1, )

for i in [#1,2,3,4,5,6,8,10,12,15#,
          18,20,25,30,40,47
         ]:
    pca = PCA(n_components =i,random_state=7)
    X_train_scaled_pca = pca.fit_transform(X_train_scaled)
    X_val_scaled_pca = pca.transform(X_val_scaled)
    xgb_clf_accuracy.fit(X_train_scaled_pca,y_train)
    
    
    y_pred= xgb_clf_accuracy.predict(X_val_scaled_pca)
    y_prob = xgb_clf_accuracy.predict_proba(X_val_scaled_pca)[:,1]

    
    df_XGB_accuracy_PCA.loc['Accuracy',i]=(round(accuracy_score(y_val, y_pred),3))*100
    df_XGB_accuracy_PCA.loc['Precision',i]=(round(precision_score(y_val, y_pred),3))*100
    
    
    df_XGB_accuracy_PCA.loc['Recall',i]=(round(recall_score(y_val, y_pred),3))*100
    df_XGB_accuracy_PCA.loc['ROC AUC',i]=(round(roc_auc_score(y_val, y_prob),3))*100
    df_XGB_accuracy_PCA.loc['AP',i]=(round(average_precision_score(y_val, y_prob),3))*100
    df_XGB_accuracy_PCA.loc['F1 score',i]=(round(f1_score(y_val, y_pred),3))*100
    df_XGB_accuracy_PCA.loc['Fbeta score',i]=(round(fbeta_score(y_val, y_pred, beta=0.5),3))*100

    
    
    
    print(f"\n n_components: {i}")
    cm = metrics.confusion_matrix(y_val,y_pred)
    print(cm)
    print(f"Accuracy: {accuracy_score(y_val, y_pred) }")
    print(f"Precision: {precision_score(y_val, y_pred)}")
    print(f"Recall: {recall_score(y_val, y_pred)}")
    print(f"ROC_AUC: {roc_auc_score(y_val, y_prob)}")
    print(f"AP: {average_precision_score(y_val,y_prob)}")
    print(f"f1: {f1_score(y_val,y_pred)}")
    print(f"fbeta: {fbeta_score(y_val,y_pred,beta=0.5)}")
    


 n_components: 18
[[17982  1561]
 [ 4178  4287]]
Accuracy: 0.7950942587832048
Precision: 0.7330711354309165
Recall: 0.5064382752510337
ROC_AUC: 0.8231495006437559
AP: 0.7042828996435695
f1: 0.5990358415426535
fbeta: 0.6728505508993314

 n_components: 20
[[17950  1593]
 [ 4235  4230]]
Accuracy: 0.7919165952584976
Precision: 0.7264296754250387
Recall: 0.49970466627288834
ROC_AUC: 0.821615355044697
AP: 0.7022037676271786
f1: 0.5921052631578948
fbeta: 0.6659948987624775

 n_components: 25
[[18066  1477]
 [ 4180  4285]]
Accuracy: 0.7980219937160811
Precision: 0.7436653939604304
Recall: 0.5062020082693444
ROC_AUC: 0.8270668713959213
AP: 0.7140104868977102
f1: 0.6023757643916496
fbeta: 0.6798781455272427

 n_components: 30
[[18057  1486]
 [ 4197  4268]]
Accuracy: 0.797093687517852
Precision: 0.7417448731317344
Recall: 0.5041937389249852
ROC_AUC: 0.8305673384623647
AP: 0.7210427866520837
f1: 0.6003235107954146
fbeta: 0.6778691909405674

 n_components: 40
[[18078  1465]
 [ 4169  4296]]
Accurac

In [134]:
df_XGB_accuracy_PCA

Unnamed: 0,1,2,3,4,5,6,8,10,12,15,18,20,25,30,40,47
Accuracy,70.3,71.0,74.5,75.6,76.8,77.5,78.6,79.0,79.1,79.2,79.5,79.2,79.8,79.7,79.9,79.8
Precision,53.0,55.2,65.4,68.1,69.7,71.0,72.4,72.0,72.2,72.6,73.3,72.6,74.4,74.2,74.6,74.6
Recall,13.9,21.4,33.4,36.4,41.0,43.4,47.4,50.1,50.0,50.0,50.6,50.0,50.6,50.4,50.8,50.3
ROC AUC,67.0,68.4,75.2,76.6,78.8,79.9,81.4,81.4,81.9,82.1,82.3,82.2,82.7,83.1,83.6,83.7
AP,44.8,47.3,57.4,60.4,63.4,65.2,68.1,68.4,69.3,69.7,70.4,70.2,71.4,72.1,73.3,73.6
F1 score,22.0,30.9,44.2,47.4,51.6,53.9,57.3,59.1,59.1,59.2,59.9,59.2,60.2,60.0,60.4,60.1
Fbeta score,33.9,42.0,54.9,58.0,61.1,63.0,65.5,66.2,66.3,66.6,67.3,66.6,68.0,67.8,68.2,68.0


The scores have a slight local minimum around n_components = 20, but apart from that
they rise with rising n_components. The values at n_components = 20 are slightly worse than in the 
standard approach in Capstone_main.

**XGBoost optimized for the Fbeta score**

In [135]:
df_XGB_fbeta_PCA = pd.DataFrame(components, index=scorer_names)


df_XGB_fbeta_PCA

Unnamed: 0,1,2,3,4,5,6,8,10,12,15,18,20,25,30,40,47
Accuracy,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Precision,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
Recall,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
ROC AUC,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
AP,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
F1 score,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
Fbeta score,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7


In [136]:
#knn_clf = KNeighborsClassifier(metric='minkowski', n_neighbors= 32, p= 1, weights= 'distance')
#rfc_clf=     RandomForestClassifier(random_state=7, max_features='auto',       criterion='gini', max_depth=23, 
#       class_weight={0:0.45,1:0.55},   n_estimators=250)


#XGBClassifier(random_state=7, scale_pos_weight=1, 
#                     max_depth=19, min_child_weight=1, 
#                     gamma=0,
#                     colsample_bytree=0.7,reg_alpha= 0.0001, subsample=1, )

from xgboost import XGBClassifier

xgb_clf_fbeta = XGBClassifier(random_state=7, scale_pos_weight=0.65, 
                     max_depth=19, min_child_weight=1, 
                     gamma=0,
                     colsample_bytree=0.7, reg_alpha=0.000001,subsample=1, )

# = XGBClassifier(random_state=7, scale_pos_weight=1, 
#                     max_depth=19, min_child_weight=1, 
#                     gamma=0,
#                     colsample_bytree=0.7,reg_alpha= 0.0001, subsample=1, )

for i in [1,2,3,4,5,6,8,10,12,15#,
          #18,20,25,30,40,47
         ]:
    pca = PCA(n_components =i,random_state=7)
    X_train_scaled_pca = pca.fit_transform(X_train_scaled)
    X_val_scaled_pca = pca.transform(X_val_scaled)
    xgb_clf_fbeta.fit(X_train_scaled_pca,y_train)
    
    
    y_pred= xgb_clf_fbeta.predict(X_val_scaled_pca)
    y_prob = xgb_clf_fbeta.predict_proba(X_val_scaled_pca)[:,1]

    
    df_XGB_fbeta_PCA.loc['Accuracy',i]=(round(accuracy_score(y_val, y_pred),3))*100
    df_XGB_fbeta_PCA.loc['Precision',i]=(round(precision_score(y_val, y_pred),3))*100
    
    
    df_XGB_fbeta_PCA.loc['Recall',i]=(round(recall_score(y_val, y_pred),3))*100
    df_XGB_fbeta_PCA.loc['ROC AUC',i]=(round(roc_auc_score(y_val, y_prob),3))*100
    df_XGB_fbeta_PCA.loc['AP',i]=(round(average_precision_score(y_val, y_prob),3))*100
    df_XGB_fbeta_PCA.loc['F1 score',i]=(round(f1_score(y_val, y_pred),3))*100
    df_XGB_fbeta_PCA.loc['Fbeta score',i]=(round(fbeta_score(y_val, y_pred, beta=0.5),3))*100

    
    
    
    print(f"\n n_components: {i}")
    cm = metrics.confusion_matrix(y_val,y_pred)
    print(cm)
    print(f"Accuracy: {accuracy_score(y_val, y_pred) }")
    print(f"Precision: {precision_score(y_val, y_pred)}")
    print(f"Recall: {recall_score(y_val, y_pred)}")
    print(f"ROC_AUC: {roc_auc_score(y_val, y_prob)}")
    print(f"AP: {average_precision_score(y_val,y_prob)}")
    print(f"f1: {f1_score(y_val,y_pred)}")
    print(f"fbeta: {fbeta_score(y_val,y_pred,beta=0.5)}")
    


 n_components: 1
[[19182   361]
 [ 7997   468]]
Accuracy: 0.7015852613538989
Precision: 0.5645355850422196
Recall: 0.05528647371529829
ROC_AUC: 0.6689620226184863
AP: 0.4458546893042875
f1: 0.10071013557133635
fbeta: 0.19862490450725745

 n_components: 2
[[18888   655]
 [ 7537   928]]
Accuracy: 0.707512139388746
Precision: 0.5862286797220467
Recall: 0.10962787950383934
ROC_AUC: 0.6845484501001456
AP: 0.470323481578791
f1: 0.18471337579617833
fbeta: 0.31357707643441235

 n_components: 3
[[18728   815]
 [ 6650  1815]]
Accuracy: 0.733469008854613
Precision: 0.6901140684410646
Recall: 0.21441228588304784
ROC_AUC: 0.7480053782987333
AP: 0.5678140547736685
f1: 0.32717440288418204
fbeta: 0.4780089544377139

 n_components: 4
[[18700   843]
 [ 6355  2110]]
Accuracy: 0.7430019994287347
Precision: 0.7145275990518117
Recall: 0.2492616656822209
ROC_AUC: 0.7608897417024492
AP: 0.5927061307207357
f1: 0.3695918724820459
fbeta: 0.5202939290822113

 n_components: 5
[[18651   892]
 [ 5976  2489]]
Accura

In [137]:
#knn_clf = KNeighborsClassifier(metric='minkowski', n_neighbors= 32, p= 1, weights= 'distance')
#rfc_clf=     RandomForestClassifier(random_state=7, max_features='auto',       criterion='gini', max_depth=23, 
#       class_weight={0:0.45,1:0.55},   n_estimators=250)


#XGBClassifier(random_state=7, scale_pos_weight=1, 
#                     max_depth=19, min_child_weight=1, 
#                     gamma=0,
#                     colsample_bytree=0.7,reg_alpha= 0.0001, subsample=1, )

from xgboost import XGBClassifier

xgb_clf_fbeta = XGBClassifier(random_state=7, scale_pos_weight=0.65, 
                     max_depth=19, min_child_weight=1, 
                     gamma=0,
                     colsample_bytree=0.7, reg_alpha=0.000001,subsample=1, )

# = XGBClassifier(random_state=7, scale_pos_weight=1, 
#                     max_depth=19, min_child_weight=1, 
#                     gamma=0,
#                     colsample_bytree=0.7,reg_alpha= 0.0001, subsample=1, )

for i in [#1,2,3,4,5,6,8,10,12,15#,
          18,20,25,30,40,47
         ]:
    pca = PCA(n_components =i,random_state=7)
    X_train_scaled_pca = pca.fit_transform(X_train_scaled)
    X_val_scaled_pca = pca.transform(X_val_scaled)
    xgb_clf_fbeta.fit(X_train_scaled_pca,y_train)
    
    
    y_pred= xgb_clf_fbeta.predict(X_val_scaled_pca)
    y_prob = xgb_clf_fbeta.predict_proba(X_val_scaled_pca)[:,1]

    
    df_XGB_fbeta_PCA.loc['Accuracy',i]=(round(accuracy_score(y_val, y_pred),3))*100
    df_XGB_fbeta_PCA.loc['Precision',i]=(round(precision_score(y_val, y_pred),3))*100
    
    
    df_XGB_fbeta_PCA.loc['Recall',i]=(round(recall_score(y_val, y_pred),3))*100
    df_XGB_fbeta_PCA.loc['ROC AUC',i]=(round(roc_auc_score(y_val, y_prob),3))*100
    df_XGB_fbeta_PCA.loc['AP',i]=(round(average_precision_score(y_val, y_prob),3))*100
    df_XGB_fbeta_PCA.loc['F1 score',i]=(round(f1_score(y_val, y_pred),3))*100
    df_XGB_fbeta_PCA.loc['Fbeta score',i]=(round(fbeta_score(y_val, y_pred, beta=0.5),3))*100

    
    
    
    print(f"\n n_components: {i}")
    cm = metrics.confusion_matrix(y_val,y_pred)
    print(cm)
    print(f"Accuracy: {accuracy_score(y_val, y_pred) }")
    print(f"Precision: {precision_score(y_val, y_pred)}")
    print(f"Recall: {recall_score(y_val, y_pred)}")
    print(f"ROC_AUC: {roc_auc_score(y_val, y_prob)}")
    print(f"AP: {average_precision_score(y_val,y_prob)}")
    print(f"f1: {f1_score(y_val,y_pred)}")
    print(f"fbeta: {fbeta_score(y_val,y_pred,beta=0.5)}")
    


 n_components: 18
[[18388  1155]
 [ 4748  3717]]
Accuracy: 0.7892387889174521
Precision: 0.7629310344827587
Recall: 0.43910218546958063
ROC_AUC: 0.8209280766035512
AP: 0.6971135879994876
f1: 0.5573967159031267
fbeta: 0.6648660251135836

 n_components: 20
[[18331  1212]
 [ 4772  3693]]
Accuracy: 0.7863467580691231
Precision: 0.7529051987767584
Recall: 0.43626698168930894
ROC_AUC: 0.820121634033471
AP: 0.6958461651274068
f1: 0.5524308152580405
fbeta: 0.6574683995015133

 n_components: 25
[[18395  1148]
 [ 4662  3803]]
Accuracy: 0.7925592687803484
Precision: 0.7681276509796001
Recall: 0.4492616656822209
ROC_AUC: 0.825906258055638
AP: 0.708997298542237
f1: 0.5669350029815147
fbeta: 0.6726449467614701

 n_components: 30
[[18378  1165]
 [ 4559  3906]]
Accuracy: 0.7956298200514139
Precision: 0.7702622756852692
Recall: 0.4614294152392203
ROC_AUC: 0.82957139449172
AP: 0.7169881820599572
f1: 0.5771276595744681
fbeta: 0.679327976625274

 n_components: 40
[[18430  1113]
 [ 4504  3961]]
Accuracy: 

In [138]:
df_XGB_fbeta_PCA

Unnamed: 0,1,2,3,4,5,6,8,10,12,15,18,20,25,30,40,47
Accuracy,70.2,70.8,73.3,74.3,75.5,76.1,77.3,78.0,78.2,78.8,78.9,78.6,79.3,79.6,79.9,79.9
Precision,56.5,58.6,69.0,71.5,73.6,74.4,75.0,75.0,75.4,76.2,76.3,75.3,76.8,77.0,78.1,78.0
Recall,5.5,11.0,21.4,24.9,29.4,31.9,37.3,40.8,41.5,43.2,43.9,43.6,44.9,46.1,46.8,46.7
ROC AUC,66.9,68.5,74.8,76.1,78.5,79.4,80.8,81.1,81.6,81.9,82.1,82.0,82.6,83.0,83.6,83.8
AP,44.6,47.0,56.8,59.3,62.7,64.3,66.9,67.3,68.5,69.2,69.7,69.6,70.9,71.7,73.1,73.4
F1 score,10.1,18.5,32.7,37.0,42.0,44.6,49.8,52.8,53.6,55.2,55.7,55.2,56.7,57.7,58.5,58.4
Fbeta score,19.9,31.4,47.8,52.0,56.6,58.8,62.4,64.2,64.8,66.1,66.5,65.7,67.3,67.9,68.9,68.8


Again the scores have a slight local minimum near n_components = 20, where they are worse than in the 
standard approach of Capstone_main. For larger n_components, the scores still increase.

**XGBoost optimized for the ROC AUC score**

In [139]:
df_XGB_roc_auc_PCA = pd.DataFrame(components, index=scorer_names)


df_XGB_roc_auc_PCA

Unnamed: 0,1,2,3,4,5,6,8,10,12,15,18,20,25,30,40,47
Accuracy,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Precision,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
Recall,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
ROC AUC,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
AP,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
F1 score,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
Fbeta score,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7


In [140]:
#knn_clf = KNeighborsClassifier(metric='minkowski', n_neighbors= 32, p= 1, weights= 'distance')
#rfc_clf=     RandomForestClassifier(random_state=7, max_features='auto',       criterion='gini', max_depth=23, 
#       class_weight={0:0.45,1:0.55},   n_estimators=250)


#XGBClassifier(random_state=7, scale_pos_weight=1, 
#                     max_depth=19, min_child_weight=1, 
#                     gamma=0,
#                     colsample_bytree=0.7,reg_alpha= 0.0001, subsample=1, )

from xgboost import XGBClassifier
xgb_clf_roc_auc = XGBClassifier(random_state=7, scale_pos_weight=18, 
                     max_depth=47, min_child_weight=2, 
                     gamma=0,
                     colsample_bytree=1,reg_alpha= 0.00001, subsample=0.9, )

for i in [1,2,3,4,5,6,8,10,12,15#,
          #18,20,25,30,40,47
         ]:
    pca = PCA(n_components =i,random_state=7)
    X_train_scaled_pca = pca.fit_transform(X_train_scaled)
    X_val_scaled_pca = pca.transform(X_val_scaled)
    xgb_clf_roc_auc.fit(X_train_scaled_pca,y_train)
    
    
    y_pred= xgb_clf_roc_auc.predict(X_val_scaled_pca)
    y_prob = xgb_clf_roc_auc.predict_proba(X_val_scaled_pca)[:,1]

    
    df_XGB_roc_auc_PCA.loc['Accuracy',i]=(round(accuracy_score(y_val, y_pred),3))*100
    df_XGB_roc_auc_PCA.loc['Precision',i]=(round(precision_score(y_val, y_pred),3))*100
    
    
    df_XGB_roc_auc_PCA.loc['Recall',i]=(round(recall_score(y_val, y_pred),3))*100
    df_XGB_roc_auc_PCA.loc['ROC AUC',i]=(round(roc_auc_score(y_val, y_prob),3))*100
    df_XGB_roc_auc_PCA.loc['AP',i]=(round(average_precision_score(y_val, y_prob),3))*100
    df_XGB_roc_auc_PCA.loc['F1 score',i]=(round(f1_score(y_val, y_pred),3))*100
    df_XGB_roc_auc_PCA.loc['Fbeta score',i]=(round(fbeta_score(y_val, y_pred, beta=0.5),3))*100

    
    
    
    print(f"\n n_components: {i}")
    cm = metrics.confusion_matrix(y_val,y_pred)
    print(cm)
    print(f"Accuracy: {accuracy_score(y_val, y_pred) }")
    print(f"Precision: {precision_score(y_val, y_pred)}")
    print(f"Recall: {recall_score(y_val, y_pred)}")
    print(f"ROC_AUC: {roc_auc_score(y_val, y_prob)}")
    print(f"AP: {average_precision_score(y_val,y_prob)}")
    print(f"f1: {f1_score(y_val,y_pred)}")
    print(f"fbeta: {fbeta_score(y_val,y_pred,beta=0.5)}")
    


 n_components: 1
[[ 6233 13310]
 [ 1124  7341]]
Accuracy: 0.48464724364467293
Precision: 0.3554791535518861
Recall: 0.8672179562906084
ROC_AUC: 0.6569551976786524
AP: 0.4378168828972317
f1: 0.5042588267619178
fbeta: 0.4030460420121007

 n_components: 2
[[10507  9036]
 [ 1893  6572]]
Accuracy: 0.6097900599828621
Precision: 0.4210661199384931
Recall: 0.7763733018310691
ROC_AUC: 0.7380206985374822
AP: 0.5613966221183779
f1: 0.5460058987247124
fbeta: 0.46348928727590727

 n_components: 3
[[12317  7226]
 [ 2097  6368]]
Accuracy: 0.6671308197657813
Precision: 0.4684419596880977
Recall: 0.7522740696987597
ROC_AUC: 0.7715383367598776
AP: 0.6280369048541143
f1: 0.5773607144476178
fbeta: 0.50667557804618

 n_components: 4
[[13123  6420]
 [ 2128  6337]]
Accuracy: 0.6948014852899171
Precision: 0.4967468840636513
Recall: 0.7486119314825753
ROC_AUC: 0.7893814264327358
AP: 0.6631892357220337
f1: 0.5972104419941571
fbeta: 0.532583665305162

 n_components: 5
[[13779  5764]
 [ 2212  6253]]
Accuracy: 0.

In [141]:
#knn_clf = KNeighborsClassifier(metric='minkowski', n_neighbors= 32, p= 1, weights= 'distance')
#rfc_clf=     RandomForestClassifier(random_state=7, max_features='auto',       criterion='gini', max_depth=23, 
#       class_weight={0:0.45,1:0.55},   n_estimators=250)


#XGBClassifier(random_state=7, scale_pos_weight=1, 
#                     max_depth=19, min_child_weight=1, 
#                     gamma=0,
#                     colsample_bytree=0.7,reg_alpha= 0.0001, subsample=1, )

from xgboost import XGBClassifier
xgb_clf_roc_auc = XGBClassifier(random_state=7, scale_pos_weight=18, 
                     max_depth=47, min_child_weight=2, 
                     gamma=0,
                     colsample_bytree=1,reg_alpha= 0.00001, subsample=0.9, )

for i in [#1,2,3,4,5,6,8,10,12,15#,
          18,20,25,30,40,47
         ]:
    pca = PCA(n_components =i,random_state=7)
    X_train_scaled_pca = pca.fit_transform(X_train_scaled)
    X_val_scaled_pca = pca.transform(X_val_scaled)
    xgb_clf_roc_auc.fit(X_train_scaled_pca,y_train)
    
    
    y_pred= xgb_clf_roc_auc.predict(X_val_scaled_pca)
    y_prob = xgb_clf_roc_auc.predict_proba(X_val_scaled_pca)[:,1]

    
    df_XGB_roc_auc_PCA.loc['Accuracy',i]=(round(accuracy_score(y_val, y_pred),3))*100
    df_XGB_roc_auc_PCA.loc['Precision',i]=(round(precision_score(y_val, y_pred),3))*100
    
    
    df_XGB_roc_auc_PCA.loc['Recall',i]=(round(recall_score(y_val, y_pred),3))*100
    df_XGB_roc_auc_PCA.loc['ROC AUC',i]=(round(roc_auc_score(y_val, y_prob),3))*100
    df_XGB_roc_auc_PCA.loc['AP',i]=(round(average_precision_score(y_val, y_prob),3))*100
    df_XGB_roc_auc_PCA.loc['F1 score',i]=(round(f1_score(y_val, y_pred),3))*100
    df_XGB_roc_auc_PCA.loc['Fbeta score',i]=(round(fbeta_score(y_val, y_pred, beta=0.5),3))*100

    
    
    
    print(f"\n n_components: {i}")
    cm = metrics.confusion_matrix(y_val,y_pred)
    print(cm)
    print(f"Accuracy: {accuracy_score(y_val, y_pred) }")
    print(f"Precision: {precision_score(y_val, y_pred)}")
    print(f"Recall: {recall_score(y_val, y_pred)}")
    print(f"ROC_AUC: {roc_auc_score(y_val, y_prob)}")
    print(f"AP: {average_precision_score(y_val,y_prob)}")
    print(f"f1: {f1_score(y_val,y_pred)}")
    print(f"fbeta: {fbeta_score(y_val,y_pred,beta=0.5)}")
    


 n_components: 18
[[15496  4047]
 [ 2631  5834]]
Accuracy: 0.7615681233933161
Precision: 0.5904260702358061
Recall: 0.6891907855877141
ROC_AUC: 0.8234718939099233
AP: 0.7157150424990667
f1: 0.6359969475635016
fbeta: 0.6078476317489424

 n_components: 20
[[15521  4022]
 [ 2621  5844]]
Accuracy: 0.7628177663524707
Precision: 0.5923373200891953
Recall: 0.6903721204961607
ROC_AUC: 0.8249700246014219
AP: 0.7179134588411031
f1: 0.637608422890186
fbeta: 0.6096517765862004

 n_components: 25
[[15581  3962]
 [ 2681  5784]]
Accuracy: 0.7628177663524707
Precision: 0.593474245844449
Recall: 0.6832841110454814
ROC_AUC: 0.8255183573115868
AP: 0.7209110753849517
f1: 0.6352204711438142
fbeta: 0.6094965120445108

 n_components: 30
[[15645  3898]
 [ 2693  5772]]
Accuracy: 0.7646743787489289
Precision: 0.5968976215098242
Recall: 0.6818665091553455
ROC_AUC: 0.8288261101672327
AP: 0.7261023590743101
f1: 0.6365591397849463
fbeta: 0.6121539930003181

 n_components: 40
[[15827  3716]
 [ 2650  5815]]
Accuracy

In [142]:
df_XGB_roc_auc_PCA

Unnamed: 0,1,2,3,4,5,6,8,10,12,15,18,20,25,30,40,47
Accuracy,48.5,61.0,66.7,69.5,71.5,72.0,74.0,74.8,75.0,75.5,76.2,76.3,76.3,76.5,77.3,77.2
Precision,35.5,42.1,46.8,49.7,52.0,52.6,55.4,56.6,57.0,57.8,59.0,59.2,59.3,59.7,61.0,61.1
Recall,86.7,77.6,75.2,74.9,73.9,72.5,71.2,71.5,70.4,69.7,68.9,69.0,68.3,68.2,68.7,67.9
ROC AUC,65.7,73.8,77.2,78.9,80.0,80.4,81.5,81.7,81.9,82.2,82.3,82.5,82.6,82.9,83.6,83.5
AP,43.8,56.1,62.8,66.3,67.6,68.3,70.3,70.6,70.8,71.3,71.6,71.8,72.1,72.6,73.7,73.6
F1 score,50.4,54.6,57.7,59.7,61.1,61.0,62.3,63.2,63.0,63.2,63.6,63.8,63.5,63.7,64.6,64.3
Fbeta score,40.3,46.3,50.7,53.3,55.3,55.7,58.0,59.0,59.3,59.9,60.8,61.0,60.9,61.2,62.4,62.4


The scores now seem to have a local maximum at n_components = 40. At n_components they are slightly worse 
than in the standard approach of Capstone_main.

## Voting classifier

We combine some models with n_components = 20 into a voting classifier.

**Hard voting**

In [144]:
from sklearn.ensemble import VotingClassifier
#rfc_clf = RandomForestClassifier(random_state=7, n_estimators = 200, max_features='auto', max_depth=32, 
#                            criterion='entropy', class_weight= {0: 0.42, 1: 0.58})
#xgb_clf = XGBClassifier(random_state=7, scale_pos_weight=0.9, max_depth=19, min_child_weight=2, gamma=0, 
#                     subsample=1, colsample_bytree=0.5, reg_alpha= 0.0001)
#knn_clf = KNeighborsClassifier(metric='minkowski', n_neighbors= 32, p= 1, weights= 'distance')

voting_clf = VotingClassifier(estimators = [('rf',rfc_clf), ('xg',xgb_clf_accuracy), ('kn',knn_clf)], voting = 'hard')

pca = PCA(n_components =20,random_state=7)
X_train_scaled_pca = pca.fit_transform(X_train_scaled)
X_val_scaled_pca = pca.transform(X_val_scaled)
    #xgb_clf_roc_auc.fit(X_train_scaled_pca,y_train)
    
    
    #y_pred= xgb_clf_roc_auc.predict(X_val_scaled_pca)
    #y_prob = xgb_clf_roc_auc.predict_proba(X_val_scaled_pca)[:,1]

voting_clf.fit(X_train_scaled_pca,y_train)
y_pred = voting_clf.predict(X_val_scaled_pca)
#y_prob = voting_clf.predict_proba(X_val_scaled_pca)[:,1]


cm = metrics.confusion_matrix(y_val,y_pred)
print(cm)
print(f"Accuracy: {accuracy_score(y_val, y_pred) }")
print(f"Precision: {precision_score(y_val, y_pred)}")
print(f"Recall: {recall_score(y_val, y_pred)}")
#print(f"ROC_AUC: {roc_auc_score(y_val, y_prob)}")
#print(f"AP: {average_precision_score(y_val,y_prob)}")
print(f"f1: {f1_score(y_val,y_pred)}")
print(f"fbeta: {fbeta_score(y_val,y_pred,beta=0.5)}")  

[[18130  1413]
 [ 4297  4168]]
Accuracy: 0.7961296772350757
Precision: 0.7468195663859524
Recall: 0.4923803898405198
f1: 0.5934785704115051
fbeta: 0.6768651141641496


The values are good, but not as good as some models in the standard approach in Capstone_main.

**Soft voting**

In [145]:
from sklearn.ensemble import VotingClassifier
#rfc_clf = RandomForestClassifier(random_state=7, n_estimators = 200, max_features='auto', max_depth=32, 
#                            criterion='entropy', class_weight= {0: 0.42, 1: 0.58})
#xgb_clf = XGBClassifier(random_state=7, scale_pos_weight=0.9, max_depth=19, min_child_weight=2, gamma=0, 
#                     subsample=1, colsample_bytree=0.5, reg_alpha= 0.0001)
#knn_clf = KNeighborsClassifier(metric='minkowski', n_neighbors= 32, p= 1, weights= 'distance')

voting_clf = VotingClassifier(estimators = [('rf',rfc_clf), ('xg',xgb_clf_accuracy), ('kn',knn_clf)], voting = 'soft')

pca = PCA(n_components =20,random_state=7)
X_train_scaled_pca = pca.fit_transform(X_train_scaled)
X_val_scaled_pca = pca.transform(X_val_scaled)
    #xgb_clf_roc_auc.fit(X_train_scaled_pca,y_train)
    
    
    #y_pred= xgb_clf_roc_auc.predict(X_val_scaled_pca)
    #y_prob = xgb_clf_roc_auc.predict_proba(X_val_scaled_pca)[:,1]

voting_clf.fit(X_train_scaled_pca,y_train)
y_pred = voting_clf.predict(X_val_scaled_pca)
y_prob = voting_clf.predict_proba(X_val_scaled_pca)[:,1]


cm = metrics.confusion_matrix(y_val,y_pred)
print(cm)
print(f"Accuracy: {accuracy_score(y_val, y_pred) }")
print(f"Precision: {precision_score(y_val, y_pred)}")
print(f"Recall: {recall_score(y_val, y_pred)}")
print(f"ROC_AUC: {roc_auc_score(y_val, y_prob)}")
print(f"AP: {average_precision_score(y_val,y_prob)}")
print(f"f1: {f1_score(y_val,y_pred)}")
print(f"fbeta: {fbeta_score(y_val,y_pred,beta=0.5)}")  

[[18137  1406]
 [ 4307  4158]]
Accuracy: 0.7960225649814339
Precision: 0.7473040977713875
Recall: 0.4911990549320732
ROC_AUC: 0.8337997308190922
AP: 0.7213709275530613
f1: 0.59277211490484
fbeta: 0.6767357833403861


The values are good, but some of the results in the standard approach of Capstone_main are better.

All in all, PCA gives some interesting results, in particular for K nearest neighbors, but the scores are not better or even slightly wors 
than in the standard approach of 
Capstone_main for the other algorithms.