## Dimensionality Reduction

Objective: build a reduced dataset using Factor Analysis of Mixed Data (FAMD), which handles both numeric and categorical features, and evaluate whether it should be included as a step in the modeling pipelines.

In [1]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os

In [2]:
import sklearn 
# Display the installed scikit-learn version.
sklearn.__version__

'0.20.3'

In [67]:
from sklearn.decomposition import PCA
from yellowbrick.features.pca import PCADecomposition
from sklearn.preprocessing import StandardScaler
import prince
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [4]:
# Load the dataset from df.csv (relative path, resolved against the working directory).
df = pd.read_csv('df.csv')

In [5]:
# Show the (rows, columns) size of the loaded frame.
df.shape

(421095, 61)

In [6]:
# FAMD reducer keeping 5 components, with a fixed random_state for repeatable results.
famd = prince.FAMD(
    n_components=5,
    n_iter=3,
    copy=True,
    check_input=True,
    engine='auto',
    random_state=42,
)

In [7]:
# List the columns stored with object dtype (the string-valued features).
df.select_dtypes(include='object').columns

Index(['addr_state', 'emp_length', 'emp_title', 'grade', 'home_ownership',
       'purpose', 'sub_grade', 'title', 'verification_status', 'zip_code'],
      dtype='object')

In [8]:
# Replace missing values in every object-dtype column with the literal 'unknown'.
# The filled column is assigned back to the frame: calling fillna(inplace=True)
# on a column selection may operate on a temporary copy (and always does under
# pandas Copy-on-Write), which would leave `df` itself unchanged.
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].fillna('unknown')

In [9]:
# Inspect the distinct emp_length values after the 'unknown' fill.
df.emp_length.unique()

array(['10+ years', '1 year', '2 years', '< 1 year', '8 years', '5 years',
       '3 years', '9 years', '7 years', '4 years', '6 years', 'unknown'],
      dtype=object)

In [10]:
# Fill every NaN still present in the frame with the constant 1, modifying df in place.
# NOTE(review): the same value 1 is applied to all columns alike — confirm this is
# the intended imputation rather than a per-column statistic.
df.fillna(1, inplace=True)

In [11]:
#df['cat'] = le.fit_transform(df[cat].astype(str))

In [12]:
# Separate the feature matrix from the 'isdefault' target column.
X = df.drop(columns=['isdefault'])
y = df['isdefault']

In [13]:
# prince.FAMD derives from prince.MFA and inherits its methods
#famd = famd.fit(X)  # No need for 'Oak type'

In [14]:
#df_redu = famd.transform(X.drop('Oak type', axis='columns'))

In [15]:
#famd.eigenvalues_

In [113]:
import random

In [114]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split as tts

In [115]:
from sklearn.preprocessing import OneHotEncoder

In [116]:
# Synthetic binary dataset: 100 samples, 5 features (3 informative, 2 redundant),
# class weights 0.3 / 0.7, flip_y=0, and a fixed seed. This replaces the X and y
# previously taken from df.
X, y = make_classification(
    n_samples=100,
    n_features=5,
    n_informative=3,
    n_redundant=2,
    n_classes=2,
    n_clusters_per_class=1,
    weights=[0.3, 0.7],
    class_sep=2,
    flip_y=0,
    random_state=10,
)

In [117]:
# Wrap the generated feature array in a DataFrame so columns can be named and extended.
X = pd.DataFrame(X)

In [118]:
# Draw one random category label from 'a'..'e' per row of X.
# A dedicated, seeded generator keeps this synthetic column identical across kernel
# restarts; the module-level random.choice is unseeded, so each run gave different data.
cat_rng = random.Random(42)
lst = [cat_rng.choice("abcde") for _ in range(len(X))]

In [119]:
# Attach the random labels as a new 'cat' column; the Series gets a default 0..n-1
# index, which is what the assignment aligns on.
X['cat'] = pd.Series(lst)

In [120]:
# Name the five generated features num1..num5; the sixth column keeps the name 'cat'.
X.columns = ['num1', 'num2', 'num3', 'num4', 'num5', 'cat']

In [121]:
# Columns routed to each branch of the preprocessor.
numeric_cols = ['num1', 'num2', 'num3', 'num4', 'num5']
categorical_cols = ['cat']

# Numeric branch: impute with the column mean, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: impute with the most frequent value, then one-hot encode,
# ignoring categories that were not seen during fitting.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# remainder='passthrough' keeps any column not listed above instead of dropping it.
# Binary and label-binary branches are not wired in.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols),
    ],
    remainder='passthrough',
)

In [122]:
# FAMD step is currently disabled; the constructor call is kept for reference:
#famd = prince.FAMD(n_components=5,n_iter=3,copy=True,check_input=True,engine='auto',random_state=42)
# Seeded SMOTE oversampler and a 3-nearest-neighbours classifier.
smt = SMOTE(random_state=42)
knn = KNN(n_neighbors=3)

In [123]:
# Full prediction pipeline: preprocessing -> SMOTE oversampling -> KNN classifier.
# The final step reuses the existing `knn` estimator rather than constructing a
# second KNN with a duplicated n_neighbors value, so the classifier is configured
# in exactly one place.
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('smt', smt),
    ('knn', knn),
])

In [124]:
# Split into train and test sets with a fixed seed; no test_size is passed, so the
# library default fraction applies.
X_train, X_test, y_train, y_test = tts(X, y, random_state=42)

In [125]:
# Fit the whole pipeline (preprocessor, SMOTE, KNN) on the training split.
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       ...ki',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform'))])

In [129]:
# Score the fitted pipeline on the held-out split.
# NOTE(review): X and y here are the synthetic make_classification data
# (class_sep=2, flip_y=0), so this score says nothing about the loan data in df.
pipe.score(X_test,y_test)

1.0

In [131]:
# List the columns of df whose dtype is neither int nor float.
df.select_dtypes(exclude=['int', 'float']).columns

Index(['addr_state', 'emp_length', 'emp_title', 'grade', 'home_ownership',
       'purpose', 'sub_grade', 'title', 'verification_status', 'zip_code'],
      dtype='object')