In [246]:
import pandas as pd
import numpy as np
from scipy.stats import skew
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

pca = PCA()

In [247]:
# PCA on continuous numeric features

# Clean the data
#### This section will not be necessary once we have the fnalized, cleaned data

# Grab the raw data
train = pd.read_csv("train.csv", parse_dates=['timestamp']).drop(['id'],axis=1)
test_raw= pd.read_csv("test.csv", parse_dates=['timestamp']).drop(['id'],axis=1)
macro= pd.read_csv("macro.csv", parse_dates=['timestamp'])


# Merge the data (if we choose to merge a lag on the macro, this could be even better)
train['dataset'] = 'train'
test_raw['dataset'] = 'test'
df = pd.concat([train, test_raw])
df = pd.merge(df, macro, on = 'timestamp', how='left')


# Log transform skewed numeric features 
get_col = df.dtypes[(df.dtypes == "int64") | (df.dtypes == "float64")].index
get_skews = df[get_col].apply(lambda x: skew(x.dropna()))
get_skews = get_skews[get_skews>0.5]
get_skews = get_skews.index
df[get_skews] = np.log1p(df[get_skews])    


# select continuous numeric columns
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
num_df = df.select_dtypes(include=numerics)
num_df.shape # 364 features


# impute missing values with mean
df = num_df.apply(lambda x: x.fillna(x.mean()),axis=0) # newdf is the numeric columns


# scale the data frame
df = (df - df.mean()) / (df.max() - df.min())



# PC for market

In [248]:
# one ==> 96%, two ==> 98%

market = ['market_count_500','market_count_5000', 'market_count_2000',
          'market_count_1000','market_count_1500','market_count_3000']    
df_pc = df[market]


#set params & fit model
pca.set_params(n_components=1)
pca.fit(df_pc)


# find variance contained
print('Variance Ratio: ' + str(pca.explained_variance_ratio_) + ' & sum =  ' + str(sum(pca.explained_variance_ratio_)))
# give eigenvalues
  

# apply projection
num_df2 = pca.transform(df_pc)


# Join back with original data
num_df2 = pd.DataFrame(num_df2, columns = ['market_pc_1'])  # for first one, create a new dataframe
df = pd.concat([df, num_df2], axis = 1)

Variance Ratio: [ 0.71537854] & sum =  0.715378536052


# PC for cafes (not useful)

In [250]:
# # maybe not a good one  (4 to get over 90%)

# cafe = ['cafe_sum_500_min_price_avg', 'cafe_sum_500_max_price_avg',
#         'cafe_avg_price_500', 'cafe_sum_1000_min_price_avg','cafe_sum_1000_max_price_avg', 
#         'cafe_avg_price_1000', 'cafe_sum_1500_min_price_avg', 'cafe_sum_1500_max_price_avg', 
#         'cafe_avg_price_1500', 'cafe_sum_2000_min_price_avg', 'cafe_sum_2000_max_price_avg', 
#         'cafe_avg_price_2000', 'cafe_sum_3000_min_price_avg', 'cafe_sum_3000_max_price_avg',
#         'cafe_avg_price_3000',  'cafe_sum_5000_min_price_avg', 'cafe_sum_5000_max_price_avg',
#         'cafe_avg_price_5000','cafe_count_5000_price_high','cafe_count_500', 
#         'cafe_count_500_na_price','cafe_count_500_price_500', 'cafe_count_500_price_1000',
#         'cafe_count_500_price_1500', 'cafe_count_500_price_2500',
#         'cafe_count_500_price_4000', 'cafe_count_500_price_high', 'cafe_count_1000', 
#         'cafe_count_1000_na_price', 'cafe_count_1000_price_500',
#         'cafe_count_1000_price_1000', 'cafe_count_1000_price_1500',
#         'cafe_count_1000_price_2500', 'cafe_count_1000_price_4000',
#         'cafe_count_1000_price_high','cafe_count_1500', 'cafe_count_1500_na_price', 
#         'cafe_count_1500_price_500', 'cafe_count_1500_price_1000',
#         'cafe_count_1500_price_1500', 'cafe_count_1500_price_2500',
#         'cafe_count_1500_price_4000', 'cafe_count_1500_price_high', 'cafe_count_2000', 
#         'cafe_count_2000_na_price', 'cafe_count_2000_price_500',
#         'cafe_count_2000_price_1000', 'cafe_count_2000_price_1500',
#         'cafe_count_2000_price_2500', 'cafe_count_2000_price_4000',
#         'cafe_count_2000_price_high', 'cafe_count_3000', 'cafe_count_3000_na_price',
#         'cafe_count_3000_price_500', 'cafe_count_3000_price_1000',
#         'cafe_count_3000_price_1500', 'cafe_count_3000_price_2500',
#         'cafe_count_3000_price_4000', 'cafe_count_3000_price_high','cafe_count_5000',
#         'cafe_count_5000_na_price', 'cafe_count_5000_price_500',
#         'cafe_count_5000_price_1000', 'cafe_count_5000_price_1500',
#         'cafe_count_5000_price_2500', 'cafe_count_5000_price_4000', 'cafe_count_5000_price_high']

# df_pc = df[cafe]


# #set params & fit model
# pca.set_params(n_components=4)
# pca.fit(df_pc)

# # find variance contained
# print('Variance Ratio: ' + str(pca.explained_variance_ratio_) + ' & sum =  ' + str(sum(pca.explained_variance_ratio_)))
# # give eigenvalues
  

# # apply projection
# num_df2 = pca.transform(df_pc)


# # Join back with original data
# num_df2 = pd.DataFrame(num_df2)
# df  = pd.concat([df, num_df2], axis = 1)

# PC green

In [251]:
# 2 for 90%, 3 for 97%

green = ['green_part_500', 'green_part_1000','green_part_1500',
         'green_part_2000','green_part_3000','green_part_5000']


df_pc = df[green]


#set params & fit model
pca.set_params(n_components=3)
pca.fit(df_pc)

# find variance contained
print('Variance Ratio: ' + str(pca.explained_variance_ratio_) + ' & sum =  ' + str(sum(pca.explained_variance_ratio_)))
# give eigenvalues
  

# apply projection
num_df2 = pca.transform(df_pc)


# Join back with original data
num_df2 = pd.DataFrame(num_df2, columns = ['green_pc_1', 'green_pc_2', 'green_pc_3'])
df  = pd.concat([df, num_df2], axis = 1)

Variance Ratio: [ 0.59585927  0.29209367  0.07902603] & sum =  0.96697897201


# PC Industrial

In [252]:
# 2 for 89.7%, 3 for 96%

prom = ['prom_part_500','prom_part_1000','prom_part_1500',
        'prom_part_2000','prom_part_3000','prom_part_5000']

df_pc = df[prom]

#set params & fit model
pca.set_params(n_components=3)
pca.fit(df_pc)

# find variance contained
print('Variance Ratio: ' + str(pca.explained_variance_ratio_) + ' & sum =  ' + str(sum(pca.explained_variance_ratio_)))
# give eigenvalues
  

# apply projection
num_df2 = pca.transform(df_pc)


# Join back with original data
num_df2 = pd.DataFrame(num_df2, columns = ['prom_pc_1', 'prom_pc_2', 'prom_pc_3'])
df  = pd.concat([df, num_df2], axis = 1)

Variance Ratio: [ 0.71203615  0.168758    0.07204646] & sum =  0.952840608634


# PC office

In [253]:
# 3 for 90%, 4 for 95% 

office = ['office_count_500','office_sqm_500','office_count_1000',
          'office_sqm_1000','office_count_1500', 'office_sqm_1500',
          'office_count_2000','office_sqm_2000','office_count_3000',
          'office_sqm_3000','office_count_5000','office_sqm_5000']
  

df_pc = df[office]

#set params & fit model
pca.set_params(n_components=3)
pca.fit(df_pc)

# find variance contained
print('Variance Ratio: ' + str(pca.explained_variance_ratio_) + ' & sum =  ' + str(sum(pca.explained_variance_ratio_)))
# give eigenvalues
  

# apply projection
num_df2 = pca.transform(df_pc)


# Join back with original data
num_df2 = pd.DataFrame(num_df2, columns = ['office_pc_1', 'office_pc_2', 'office_pc_3'])
df  = pd.concat([df, num_df2], axis = 1)

Variance Ratio: [ 0.73173197  0.10750383  0.06000975] & sum =  0.899245554034


# PC cultural (not useful)

In [254]:
# # maybe not a good one, needs 8 for over 90%


# # grouped cultural characteristics
# cult_chars = ['sport_objects_raion', 'culture_objects_top_25_raion', 'shopping_centers_raion', 'park_km', 'fitness_km', 
#                 'swim_pool_km', 'ice_rink_km','stadium_km', 'basketball_km', 'shopping_centers_km', 'big_church_km',
#                 'church_synagogue_km', 'mosque_km', 'theater_km', 'museum_km', 'exhibition_km', 'catering_km']

# df_pc = df[cult_chars]


# pca.set_params(n_components=7)
# pca.fit(df_pc)

# print('Variance Ratio: ' + str(pca.explained_variance_ratio_) + ' & sum =  ' + str(sum(pca.explained_variance_ratio_)))  
# # give eigenvalues

# # df['cafe_pca'] = pca.transform(df_pc)

# PC shopping malls

In [255]:
# 3 for 91%, 4 for 95%

trc = ['trc_count_1000', 'trc_count_1500', 'trc_count_2000', 'trc_count_3000', 
       'trc_count_500', 'trc_count_5000', 'trc_sqm_1000', 'trc_sqm_1500',
       'trc_sqm_2000', 'trc_sqm_3000', 'trc_sqm_500', 'trc_sqm_5000', 'trc_count_1000', 'trc_sqm_1000']

df_pc = df[trc]


#set params & fit model
pca.set_params(n_components=3)
pca.fit(df_pc)


# find variance contained
print('Variance Ratio: ' + str(pca.explained_variance_ratio_) + ' & sum =  ' + str(sum(pca.explained_variance_ratio_)))
# give eigenvalues
  

# apply projection
num_df2 = pca.transform(df_pc)


# Join back with original data
num_df2 = pd.DataFrame(num_df2, columns=['trc_pc_1', 'trc_pc_2', 'trc_pc_3'])
df  = pd.concat([df, num_df2], axis = 1)

Variance Ratio: [ 0.69485217  0.12380419  0.07313241] & sum =  0.891788768601


# PC religious buildings

In [256]:
# 3 for 89.6%, 7 for 95%

church = ['big_church_count_500', 'church_count_500', 'mosque_count_500',
         'big_church_count_1000', 'church_count_1000', 'mosque_count_1000',
         'big_church_count_1500', 'church_count_1500', 'mosque_count_1500',
         'big_church_count_3000', 'church_count_3000', 'mosque_count_3000',
         'big_church_count_5000', 'church_count_5000', 'mosque_count_5000',
         'big_church_count_2000', 'church_count_2000', 'mosque_count_2000']

df_pc = df[church]


#set params & fit model
pca.set_params(n_components=3)
pca.fit(df_pc)


# find variance contained
print('Variance Ratio: ' + str(pca.explained_variance_ratio_) + ' & sum =  ' + str(sum(pca.explained_variance_ratio_)))
# give eigenvalues
  

# apply projection
num_df2 = pca.transform(df_pc)


# Join back with original data
num_df2 = pd.DataFrame(num_df2, columns = ['church_pc_1', 'church_pc_2', 'church_pc_3'])
df  = pd.concat([df, num_df2], axis = 1)

Variance Ratio: [ 0.55307382  0.15573631  0.08765663] & sum =  0.796466758515


# PC sports

In [257]:
# 2 for 93.8%, 3 for 96.7%

sport = ['sport_count_500','sport_count_1000','sport_count_2000', 
         'sport_count_5000','sport_count_1500','sport_count_3000']

df_pc = df[sport]


#set params & fit model
pca.set_params(n_components=2)
pca.fit(df_pc)


# find variance contained
print('Variance Ratio: ' + str(pca.explained_variance_ratio_) + ' & sum =  ' + str(sum(pca.explained_variance_ratio_)))
# give eigenvalues
  

# apply projection
num_df2 = pca.transform(df_pc)


# Join back with original data
num_df2 = pd.DataFrame(num_df2, columns = ['sport_pc_1', 'sport_pc_2'])
df  = pd.concat([df, num_df2], axis = 1)

Variance Ratio: [ 0.82110102  0.10705881] & sum =  0.928159827462


# PC leisure

In [258]:
# 2 for 95%

leisure = ['leisure_count_500','leisure_count_3000','leisure_count_1000',
           'leisure_count_1500','leisure_count_2000','leisure_count_5000']


df_pc = df[leisure]


#set params & fit model
pca.set_params(n_components=2)
pca.fit(df_pc)


# find variance contained
print('Variance Ratio: ' + str(pca.explained_variance_ratio_) + ' & sum =  ' + str(sum(pca.explained_variance_ratio_)))
# give eigenvalues
  

# apply projection
num_df2 = pca.transform(df_pc)


# Join back with original data
num_df2 = pd.DataFrame(num_df2, columns = ['leisure_pc_1', 'leisure_pc_2'])
df  = pd.concat([df, num_df2], axis = 1)

Variance Ratio: [ 0.83712373  0.09507349] & sum =  0.932197219427


# PC schools

In [259]:
# 2 for 92%, 3 for 95.1%

school = ['children_preschool', 'preschool_quota', 'preschool_education_centers_raion', 'children_school', 
                'school_quota', 'school_education_centers_raion', 'school_education_centers_top_20_raion', 
                'university_top_20_raion', 'additional_education_raion', 'additional_education_km', 'university_km']


df_pc = df[school]


#set params & fit model
pca.set_params(n_components=3)
pca.fit(df_pc)


# find variance contained
print('Variance Ratio: ' + str(pca.explained_variance_ratio_) + ' & sum =  ' + str(sum(pca.explained_variance_ratio_)))
# give eigenvalues
  

# apply projection
num_df2 = pca.transform(df_pc)


# Join back with original data
num_df2 = pd.DataFrame(num_df2, columns = ['school_pc_1', 'school_pc_2', 'school_pc_3'])
df  = pd.concat([df, num_df2], axis = 1)

Variance Ratio: [ 0.55939771  0.13198228  0.0900099 ] & sum =  0.78138989746


# PC cafe price

In [260]:
# 5 for 92%

cafe_price = ['cafe_sum_500_min_price_avg', 'cafe_sum_500_max_price_avg',
              'cafe_avg_price_500', 'cafe_sum_1000_min_price_avg','cafe_sum_1000_max_price_avg', 
              'cafe_avg_price_1000', 'cafe_sum_1500_min_price_avg', 'cafe_sum_1500_max_price_avg', 
              'cafe_avg_price_1500', 'cafe_sum_2000_min_price_avg', 'cafe_sum_2000_max_price_avg', 
              'cafe_avg_price_2000', 'cafe_sum_3000_min_price_avg', 'cafe_sum_3000_max_price_avg',
              'cafe_avg_price_3000',  'cafe_sum_5000_min_price_avg', 'cafe_sum_5000_max_price_avg',
              'cafe_avg_price_5000', 'cafe_count_5000_price_high']      

df_pc = df[cafe_price]


#set params & fit model
pca.set_params(n_components=5)
pca.fit(df_pc)


# find variance contained
print('Variance Ratio: ' + str(pca.explained_variance_ratio_) + ' & sum =  ' + str(sum(pca.explained_variance_ratio_)))
# give eigenvalues
  

# apply projection
num_df2 = pca.transform(df_pc)


# Join back with original data
num_df2 = pd.DataFrame(num_df2, 
                       columns = ['cafe_price_pc_1', 'cafe_price_pc2', 
                                  'cafe_price_pc_3', 'cafe_price_pc_4', 
                                  'cafe_price_pc_5'])
df  = pd.concat([df, num_df2], axis = 1)

Variance Ratio: [ 0.44775356  0.17121346  0.1518611   0.1001934   0.05726982] & sum =  0.928291332225


# PC cafe count

In [261]:
# 3 for 90.9%, 8 to get over 95%

cafe_count = ['cafe_count_500', 'cafe_count_500_na_price', 'cafe_count_500_price_500', 
              'cafe_count_500_price_1000', 'cafe_count_500_price_1500', 'cafe_count_500_price_2500',
              'cafe_count_500_price_4000', 'cafe_count_500_price_high', 'cafe_count_1000', 
              'cafe_count_1000_na_price', 'cafe_count_1000_price_500', 'cafe_count_1000_price_1000',
              'cafe_count_1000_price_1500', 'cafe_count_1000_price_2500', 'cafe_count_1000_price_4000',
              'cafe_count_1000_price_high','cafe_count_1500', 'cafe_count_1500_na_price', 'cafe_count_1500_price_500',
              'cafe_count_1500_price_1000', 'cafe_count_1500_price_1500', 'cafe_count_1500_price_2500',
              'cafe_count_1500_price_4000', 'cafe_count_1500_price_high', 'cafe_count_2000', 
              'cafe_count_2000_na_price', 'cafe_count_2000_price_500', 'cafe_count_2000_price_1000',
              'cafe_count_2000_price_1500', 'cafe_count_2000_price_2500', 'cafe_count_2000_price_4000',
              'cafe_count_2000_price_high', 'cafe_count_3000', 'cafe_count_3000_na_price',
              'cafe_count_3000_price_500', 'cafe_count_3000_price_1000', 'cafe_count_3000_price_1500', 
              'cafe_count_3000_price_2500', 'cafe_count_3000_price_4000', 'cafe_count_3000_price_high',
              'cafe_count_5000', 'cafe_count_5000_na_price', 'cafe_count_5000_price_500',
              'cafe_count_5000_price_1000', 'cafe_count_5000_price_1500', 'cafe_count_5000_price_2500', 
              'cafe_count_5000_price_4000', 'cafe_count_5000_price_high'] 

df_pc = df[cafe_count]


#set params & fit model
pca.set_params(n_components=3)
pca.fit(df_pc)


# find variance contained
print('Variance Ratio: ' + str(pca.explained_variance_ratio_) + ' & sum =  ' + str(sum(pca.explained_variance_ratio_)))
# give eigenvalues
  

# apply projection
num_df2 = pca.transform(df_pc)


# Join back with original data
num_df2 = pd.DataFrame(num_df2, columns = ['cafe_count_pc_1', 'cafe_count_pc_2', 'cafe_count_pc_3'])
df  = pd.concat([df, num_df2], axis = 1)

Variance Ratio: [ 0.77123331  0.07251101  0.03472481] & sum =  0.878469128477


# Drop Everything!

In [262]:
# im sure there was a better way to do this 


drop = ['green_part_500', 'green_part_1000','green_part_1500',
        'green_part_2000','green_part_3000','green_part_5000', 
        'prom_part_500','prom_part_1000','prom_part_1500',
        'prom_part_2000','prom_part_3000','prom_part_5000', 
        'office_count_500','office_sqm_500','office_count_1000',
        'office_sqm_1000','office_count_1500', 'office_sqm_1500',
        'office_count_2000','office_sqm_2000','office_count_3000',
        'office_sqm_3000','office_count_5000','office_sqm_5000', 
        'sport_objects_raion', 'culture_objects_top_25_raion', 'shopping_centers_raion', 'park_km', 'fitness_km', 
        'swim_pool_km', 'ice_rink_km','stadium_km', 'basketball_km', 'shopping_centers_km', 'big_church_km',
        'church_synagogue_km', 'mosque_km', 'theater_km', 'museum_km', 'exhibition_km', 'catering_km',
        'trc_count_1000', 'trc_count_1500', 'trc_count_2000', 'trc_count_3000', 
        'trc_count_500', 'trc_count_5000', 'trc_sqm_1000', 'trc_sqm_1500',
        'trc_sqm_2000', 'trc_sqm_3000', 'trc_sqm_500', 'trc_sqm_5000', 'trc_count_1000', 'trc_sqm_1000',
        'big_church_count_500', 'church_count_500', 'mosque_count_500',
        'big_church_count_1000', 'church_count_1000', 'mosque_count_1000',
        'big_church_count_1500', 'church_count_1500', 'mosque_count_1500',
        'big_church_count_3000', 'church_count_3000', 'mosque_count_3000',
        'big_church_count_5000', 'church_count_5000', 'mosque_count_5000',
        'big_church_count_2000', 'church_count_2000', 'mosque_count_2000',
        'sport_count_500','sport_count_1000','sport_count_2000', 
        'sport_count_5000','sport_count_1500','sport_count_3000',
        'leisure_count_500','leisure_count_3000','leisure_count_1000',
        'leisure_count_1500','leisure_count_2000','leisure_count_5000',
        'market_count_500','market_count_5000', 'market_count_2000',
        'market_count_1000','market_count_1500','market_count_3000',
        'children_preschool', 'preschool_quota', 'preschool_education_centers_raion', 'children_school', 
        'school_quota', 'school_education_centers_raion', 'school_education_centers_top_20_raion', 
        'university_top_20_raion', 'additional_education_raion', 'additional_education_km', 'university_km',
        'cafe_sum_500_min_price_avg', 'cafe_sum_500_max_price_avg',
        'cafe_avg_price_500', 'cafe_sum_1000_min_price_avg','cafe_sum_1000_max_price_avg', 
        'cafe_avg_price_1000', 'cafe_sum_1500_min_price_avg', 'cafe_sum_1500_max_price_avg', 
        'cafe_avg_price_1500', 'cafe_sum_2000_min_price_avg', 'cafe_sum_2000_max_price_avg', 
        'cafe_avg_price_2000', 'cafe_sum_3000_min_price_avg', 'cafe_sum_3000_max_price_avg',
        'cafe_avg_price_3000',  'cafe_sum_5000_min_price_avg', 'cafe_sum_5000_max_price_avg',
        'cafe_avg_price_5000','cafe_count_5000_price_high',
        'cafe_count_500', 'cafe_count_500_na_price',
        'cafe_count_500_price_500', 'cafe_count_500_price_1000',
        'cafe_count_500_price_1500', 'cafe_count_500_price_2500',
        'cafe_count_500_price_4000', 'cafe_count_500_price_high', 'cafe_count_1000', 
        'cafe_count_1000_na_price', 'cafe_count_1000_price_500',
        'cafe_count_1000_price_1000', 'cafe_count_1000_price_1500',
        'cafe_count_1000_price_2500', 'cafe_count_1000_price_4000',
        'cafe_count_1000_price_high','cafe_count_1500','cafe_count_1500_na_price',
        'cafe_count_1500_price_500', 'cafe_count_1500_price_1000',
        'cafe_count_1500_price_1500', 'cafe_count_1500_price_2500',
        'cafe_count_1500_price_4000', 'cafe_count_1500_price_high', 'cafe_count_2000', 
        'cafe_count_2000_na_price', 'cafe_count_2000_price_500',
        'cafe_count_2000_price_1000', 'cafe_count_2000_price_1500',
        'cafe_count_2000_price_2500', 'cafe_count_2000_price_4000',
        'cafe_count_2000_price_high', 'cafe_count_3000', 'cafe_count_3000_na_price',
        'cafe_count_3000_price_500', 'cafe_count_3000_price_1000',
        'cafe_count_3000_price_1500', 'cafe_count_3000_price_2500',
        'cafe_count_3000_price_4000', 'cafe_count_3000_price_high','cafe_count_5000',
        'cafe_count_5000_na_price', 'cafe_count_5000_price_500',
        'cafe_count_5000_price_1000', 'cafe_count_5000_price_1500',
        'cafe_count_5000_price_2500', 'cafe_count_5000_price_4000',
        'cafe_count_5000_price_high'] 


In [263]:
df = df.drop(df[drop], axis = 1)

df.to_csv('pca.csv')

# PC all components (not useful)

In [264]:
# pca = PCA()
# pca.set_params(n_components = 30) 
# pca.fit(num_df)

# print('Variance Ratio: ' + str(pca.explained_variance_ratio_) +
#       ' & sum = ' + str(sum(pca.explained_variance_ratio_)))

# num_df2 = pca.transform(num_df)
# print(num_df.shape)
# print(num_df2.shape)
# num_df
# pca.n_components_

In [265]:
# # print(df.shape)

# df_3 = df
# num_df2 = pd.DataFrame(num_df2) ;

# df_3  = pd.concat([df_3, num_df2], axis = 1)
# # it worked!

# df_3 = df_3.drop(df.select_dtypes(include=numerics), axis = 1)
# df_3.shape # all observations but only 44 variables of numeric;

In [266]:

# # Perform PCA
# pca = PCA()
# pca.set_params(n_components = 30)   # 30 was arbitrary due to high dimensions...feel free to change
# pca.fit(num_df)


# # Transform/ project observations onto loading vectors
# num_df2 = pca.transform(num_df)


# # Join back with original data
# num_df2 = pd.DataFrame(num_df2)
# df_pc  = pd.concat([df, num_df2], axis = 1)


# # Drop continuous numeric features in favor of the PCs
# df = df_pc.drop(df.select_dtypes(include=numerics), axis = 1)
