In [100]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; the CSV paths used below live under this mount point.
drive.mount('/content/gdrive')  

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import normalize, StandardScaler
from fancyimpute import SoftImpute, BiScaler, KNN

In [0]:
def drop_rows_with_full_nan(x_train_t, y_train):
  """Drop rows whose NaN count equals the number of columns minus 12.

  The same index labels are removed from both the feature frame and the
  label series, and the two filtered objects are returned as a tuple.
  NOTE(review): the constant 12 presumably stands for the categorical
  columns that are always filled -- confirm against the callers.
  """
  nan_per_row = x_train_t.isna().sum(axis = 1)
  threshold = x_train_t.shape[1] - 12

  # Labels of the rows that hit the threshold exactly.
  to_drop = list(nan_per_row.index[nan_per_row == threshold])

  return x_train_t.drop(to_drop), y_train.drop(to_drop)

def _ratio_or_zero(numerator, denominator, *guards):
  """Divide element-wise, returning 0 wherever any of `guards` equals 0."""
  zero_mask = np.zeros(len(numerator), dtype = bool)
  for guard in guards:
    zero_mask |= np.asarray(guard == 0)
  return np.where(zero_mask, 0, numerator / denominator)

def engineer_features(data, start, end, drop = False):
  """Add financial ratio columns for every period in range(start, end).

  For each period i the columns prefixed 't<i>_' are read and ten new
  't<i>_*' columns are written: gross_margin, profit_margin, total_assets,
  return_on_assets, debt_ratio, return_on_net_assets, lichiditate_curenta,
  lichiditate_imediata, rentabilitate_economica and asset_turnover.
  A ratio is set to 0 where its guarding column(s) equal 0.

  Parameters:
    data: DataFrame holding the per-period source columns; it is modified
      in place.
    start, end: first period (inclusive) and last period (exclusive).
    drop: accepted for backward compatibility; no columns are dropped.

  Returns:
    The same `data` object, with the new columns added for all periods.
  """
  for i in range(start, end):
    prefix = 't' + str(i) + '_'
    turnover = data[prefix + 'cifra_de_afaceri_neta']
    net_profit = data[prefix + 'profit_net']
    fixed_assets = data[prefix + 'active_imobilizate']
    current_assets = data[prefix + 'active_circulante']
    debts = data[prefix + 'datorii']
    share_capital = data[prefix + 'capital_social']

    data[prefix + "gross_margin"] = _ratio_or_zero(data[prefix + 'profit_brut'], turnover, turnover)
    data[prefix + "profit_margin"] = _ratio_or_zero(net_profit, turnover, turnover)

    total_assets = fixed_assets + current_assets
    data[prefix + "total_assets"] = total_assets

    data[prefix + "return_on_assets"] = _ratio_or_zero(net_profit, total_assets, total_assets)
    data[prefix + "debt_ratio"] = _ratio_or_zero(debts, total_assets, total_assets)

    # Guard on the capital_social *column*; comparing the column name (a string)
    # with 0 is always False and would leave this guard inactive.
    data[prefix + "return_on_net_assets"] = _ratio_or_zero(
        net_profit, fixed_assets + share_capital, fixed_assets, share_capital)

    data[prefix + 'lichiditate_curenta'] = _ratio_or_zero(current_assets, debts, debts)
    data[prefix + 'lichiditate_imediata'] = _ratio_or_zero(
        current_assets - data[prefix + 'stocuri'], debts, debts)

    data[prefix + 'rentabilitate_economica'] = _ratio_or_zero(
        net_profit * 100, current_assets + fixed_assets, current_assets, fixed_assets)

    data[prefix + 'asset_turnover'] = _ratio_or_zero(
        data[prefix + 'venituri_total'], total_assets, total_assets)

  # Return only after every period has been processed; returning inside the
  # loop would stop after the first period.
  return data

In [103]:
# Load the two company tables from the mounted Drive
# (firme is labelled 0 and firmeii 1 later in the notebook).
firme = pd.read_csv('/content/gdrive/My Drive/Firme.csv')
firmeii = pd.read_csv('/content/gdrive/My Drive/FirmeInInsolventa.csv')

# Names of the firme columns that contain 't0' and 't4' respectively.
t0_cols = list(filter(lambda name: 't0' in name, firme.columns))
t4_cols = list(filter(lambda name: 't4' in name, firme.columns))

  interactivity=interactivity, compiler=compiler, result=result)


In [0]:
# Cells holding '-' are turned into real NaNs in both tables.
firme, firmeii = [frame.replace(to_replace = '-', value = np.nan) for frame in (firme, firmeii)]

### Fill NaN with mode for categorical features

In [0]:
# Fill missing categorical values with the column's mode. The filled column is
# assigned back to the frame: calling fillna(inplace = True) on a selected column
# acts on a temporary and does not update the frame under pandas Copy-on-Write.
for i in range(1, 5):
    prefix = 't' + str(i) + '_'
    for col in ['an', 'cod_caen', 'tip_activitate']:
        firme[prefix + col] = firme[prefix + col].fillna(firme[prefix + col].mode()[0])
        firmeii[prefix + col] = firmeii[prefix + col].fillna(firmeii[prefix + col].mode()[0])

# The t0 columns are filled for firme only.
for col in ['an', 'cod_caen', 'tip_activitate']:
    firme['t0_' + col] = firme['t0_' + col].fillna(firme['t0_' + col].mode()[0])

### Map string type categorical columns to numerical values

In [0]:
# Activity-type columns present in each table (firmeii has no t0 column here).
firme_act_cols = ['t' + str(period) + '_tip_activitate' for period in range(0, 5)]
firmeii_act_cols = ['t' + str(period) + '_tip_activitate' for period in range(1, 5)]

# Collect every distinct activity label across both tables.
act_labels = set()
for act_col in firme_act_cols:
    act_labels.update(firme[act_col])
for act_col in firmeii_act_cols:
    act_labels.update(firmeii[act_col])

# Sort before numbering: set iteration order for strings changes between kernel
# restarts, which would give a label a different integer code on every run.
acts = sorted(act_labels, key = str)
actmap = {act: code for code, act in enumerate(acts)}

# Replace the labels by their integer codes in both tables.
for act_col in firme_act_cols:
    firme[act_col] = firme[act_col].map(actmap)
for act_col in firmeii_act_cols:
    firmeii[act_col] = firmeii[act_col].map(actmap)

### Apply a train-val-test split (10% of `firme` held out for test, 40% of the remaining data for validation) and remove columns with more than 50% NaN values

In [0]:
# Rows from firme get label 0, rows from firmeii get label 1.
insolventa = np.zeros(firme.shape[0])
insolventa_ii = np.ones(firmeii.shape[0])
firme['insolventa'] = insolventa

X, y = firme[list(firme.columns)[:-1]], firme['insolventa']

# Hold out the last 10% of firme (no shuffling) as the test set. Test rows lose
# the t4 columns, training rows lose the t0 columns.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = .1, shuffle = False)
x_test = x_test.drop(t4_cols, axis = 1)

# ignore_index gives features and labels one shared, unique 0..n-1 index; without
# it the appended firmeii rows reuse index labels already used by the firme rows.
x_train = pd.concat([x_train.drop(t0_cols, axis = 1), firmeii], ignore_index = True)
y_train = pd.concat([y_train, pd.Series(insolventa_ii)], ignore_index = True)

# Drop the training columns that are more than 50% missing.
percent_missing = x_train.isnull().sum() * 100 / len(x_train)
missing_value_df = pd.DataFrame({'column_name': x_train.columns,
                                 'percent_missing': percent_missing})

x_train = x_train.drop(list(missing_value_df[missing_value_df['percent_missing'] > 50]['column_name']), axis = 1)

# Same rule, evaluated independently on the test columns.
percent_missing = x_test.isnull().sum() * 100 / len(x_test)
missing_value_df = pd.DataFrame({'column_name': x_test.columns,
                                 'percent_missing': percent_missing})

x_test = x_test.drop(list(missing_value_df[missing_value_df['percent_missing'] > 50]['column_name']), axis = 1)

# ShuffleSplit yields *positional* indices, so rows are selected with iloc.
# random_state makes the split reproducible across runs.
ss = ShuffleSplit(n_splits = 1, test_size = .4, random_state = 42)
train_positions, val_positions = next(ss.split(x_train, y_train))
elem = (list(train_positions), list(val_positions))

# Take the validation rows first, then shrink the training set to the remaining
# rows so that no validation row is left in the training data.
x_val, y_val = x_train.iloc[val_positions], y_train.iloc[val_positions]
x_train, y_train = x_train.iloc[train_positions], y_train.iloc[train_positions]

# Snapshots that each imputation section below starts from.
x_train_init, y_train_init, x_val_init, y_val_init, x_test_init, y_test_init = x_train.copy(), y_train.copy(), x_val.copy(), y_val.copy(), x_test.copy(), y_test.copy()

In [0]:
# Write the label series of each split to Drive, without the index.
for split_name, split_labels in (('train', y_train), ('val', y_val), ('test', y_test)):
    split_labels.to_csv('/content/gdrive/My Drive/An 1 Masters/Big Data/project data/y_' + split_name + '.csv', index = False)

## Impute and standardize/normalize data

### Mean imputation

In [0]:
# Start this section from fresh copies of the saved split snapshots.
x_train, y_train, x_val, y_val, x_test, y_test = [
    snapshot.copy()
    for snapshot in (x_train_init, y_train_init, x_val_init, y_val_init, x_test_init, y_test_init)
]

#### Train data

In [0]:
# Set the categorical columns aside so they are neither imputed nor scaled.
# Each one is kept as a single-column DataFrame with a fresh 0..n-1 index.
t1_an, t2_an, t3_an, t4_an = [
    x_train['t' + str(period) + '_an'].reset_index(drop = True).to_frame()
    for period in range(1, 5)
]
t1_tip_activitate, t2_tip_activitate, t3_tip_activitate, t4_tip_activitate = [
    x_train['t' + str(period) + '_tip_activitate'].reset_index(drop = True).to_frame()
    for period in range(1, 5)
]
t1_cod_caen, t2_cod_caen, t3_cod_caen, t4_cod_caen = [
    x_train['t' + str(period) + '_cod_caen'].reset_index(drop = True).to_frame()
    for period in range(1, 5)
]

x_train = x_train.drop(columns = [
    't' + str(period) + '_' + field
    for period in range(1, 5)
    for field in ('an', 'cod_caen', 'tip_activitate')
])

# Fill the remaining NaNs with the per-column mean.
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
x_train_mean = pd.DataFrame(data = imputer.fit_transform(x_train), columns = x_train.columns)

In [0]:
scaler = StandardScaler()
x_train_mean_scaled = pd.DataFrame(data = scaler.fit_transform(x_train_mean), columns = x_train_mean.columns)

# Re-attach the categorical columns that were set aside; the listing order fixes
# the column order of the saved CSV.
for restored_name, restored_values in [
    ('t1_an', t1_an), ('t2_an', t2_an), ('t3_an', t3_an), ('t4_an', t4_an),
    ('t1_tip_activitate', t1_tip_activitate), ('t2_tip_activitate', t2_tip_activitate),
    ('t3_tip_activitate', t3_tip_activitate), ('t4_tip_activitate', t4_tip_activitate),
    ('t1_cod_caen', t1_cod_caen), ('t2_cod_caen', t2_cod_caen),
    ('t3_cod_caen', t3_cod_caen), ('t4_cod_caen', t4_cod_caen),
]:
    x_train_mean_scaled[restored_name] = restored_values.to_numpy()

# Add the ratio features, then replace +inf / -inf by sys.maxsize / -sys.maxsize-1.
x_train_mean_scaled_fe = (
    engineer_features(x_train_mean_scaled, 1, 5)
    .replace([np.inf], sys.maxsize)
    .replace([-np.inf], -sys.maxsize-1)
)
x_train_mean_scaled_fe.to_csv('/content/gdrive/My Drive/An 1 Masters/Big Data/project data/x_train_mean_scaled_fe.csv', index = False)

In [0]:
# Normalized variant of the mean-imputed training data (sklearn `normalize` defaults).
x_train_mean_normalized = pd.DataFrame(data = normalize(x_train_mean), columns = x_train.columns)

# Re-attach the categorical columns that were set aside; the listing order fixes
# the column order of the saved CSV.
for restored_name, restored_values in [
    ('t1_an', t1_an), ('t2_an', t2_an), ('t3_an', t3_an), ('t4_an', t4_an),
    ('t1_tip_activitate', t1_tip_activitate), ('t2_tip_activitate', t2_tip_activitate),
    ('t3_tip_activitate', t3_tip_activitate), ('t4_tip_activitate', t4_tip_activitate),
    ('t1_cod_caen', t1_cod_caen), ('t2_cod_caen', t2_cod_caen),
    ('t3_cod_caen', t3_cod_caen), ('t4_cod_caen', t4_cod_caen),
]:
    x_train_mean_normalized[restored_name] = restored_values.to_numpy()

# Add the ratio features, then replace +inf / -inf by sys.maxsize / -sys.maxsize-1.
x_train_mean_normalized_fe = (
    engineer_features(x_train_mean_normalized, 1, 5)
    .replace([np.inf], sys.maxsize)
    .replace([-np.inf], -sys.maxsize-1)
)
x_train_mean_normalized_fe.to_csv('/content/gdrive/My Drive/An 1 Masters/Big Data/project data/x_train_mean_normalized_fe.csv', index = False)

#### Validation data

In [0]:
# Set the categorical columns of the validation frame aside so they are neither
# imputed nor scaled; each is kept as a single-column DataFrame with a 0..n-1 index.
t1_an, t2_an, t3_an, t4_an = [
    x_val['t' + str(period) + '_an'].reset_index(drop = True).to_frame()
    for period in range(1, 5)
]
t1_tip_activitate, t2_tip_activitate, t3_tip_activitate, t4_tip_activitate = [
    x_val['t' + str(period) + '_tip_activitate'].reset_index(drop = True).to_frame()
    for period in range(1, 5)
]
t1_cod_caen, t2_cod_caen, t3_cod_caen, t4_cod_caen = [
    x_val['t' + str(period) + '_cod_caen'].reset_index(drop = True).to_frame()
    for period in range(1, 5)
]

x_val = x_val.drop(columns = [
    't' + str(period) + '_' + field
    for period in range(1, 5)
    for field in ('an', 'cod_caen', 'tip_activitate')
])

# Fill the remaining NaNs with the per-column mean of the validation data.
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
x_val_mean = pd.DataFrame(data = imputer.fit_transform(x_val), columns = x_val.columns)

In [0]:
scaler = StandardScaler()
x_val_mean_scaled = pd.DataFrame(data = scaler.fit_transform(x_val_mean), columns = x_val_mean.columns)

# Re-attach the categorical columns that were set aside; the listing order fixes
# the column order of the saved CSV.
for restored_name, restored_values in [
    ('t1_an', t1_an), ('t2_an', t2_an), ('t3_an', t3_an), ('t4_an', t4_an),
    ('t1_tip_activitate', t1_tip_activitate), ('t2_tip_activitate', t2_tip_activitate),
    ('t3_tip_activitate', t3_tip_activitate), ('t4_tip_activitate', t4_tip_activitate),
    ('t1_cod_caen', t1_cod_caen), ('t2_cod_caen', t2_cod_caen),
    ('t3_cod_caen', t3_cod_caen), ('t4_cod_caen', t4_cod_caen),
]:
    x_val_mean_scaled[restored_name] = restored_values.to_numpy()

# Add the ratio features, then replace +inf / -inf by sys.maxsize / -sys.maxsize-1.
x_val_mean_scaled_fe = (
    engineer_features(x_val_mean_scaled, 1, 5)
    .replace([np.inf], sys.maxsize)
    .replace([-np.inf], -sys.maxsize-1)
)
x_val_mean_scaled_fe.to_csv('/content/gdrive/My Drive/An 1 Masters/Big Data/project data/x_val_mean_scaled_fe.csv', index = False)

In [0]:
# Normalized variant of the mean-imputed validation data (sklearn `normalize` defaults).
x_val_mean_normalized = pd.DataFrame(data = normalize(x_val_mean), columns = x_val.columns)

# Re-attach the categorical columns that were set aside; the listing order fixes
# the column order of the saved CSV.
for restored_name, restored_values in [
    ('t1_an', t1_an), ('t2_an', t2_an), ('t3_an', t3_an), ('t4_an', t4_an),
    ('t1_tip_activitate', t1_tip_activitate), ('t2_tip_activitate', t2_tip_activitate),
    ('t3_tip_activitate', t3_tip_activitate), ('t4_tip_activitate', t4_tip_activitate),
    ('t1_cod_caen', t1_cod_caen), ('t2_cod_caen', t2_cod_caen),
    ('t3_cod_caen', t3_cod_caen), ('t4_cod_caen', t4_cod_caen),
]:
    x_val_mean_normalized[restored_name] = restored_values.to_numpy()

# Add the ratio features, then replace +inf / -inf by sys.maxsize / -sys.maxsize-1.
x_val_mean_normalized_fe = (
    engineer_features(x_val_mean_normalized, 1, 5)
    .replace([np.inf], sys.maxsize)
    .replace([-np.inf], -sys.maxsize-1)
)
x_val_mean_normalized_fe.to_csv('/content/gdrive/My Drive/An 1 Masters/Big Data/project data/x_val_mean_normalized_fe.csv', index = False)

#### Test data

In [0]:
# Set the categorical columns of the test frame (periods t0..t3) aside so they are
# neither imputed nor scaled; each is kept as a single-column DataFrame with a 0..n-1 index.
t0_an, t1_an, t2_an, t3_an = [
    x_test['t' + str(period) + '_an'].reset_index(drop = True).to_frame()
    for period in range(0, 4)
]
t0_tip_activitate, t1_tip_activitate, t2_tip_activitate, t3_tip_activitate = [
    x_test['t' + str(period) + '_tip_activitate'].reset_index(drop = True).to_frame()
    for period in range(0, 4)
]
t0_cod_caen, t1_cod_caen, t2_cod_caen, t3_cod_caen = [
    x_test['t' + str(period) + '_cod_caen'].reset_index(drop = True).to_frame()
    for period in range(0, 4)
]

x_test = x_test.drop(columns = [
    't' + str(period) + '_' + field
    for period in (1, 2, 3, 0)
    for field in ('an', 'cod_caen', 'tip_activitate')
])

# Fill the remaining NaNs with the per-column mean of the test data.
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
x_test_mean = pd.DataFrame(data = imputer.fit_transform(x_test), columns = x_test.columns)

In [0]:
scaler = StandardScaler()
x_test_mean_scaled = pd.DataFrame(data = scaler.fit_transform(x_test_mean), columns = x_test_mean.columns)

# Re-attach the categorical columns that were set aside; the listing order
# (t1, t2, t3, then t0) fixes the column order of the saved CSV.
for restored_name, restored_values in [
    ('t1_an', t1_an), ('t2_an', t2_an), ('t3_an', t3_an), ('t0_an', t0_an),
    ('t1_tip_activitate', t1_tip_activitate), ('t2_tip_activitate', t2_tip_activitate),
    ('t3_tip_activitate', t3_tip_activitate), ('t0_tip_activitate', t0_tip_activitate),
    ('t1_cod_caen', t1_cod_caen), ('t2_cod_caen', t2_cod_caen),
    ('t3_cod_caen', t3_cod_caen), ('t0_cod_caen', t0_cod_caen),
]:
    x_test_mean_scaled[restored_name] = restored_values.to_numpy()

# Add the ratio features, then replace +inf / -inf by sys.maxsize / -sys.maxsize-1.
x_test_mean_scaled_fe = (
    engineer_features(x_test_mean_scaled, 0, 4)
    .replace([np.inf], sys.maxsize)
    .replace([-np.inf], -sys.maxsize-1)
)
x_test_mean_scaled_fe.to_csv('/content/gdrive/My Drive/An 1 Masters/Big Data/project data/x_test_mean_scaled_fe.csv', index = False)

In [0]:
# Normalized variant of the mean-imputed test data (sklearn `normalize` defaults).
x_test_mean_normalized = pd.DataFrame(data = normalize(x_test_mean), columns = x_test_mean.columns)

# Re-attach the categorical columns that were set aside; the listing order
# (t1, t2, t3, then t0) fixes the column order of the saved CSV.
for restored_name, restored_values in [
    ('t1_an', t1_an), ('t2_an', t2_an), ('t3_an', t3_an), ('t0_an', t0_an),
    ('t1_tip_activitate', t1_tip_activitate), ('t2_tip_activitate', t2_tip_activitate),
    ('t3_tip_activitate', t3_tip_activitate), ('t0_tip_activitate', t0_tip_activitate),
    ('t1_cod_caen', t1_cod_caen), ('t2_cod_caen', t2_cod_caen),
    ('t3_cod_caen', t3_cod_caen), ('t0_cod_caen', t0_cod_caen),
]:
    x_test_mean_normalized[restored_name] = restored_values.to_numpy()

# Add the ratio features, then replace +inf / -inf by sys.maxsize / -sys.maxsize-1.
x_test_mean_normalized_fe = (
    engineer_features(x_test_mean_normalized, 0, 4)
    .replace([np.inf], sys.maxsize)
    .replace([-np.inf], -sys.maxsize-1)
)
x_test_mean_normalized_fe.to_csv('/content/gdrive/My Drive/An 1 Masters/Big Data/project data/x_test_mean_normalized_fe.csv', index = False)

### Median imputation

In [0]:
# Start this section from fresh copies of the saved split snapshots.
x_train, y_train, x_val, y_val, x_test, y_test = [
    snapshot.copy()
    for snapshot in (x_train_init, y_train_init, x_val_init, y_val_init, x_test_init, y_test_init)
]

#### Train data

In [0]:
# Set the categorical columns aside so they are neither imputed nor scaled.
# Each one is kept as a single-column DataFrame with a fresh 0..n-1 index.
t1_an, t2_an, t3_an, t4_an = [
    x_train['t' + str(period) + '_an'].reset_index(drop = True).to_frame()
    for period in range(1, 5)
]
t1_tip_activitate, t2_tip_activitate, t3_tip_activitate, t4_tip_activitate = [
    x_train['t' + str(period) + '_tip_activitate'].reset_index(drop = True).to_frame()
    for period in range(1, 5)
]
t1_cod_caen, t2_cod_caen, t3_cod_caen, t4_cod_caen = [
    x_train['t' + str(period) + '_cod_caen'].reset_index(drop = True).to_frame()
    for period in range(1, 5)
]

x_train = x_train.drop(columns = [
    't' + str(period) + '_' + field
    for period in range(1, 5)
    for field in ('an', 'cod_caen', 'tip_activitate')
])

# Fill the remaining NaNs with the per-column median.
imputer = SimpleImputer(missing_values = np.nan, strategy = 'median')
x_train_median = pd.DataFrame(data = imputer.fit_transform(x_train), columns = x_train.columns)

In [0]:
scaler = StandardScaler()
x_train_median_scaled = pd.DataFrame(data = scaler.fit_transform(x_train_median), columns = x_train_median.columns)

# Re-attach the categorical columns that were set aside; the listing order fixes
# the column order of the saved CSV.
for restored_name, restored_values in [
    ('t1_an', t1_an), ('t2_an', t2_an), ('t3_an', t3_an), ('t4_an', t4_an),
    ('t1_tip_activitate', t1_tip_activitate), ('t2_tip_activitate', t2_tip_activitate),
    ('t3_tip_activitate', t3_tip_activitate), ('t4_tip_activitate', t4_tip_activitate),
    ('t1_cod_caen', t1_cod_caen), ('t2_cod_caen', t2_cod_caen),
    ('t3_cod_caen', t3_cod_caen), ('t4_cod_caen', t4_cod_caen),
]:
    x_train_median_scaled[restored_name] = restored_values.to_numpy()

# Add the ratio features, then replace +inf / -inf by sys.maxsize / -sys.maxsize-1.
x_train_median_scaled_fe = (
    engineer_features(x_train_median_scaled, 1, 5)
    .replace([np.inf], sys.maxsize)
    .replace([-np.inf], -sys.maxsize-1)
)
x_train_median_scaled_fe.to_csv('/content/gdrive/My Drive/An 1 Masters/Big Data/project data/x_train_median_scaled_fe.csv', index = False)

In [0]:
# Normalized variant of the median-imputed training data (sklearn `normalize` defaults).
x_train_median_normalized = pd.DataFrame(data = normalize(x_train_median), columns = x_train_median.columns)

# Re-attach the categorical columns that were set aside; the listing order fixes
# the column order of the saved CSV.
for restored_name, restored_values in [
    ('t1_an', t1_an), ('t2_an', t2_an), ('t3_an', t3_an), ('t4_an', t4_an),
    ('t1_tip_activitate', t1_tip_activitate), ('t2_tip_activitate', t2_tip_activitate),
    ('t3_tip_activitate', t3_tip_activitate), ('t4_tip_activitate', t4_tip_activitate),
    ('t1_cod_caen', t1_cod_caen), ('t2_cod_caen', t2_cod_caen),
    ('t3_cod_caen', t3_cod_caen), ('t4_cod_caen', t4_cod_caen),
]:
    x_train_median_normalized[restored_name] = restored_values.to_numpy()

# Add the ratio features, then replace +inf / -inf by sys.maxsize / -sys.maxsize-1.
x_train_median_normalized_fe = (
    engineer_features(x_train_median_normalized, 1, 5)
    .replace([np.inf], sys.maxsize)
    .replace([-np.inf], -sys.maxsize-1)
)
x_train_median_normalized_fe.to_csv('/content/gdrive/My Drive/An 1 Masters/Big Data/project data/x_train_median_normalized_fe.csv', index = False)

#### Validation data

In [0]:
# Set the categorical columns of the validation frame aside so they are neither
# imputed nor scaled; each is kept as a single-column DataFrame with a 0..n-1 index.
t1_an, t2_an, t3_an, t4_an = [
    x_val['t' + str(period) + '_an'].reset_index(drop = True).to_frame()
    for period in range(1, 5)
]
t1_tip_activitate, t2_tip_activitate, t3_tip_activitate, t4_tip_activitate = [
    x_val['t' + str(period) + '_tip_activitate'].reset_index(drop = True).to_frame()
    for period in range(1, 5)
]
t1_cod_caen, t2_cod_caen, t3_cod_caen, t4_cod_caen = [
    x_val['t' + str(period) + '_cod_caen'].reset_index(drop = True).to_frame()
    for period in range(1, 5)
]

x_val = x_val.drop(columns = [
    't' + str(period) + '_' + field
    for period in range(1, 5)
    for field in ('an', 'cod_caen', 'tip_activitate')
])

# This is the median-imputation section, so the strategy must be 'median'
# (as it is for the train and test data of this section), not 'mean'.
imputer = SimpleImputer(missing_values = np.nan, strategy = 'median')
x_val_median = pd.DataFrame(data = imputer.fit_transform(x_val), columns = x_val.columns)

In [0]:
scaler = StandardScaler()
x_val_median_scaled = pd.DataFrame(data = scaler.fit_transform(x_val_median), columns = x_val_median.columns)

# Re-attach the categorical columns that were set aside; the listing order fixes
# the column order of the saved CSV.
for restored_name, restored_values in [
    ('t1_an', t1_an), ('t2_an', t2_an), ('t3_an', t3_an), ('t4_an', t4_an),
    ('t1_tip_activitate', t1_tip_activitate), ('t2_tip_activitate', t2_tip_activitate),
    ('t3_tip_activitate', t3_tip_activitate), ('t4_tip_activitate', t4_tip_activitate),
    ('t1_cod_caen', t1_cod_caen), ('t2_cod_caen', t2_cod_caen),
    ('t3_cod_caen', t3_cod_caen), ('t4_cod_caen', t4_cod_caen),
]:
    x_val_median_scaled[restored_name] = restored_values.to_numpy()

# Add the ratio features, then replace +inf / -inf by sys.maxsize / -sys.maxsize-1.
x_val_median_scaled_fe = (
    engineer_features(x_val_median_scaled, 1, 5)
    .replace([np.inf], sys.maxsize)
    .replace([-np.inf], -sys.maxsize-1)
)
x_val_median_scaled_fe.to_csv('/content/gdrive/My Drive/An 1 Masters/Big Data/project data/x_val_median_scaled_fe.csv', index = False)

In [0]:
# Normalize the *median*-imputed validation data. Using x_val_mean here would
# write the mean-imputed data into the file named ..._median_normalized_fe.csv.
x_val_median_normalized = pd.DataFrame(data = normalize(x_val_median), columns = x_val_median.columns)

# Re-attach the categorical columns that were set aside; the listing order fixes
# the column order of the saved CSV.
for restored_name, restored_values in [
    ('t1_an', t1_an), ('t2_an', t2_an), ('t3_an', t3_an), ('t4_an', t4_an),
    ('t1_tip_activitate', t1_tip_activitate), ('t2_tip_activitate', t2_tip_activitate),
    ('t3_tip_activitate', t3_tip_activitate), ('t4_tip_activitate', t4_tip_activitate),
    ('t1_cod_caen', t1_cod_caen), ('t2_cod_caen', t2_cod_caen),
    ('t3_cod_caen', t3_cod_caen), ('t4_cod_caen', t4_cod_caen),
]:
    x_val_median_normalized[restored_name] = restored_values.to_numpy()

# Add the ratio features, then replace +inf / -inf by sys.maxsize / -sys.maxsize-1.
x_val_median_normalized_fe = (
    engineer_features(x_val_median_normalized, 1, 5)
    .replace([np.inf], sys.maxsize)
    .replace([-np.inf], -sys.maxsize-1)
)
x_val_median_normalized_fe.to_csv('/content/gdrive/My Drive/An 1 Masters/Big Data/project data/x_val_median_normalized_fe.csv', index = False)

#### Test data

In [0]:
# Set the categorical columns of the test frame (periods t0..t3) aside so they are
# neither imputed nor scaled; each is kept as a single-column DataFrame with a 0..n-1 index.
t0_an, t1_an, t2_an, t3_an = [
    x_test['t' + str(period) + '_an'].reset_index(drop = True).to_frame()
    for period in range(0, 4)
]
t0_tip_activitate, t1_tip_activitate, t2_tip_activitate, t3_tip_activitate = [
    x_test['t' + str(period) + '_tip_activitate'].reset_index(drop = True).to_frame()
    for period in range(0, 4)
]
t0_cod_caen, t1_cod_caen, t2_cod_caen, t3_cod_caen = [
    x_test['t' + str(period) + '_cod_caen'].reset_index(drop = True).to_frame()
    for period in range(0, 4)
]

x_test = x_test.drop(columns = [
    't' + str(period) + '_' + field
    for period in (1, 2, 3, 0)
    for field in ('an', 'cod_caen', 'tip_activitate')
])

# Fill the remaining NaNs with the per-column median of the test data.
imputer = SimpleImputer(missing_values = np.nan, strategy = 'median')
x_test_median = pd.DataFrame(data = imputer.fit_transform(x_test), columns = x_test.columns)

In [0]:
scaler = StandardScaler()
x_test_median_scaled = pd.DataFrame(data = scaler.fit_transform(x_test_median), columns = x_test_median.columns)

# Re-attach the categorical columns that were set aside; the listing order
# (t1, t2, t3, then t0) fixes the column order of the saved CSV.
for restored_name, restored_values in [
    ('t1_an', t1_an), ('t2_an', t2_an), ('t3_an', t3_an), ('t0_an', t0_an),
    ('t1_tip_activitate', t1_tip_activitate), ('t2_tip_activitate', t2_tip_activitate),
    ('t3_tip_activitate', t3_tip_activitate), ('t0_tip_activitate', t0_tip_activitate),
    ('t1_cod_caen', t1_cod_caen), ('t2_cod_caen', t2_cod_caen),
    ('t3_cod_caen', t3_cod_caen), ('t0_cod_caen', t0_cod_caen),
]:
    x_test_median_scaled[restored_name] = restored_values.to_numpy()

# Add the ratio features, then replace +inf / -inf by sys.maxsize / -sys.maxsize-1.
x_test_median_scaled_fe = (
    engineer_features(x_test_median_scaled, 0, 4)
    .replace([np.inf], sys.maxsize)
    .replace([-np.inf], -sys.maxsize-1)
)
x_test_median_scaled_fe.to_csv('/content/gdrive/My Drive/An 1 Masters/Big Data/project data/x_test_median_scaled_fe.csv', index = False)

In [0]:
# Row-normalize the MEDIAN-imputed test features. This cell previously read
# the mean-imputed frame (`x_test_mean`), so the file saved under the "median"
# name did not contain median-imputed data.
x_test_median_normalized = pd.DataFrame(data = normalize(x_test_median), columns = x_test.columns)

# Put the held-out metadata columns back, un-normalized, in the same column
# order as the scaled variant of this dataset.
for column_name, column_frame in [
    ('t1_an', t1_an), ('t2_an', t2_an), ('t3_an', t3_an), ('t0_an', t0_an),
    ('t1_tip_activitate', t1_tip_activitate), ('t2_tip_activitate', t2_tip_activitate),
    ('t3_tip_activitate', t3_tip_activitate), ('t0_tip_activitate', t0_tip_activitate),
    ('t1_cod_caen', t1_cod_caen), ('t2_cod_caen', t2_cod_caen),
    ('t3_cod_caen', t3_cod_caen), ('t0_cod_caen', t0_cod_caen),
]:
    x_test_median_normalized[column_name] = column_frame.to_numpy()

# Add the derived ratio columns for periods t0..t3, then clamp +/-inf to the
# platform integer limits before saving.
x_test_median_normalized_fe = engineer_features(x_test_median_normalized, 0, 4)
x_test_median_normalized_fe = x_test_median_normalized_fe.replace([np.inf], sys.maxsize)
x_test_median_normalized_fe = x_test_median_normalized_fe.replace([-np.inf], -sys.maxsize-1)
x_test_median_normalized_fe.to_csv('/content/gdrive/My Drive/An 1 Masters/Big Data/project data/x_test_median_normalized_fe.csv', index = False)

### KNN imputation

In [0]:
# Start the KNN section from fresh copies of the initial splits.
x_train, x_val, x_test = x_train_init.copy(), x_val_init.copy(), x_test_init.copy()
y_train, y_val, y_test = y_train_init.copy(), y_val_init.copy(), y_test_init.copy()

#### Train data

In [0]:
# Set aside the metadata columns of every period as single-column frames with
# a fresh 0-based index; they skip imputation and are re-attached by position.
t1_an, t2_an, t3_an, t4_an = (
    x_train[['t{}_an'.format(period)]].reset_index(drop=True)
    for period in (1, 2, 3, 4)
)
t1_tip_activitate, t2_tip_activitate, t3_tip_activitate, t4_tip_activitate = (
    x_train[['t{}_tip_activitate'.format(period)]].reset_index(drop=True)
    for period in (1, 2, 3, 4)
)
t1_cod_caen, t2_cod_caen, t3_cod_caen, t4_cod_caen = (
    x_train[['t{}_cod_caen'.format(period)]].reset_index(drop=True)
    for period in (1, 2, 3, 4)
)

# One frame per period: every column whose name contains the period tag,
# minus that period's three metadata columns.
x_train_t1, x_train_t2, x_train_t3, x_train_t4 = (
    x_train.loc[:, [col for col in x_train.columns if tag in col]]
           .drop(columns=[tag + '_an', tag + '_cod_caen', tag + '_tip_activitate'])
    for tag in ('t1', 't2', 't3', 't4')
)

In [0]:
# Impute the t1 features in consecutive row chunks, each with its own
# 3-neighbour KNN imputer (presumably chunked to bound memory use -- TODO confirm).
x_train_t1_1_knn, x_train_t1_2_knn, x_train_t1_3_knn, x_train_t1_4_knn = (
    KNN(k=3, verbose=False).fit_transform(x_train_t1[start:stop])
    for start, stop in ((None, 30000), (30000, 60000), (60000, 90000), (90000, None))
)

# Stack the chunks back in row order. Index handling is left at the pandas
# default, so every chunk keeps its own 0-based index.
x_train_t1 = pd.concat([
    pd.DataFrame(columns=x_train_t1.columns, data=imputed_chunk)
    for imputed_chunk in (x_train_t1_1_knn, x_train_t1_2_knn, x_train_t1_3_knn, x_train_t1_4_knn)
])

In [0]:
# Impute the t2 features in consecutive row chunks, each with its own
# 3-neighbour KNN imputer (presumably chunked to bound memory use -- TODO confirm).
x_train_t2_1_knn, x_train_t2_2_knn, x_train_t2_3_knn, x_train_t2_4_knn = (
    KNN(k=3, verbose=False).fit_transform(x_train_t2[start:stop])
    for start, stop in ((None, 30000), (30000, 60000), (60000, 90000), (90000, None))
)

# Stack the chunks back in row order. Index handling is left at the pandas
# default, so every chunk keeps its own 0-based index.
x_train_t2 = pd.concat([
    pd.DataFrame(columns=x_train_t2.columns, data=imputed_chunk)
    for imputed_chunk in (x_train_t2_1_knn, x_train_t2_2_knn, x_train_t2_3_knn, x_train_t2_4_knn)
])

In [0]:
# Impute the t3 features in consecutive row chunks, each with its own
# 3-neighbour KNN imputer (presumably chunked to bound memory use -- TODO confirm).
x_train_t3_1_knn, x_train_t3_2_knn, x_train_t3_3_knn, x_train_t3_4_knn = (
    KNN(k=3, verbose=False).fit_transform(x_train_t3[start:stop])
    for start, stop in ((None, 30000), (30000, 60000), (60000, 90000), (90000, None))
)

# Stack the chunks back in row order. Index handling is left at the pandas
# default, so every chunk keeps its own 0-based index.
x_train_t3 = pd.concat([
    pd.DataFrame(columns=x_train_t3.columns, data=imputed_chunk)
    for imputed_chunk in (x_train_t3_1_knn, x_train_t3_2_knn, x_train_t3_3_knn, x_train_t3_4_knn)
])

In [0]:
# Impute the t4 features in consecutive row chunks, each with its own
# 3-neighbour KNN imputer (presumably chunked to bound memory use -- TODO confirm).
x_train_t4_1_knn, x_train_t4_2_knn, x_train_t4_3_knn, x_train_t4_4_knn = (
    KNN(k=3, verbose=False).fit_transform(x_train_t4[start:stop])
    for start, stop in ((None, 30000), (30000, 60000), (60000, 90000), (90000, None))
)

# Stack the chunks back in row order. Index handling is left at the pandas
# default, so every chunk keeps its own 0-based index.
x_train_t4 = pd.concat([
    pd.DataFrame(columns=x_train_t4.columns, data=imputed_chunk)
    for imputed_chunk in (x_train_t4_1_knn, x_train_t4_2_knn, x_train_t4_3_knn, x_train_t4_4_knn)
])

In [0]:
# Join the four imputed per-period frames side by side (t1, t2, t3, t4).
x_train_knn = pd.concat(
    [x_train_t1, x_train_t2, x_train_t3, x_train_t4],
    axis='columns',
)

In [0]:
# Standardize the KNN-imputed training features.
scaler = StandardScaler()
x_train_knn_scaled = pd.DataFrame(
    columns=x_train_knn.columns,
    data=scaler.fit_transform(x_train_knn),
)

# Put the held-out metadata columns back, unscaled, in period order t1..t4.
for column_name, column_frame in [
    ('t1_an', t1_an), ('t2_an', t2_an), ('t3_an', t3_an), ('t4_an', t4_an),
    ('t1_tip_activitate', t1_tip_activitate), ('t2_tip_activitate', t2_tip_activitate),
    ('t3_tip_activitate', t3_tip_activitate), ('t4_tip_activitate', t4_tip_activitate),
    ('t1_cod_caen', t1_cod_caen), ('t2_cod_caen', t2_cod_caen),
    ('t3_cod_caen', t3_cod_caen), ('t4_cod_caen', t4_cod_caen),
]:
    x_train_knn_scaled[column_name] = column_frame.to_numpy()

In [0]:
# Save the standardized KNN-imputed training features without the row index.
x_train_knn_scaled.to_csv(
    '/content/gdrive/My Drive/An 1 Masters/Big Data/project data/x_train_knn_scaled.csv',
    index=False,
)

In [0]:
# Normalize the KNN-imputed training features with sklearn's `normalize`
# (default settings).
x_train_knn_normalized = pd.DataFrame(
    columns=x_train_knn.columns,
    data=normalize(x_train_knn),
)

# Put the held-out metadata columns back, un-normalized, in period order t1..t4.
for column_name, column_frame in [
    ('t1_an', t1_an), ('t2_an', t2_an), ('t3_an', t3_an), ('t4_an', t4_an),
    ('t1_tip_activitate', t1_tip_activitate), ('t2_tip_activitate', t2_tip_activitate),
    ('t3_tip_activitate', t3_tip_activitate), ('t4_tip_activitate', t4_tip_activitate),
    ('t1_cod_caen', t1_cod_caen), ('t2_cod_caen', t2_cod_caen),
    ('t3_cod_caen', t3_cod_caen), ('t4_cod_caen', t4_cod_caen),
]:
    x_train_knn_normalized[column_name] = column_frame.to_numpy()

In [0]:
# Save the normalized KNN-imputed training features without the row index.
x_train_knn_normalized.to_csv(
    '/content/gdrive/My Drive/An 1 Masters/Big Data/project data/x_train_knn_normalized.csv',
    index=False,
)

#### Validation data

In [0]:
# Set aside the metadata columns of every period as single-column frames with
# a fresh 0-based index; they skip imputation and are re-attached by position.
t1_an, t2_an, t3_an, t4_an = (
    x_val[['t{}_an'.format(period)]].reset_index(drop=True)
    for period in (1, 2, 3, 4)
)
t1_tip_activitate, t2_tip_activitate, t3_tip_activitate, t4_tip_activitate = (
    x_val[['t{}_tip_activitate'.format(period)]].reset_index(drop=True)
    for period in (1, 2, 3, 4)
)
t1_cod_caen, t2_cod_caen, t3_cod_caen, t4_cod_caen = (
    x_val[['t{}_cod_caen'.format(period)]].reset_index(drop=True)
    for period in (1, 2, 3, 4)
)

# One frame per period: every column whose name contains the period tag,
# minus that period's three metadata columns.
x_val_t1, x_val_t2, x_val_t3, x_val_t4 = (
    x_val.loc[:, [col for col in x_val.columns if tag in col]]
         .drop(columns=[tag + '_an', tag + '_cod_caen', tag + '_tip_activitate'])
    for tag in ('t1', 't2', 't3', 't4')
)

In [0]:
# Impute the t1 validation features in two row chunks, each with its own
# 3-neighbour KNN imputer.
x_val_t1_1_knn, x_val_t1_2_knn = (
    KNN(k=3, verbose=False).fit_transform(x_val_t1[start:stop])
    for start, stop in ((None, 30000), (30000, None))
)

# Stack the chunks back in row order; each chunk keeps its own 0-based index.
x_val_t1 = pd.concat([
    pd.DataFrame(columns=x_val_t1.columns, data=imputed_chunk)
    for imputed_chunk in (x_val_t1_1_knn, x_val_t1_2_knn)
])

In [0]:
# Impute the t2 validation features in two row chunks, each with its own
# 3-neighbour KNN imputer.
x_val_t2_1_knn, x_val_t2_2_knn = (
    KNN(k=3, verbose=False).fit_transform(x_val_t2[start:stop])
    for start, stop in ((None, 30000), (30000, None))
)

# Stack the chunks back in row order; each chunk keeps its own 0-based index.
x_val_t2 = pd.concat([
    pd.DataFrame(columns=x_val_t2.columns, data=imputed_chunk)
    for imputed_chunk in (x_val_t2_1_knn, x_val_t2_2_knn)
])

In [0]:
# Impute the t3 validation features in two row chunks, each with its own
# 3-neighbour KNN imputer.
x_val_t3_1_knn, x_val_t3_2_knn = (
    KNN(k=3, verbose=False).fit_transform(x_val_t3[start:stop])
    for start, stop in ((None, 30000), (30000, None))
)

# Stack the chunks back in row order; each chunk keeps its own 0-based index.
x_val_t3 = pd.concat([
    pd.DataFrame(columns=x_val_t3.columns, data=imputed_chunk)
    for imputed_chunk in (x_val_t3_1_knn, x_val_t3_2_knn)
])

In [0]:
# Impute the t4 validation features in two row chunks, each with its own
# 3-neighbour KNN imputer.
x_val_t4_1_knn, x_val_t4_2_knn = (
    KNN(k=3, verbose=False).fit_transform(x_val_t4[start:stop])
    for start, stop in ((None, 30000), (30000, None))
)

# Stack the chunks back in row order; each chunk keeps its own 0-based index.
x_val_t4 = pd.concat([
    pd.DataFrame(columns=x_val_t4.columns, data=imputed_chunk)
    for imputed_chunk in (x_val_t4_1_knn, x_val_t4_2_knn)
])

In [0]:
# Join the four imputed per-period validation frames side by side (t1..t4).
x_val_knn = pd.concat(
    [x_val_t1, x_val_t2, x_val_t3, x_val_t4],
    axis='columns',
)

In [0]:
# Standardize the KNN-imputed validation features.
# NOTE(review): the scaler is re-fitted on the validation split rather than
# reusing the fit from the training split -- confirm this is intended.
scaler = StandardScaler()

x_val_knn_scaled = pd.DataFrame(data = scaler.fit_transform(x_val_knn), columns = x_val_knn.columns)

# Put the held-out metadata columns back in period order t1..t4. This cell
# used to append them as t4, t1, t2, t3, which gave the saved validation file a
# different column order than the KNN training file and misaligned any
# position-based use of the two.
for column_name, column_frame in [
    ('t1_an', t1_an), ('t2_an', t2_an), ('t3_an', t3_an), ('t4_an', t4_an),
    ('t1_tip_activitate', t1_tip_activitate), ('t2_tip_activitate', t2_tip_activitate),
    ('t3_tip_activitate', t3_tip_activitate), ('t4_tip_activitate', t4_tip_activitate),
    ('t1_cod_caen', t1_cod_caen), ('t2_cod_caen', t2_cod_caen),
    ('t3_cod_caen', t3_cod_caen), ('t4_cod_caen', t4_cod_caen),
]:
    x_val_knn_scaled[column_name] = column_frame.to_numpy()

In [0]:
# Save the standardized KNN-imputed validation features without the row index.
x_val_knn_scaled.to_csv(
    '/content/gdrive/My Drive/An 1 Masters/Big Data/project data/x_val_knn_scaled.csv',
    index=False,
)

In [0]:
# Normalize the KNN-imputed validation features with sklearn's `normalize`
# (default settings).
x_val_knn_normalized = pd.DataFrame(data = normalize(x_val_knn), columns = x_val_knn.columns)

# Put the held-out metadata columns back in period order t1..t4. This cell
# used to append them as t4, t1, t2, t3, which gave the saved validation file a
# different column order than the KNN training file and misaligned any
# position-based use of the two.
for column_name, column_frame in [
    ('t1_an', t1_an), ('t2_an', t2_an), ('t3_an', t3_an), ('t4_an', t4_an),
    ('t1_tip_activitate', t1_tip_activitate), ('t2_tip_activitate', t2_tip_activitate),
    ('t3_tip_activitate', t3_tip_activitate), ('t4_tip_activitate', t4_tip_activitate),
    ('t1_cod_caen', t1_cod_caen), ('t2_cod_caen', t2_cod_caen),
    ('t3_cod_caen', t3_cod_caen), ('t4_cod_caen', t4_cod_caen),
]:
    x_val_knn_normalized[column_name] = column_frame.to_numpy()

In [0]:
# Save the normalized KNN-imputed validation features without the row index.
x_val_knn_normalized.to_csv(
    '/content/gdrive/My Drive/An 1 Masters/Big Data/project data/x_val_knn_normalized.csv',
    index=False,
)

#### Test data

In [0]:
# Set aside the metadata columns of every test period (t0..t3) as
# single-column frames with a fresh 0-based index; they skip imputation and are
# re-attached by position.
t0_an, t1_an, t2_an, t3_an = (
    x_test[['t{}_an'.format(period)]].reset_index(drop=True)
    for period in (0, 1, 2, 3)
)
t0_tip_activitate, t1_tip_activitate, t2_tip_activitate, t3_tip_activitate = (
    x_test[['t{}_tip_activitate'.format(period)]].reset_index(drop=True)
    for period in (0, 1, 2, 3)
)
t0_cod_caen, t1_cod_caen, t2_cod_caen, t3_cod_caen = (
    x_test[['t{}_cod_caen'.format(period)]].reset_index(drop=True)
    for period in (0, 1, 2, 3)
)

# One frame per period: every column whose name contains the period tag,
# minus that period's three metadata columns.
x_test_t0, x_test_t1, x_test_t2, x_test_t3 = (
    x_test.loc[:, [col for col in x_test.columns if tag in col]]
          .drop(columns=[tag + '_an', tag + '_cod_caen', tag + '_tip_activitate'])
    for tag in ('t0', 't1', 't2', 't3')
)

In [0]:
# Impute each test period in one pass with its own 3-neighbour KNN imputer.
x_test_t0_knn, x_test_t1_knn, x_test_t2_knn, x_test_t3_knn = (
    KNN(k=3, verbose=False).fit_transform(period_frame)
    for period_frame in (x_test_t0, x_test_t1, x_test_t2, x_test_t3)
)

# Wrap the imputed arrays with their original column labels and join the
# periods side by side (t0, t1, t2, t3).
x_test_knn = pd.concat(
    [
        pd.DataFrame(columns=period_frame.columns, data=imputed)
        for imputed, period_frame in zip(
            (x_test_t0_knn, x_test_t1_knn, x_test_t2_knn, x_test_t3_knn),
            (x_test_t0, x_test_t1, x_test_t2, x_test_t3),
        )
    ],
    axis=1,
)

In [0]:
# Sanity check: print the NaN count of every column that still has missing
# values after imputation (prints nothing when the frame is complete).
for elem in x_test_knn.isna().sum():
  if elem == 0:
    continue
  print(elem)

In [0]:
# Standardize the KNN-imputed test features.
# NOTE(review): the scaler is re-fitted on the test split rather than reusing
# the fit from the training split -- confirm this is intended.
scaler = StandardScaler()
x_test_knn_scaled = pd.DataFrame(
    columns=x_test_knn.columns,
    data=scaler.fit_transform(x_test_knn),
)

# Put the held-out metadata columns back, unscaled, in period order t0..t3.
for column_name, column_frame in [
    ('t0_an', t0_an), ('t1_an', t1_an), ('t2_an', t2_an), ('t3_an', t3_an),
    ('t0_tip_activitate', t0_tip_activitate), ('t1_tip_activitate', t1_tip_activitate),
    ('t2_tip_activitate', t2_tip_activitate), ('t3_tip_activitate', t3_tip_activitate),
    ('t0_cod_caen', t0_cod_caen), ('t1_cod_caen', t1_cod_caen),
    ('t2_cod_caen', t2_cod_caen), ('t3_cod_caen', t3_cod_caen),
]:
    x_test_knn_scaled[column_name] = column_frame.to_numpy()

In [0]:
# Save the standardized KNN-imputed test features without the row index.
x_test_knn_scaled.to_csv(
    '/content/gdrive/My Drive/An 1 Masters/Big Data/project data/x_test_knn_scaled.csv',
    index=False,
)

In [0]:
# Clamp +/-inf to the platform integer limits before normalizing.
x_test_knn = (
    x_test_knn
    .replace([np.inf], sys.maxsize)
    .replace([-np.inf], -sys.maxsize - 1)
)

# Normalize the KNN-imputed test features with sklearn's `normalize`
# (default settings).
x_test_knn_normalized = pd.DataFrame(
    columns=x_test_knn.columns,
    data=normalize(x_test_knn),
)

# Put the held-out metadata columns back, un-normalized, in period order t0..t3.
for column_name, column_frame in [
    ('t0_an', t0_an), ('t1_an', t1_an), ('t2_an', t2_an), ('t3_an', t3_an),
    ('t0_tip_activitate', t0_tip_activitate), ('t1_tip_activitate', t1_tip_activitate),
    ('t2_tip_activitate', t2_tip_activitate), ('t3_tip_activitate', t3_tip_activitate),
    ('t0_cod_caen', t0_cod_caen), ('t1_cod_caen', t1_cod_caen),
    ('t2_cod_caen', t2_cod_caen), ('t3_cod_caen', t3_cod_caen),
]:
    x_test_knn_normalized[column_name] = column_frame.to_numpy()

In [0]:
# Save the normalized KNN-imputed test features without the row index.
x_test_knn_normalized.to_csv(
    '/content/gdrive/My Drive/An 1 Masters/Big Data/project data/x_test_knn_normalized.csv',
    index=False,
)

In [0]:
# For every KNN-imputed variant: add the derived ratio columns (periods t1..t4
# for train/validation, t0..t3 for test), clamp +/-inf to the platform integer
# limits, and save the result without the row index.

# --- standardized features ---
x_train_knn_scaled_fe = (
    engineer_features(x_train_knn_scaled, 1, 5)
    .replace([np.inf], sys.maxsize)
    .replace([-np.inf], -sys.maxsize - 1)
)
x_train_knn_scaled_fe.to_csv(
    '/content/gdrive/My Drive/An 1 Masters/Big Data/project data/x_train_knn_scaled_fe.csv',
    index=False,
)

x_val_knn_scaled_fe = (
    engineer_features(x_val_knn_scaled, 1, 5)
    .replace([np.inf], sys.maxsize)
    .replace([-np.inf], -sys.maxsize - 1)
)
x_val_knn_scaled_fe.to_csv(
    '/content/gdrive/My Drive/An 1 Masters/Big Data/project data/x_val_knn_scaled_fe.csv',
    index=False,
)

x_test_knn_scaled_fe = (
    engineer_features(x_test_knn_scaled, 0, 4)
    .replace([np.inf], sys.maxsize)
    .replace([-np.inf], -sys.maxsize - 1)
)
x_test_knn_scaled_fe.to_csv(
    '/content/gdrive/My Drive/An 1 Masters/Big Data/project data/x_test_knn_scaled_fe.csv',
    index=False,
)

# --- normalized features ---
x_train_knn_normalized_fe = (
    engineer_features(x_train_knn_normalized, 1, 5)
    .replace([np.inf], sys.maxsize)
    .replace([-np.inf], -sys.maxsize - 1)
)
x_train_knn_normalized_fe.to_csv(
    '/content/gdrive/My Drive/An 1 Masters/Big Data/project data/x_train_knn_normalized_fe.csv',
    index=False,
)

x_val_knn_normalized_fe = (
    engineer_features(x_val_knn_normalized, 1, 5)
    .replace([np.inf], sys.maxsize)
    .replace([-np.inf], -sys.maxsize - 1)
)
x_val_knn_normalized_fe.to_csv(
    '/content/gdrive/My Drive/An 1 Masters/Big Data/project data/x_val_knn_normalized_fe.csv',
    index=False,
)

x_test_knn_normalized_fe = (
    engineer_features(x_test_knn_normalized, 0, 4)
    .replace([np.inf], sys.maxsize)
    .replace([-np.inf], -sys.maxsize - 1)
)
x_test_knn_normalized_fe.to_csv(
    '/content/gdrive/My Drive/An 1 Masters/Big Data/project data/x_test_knn_normalized_fe.csv',
    index=False,
)

### Softimpute

In [0]:
# Start the SoftImpute section from fresh copies of the initial splits.
x_train, x_val, x_test = x_train_init.copy(), x_val_init.copy(), x_test_init.copy()
y_train, y_val, y_test = y_train_init.copy(), y_val_init.copy(), y_test_init.copy()

#### Train data

In [0]:
# Discard rows that `drop_rows_with_full_nan` flags as empty, together with
# their labels.
x_train, y_train = drop_rows_with_full_nan(x_train, y_train)

# Set aside the metadata columns of every period as single-column frames with
# a fresh 0-based index; they skip imputation and are re-attached by position.
t1_an, t2_an, t3_an, t4_an = (
    x_train[['t{}_an'.format(period)]].reset_index(drop=True)
    for period in (1, 2, 3, 4)
)
t1_tip_activitate, t2_tip_activitate, t3_tip_activitate, t4_tip_activitate = (
    x_train[['t{}_tip_activitate'.format(period)]].reset_index(drop=True)
    for period in (1, 2, 3, 4)
)
t1_cod_caen, t2_cod_caen, t3_cod_caen, t4_cod_caen = (
    x_train[['t{}_cod_caen'.format(period)]].reset_index(drop=True)
    for period in (1, 2, 3, 4)
)

# Leave only the columns that go through imputation.
x_train = x_train.drop(columns=[
    't{}_{}'.format(period, field)
    for period in (1, 2, 3, 4)
    for field in ('an', 'cod_caen', 'tip_activitate')
])

# Column labels to restore once the imputers hand back a bare array.
train_cols = x_train.columns

In [0]:
# Rescale with BiScaler, then fill the gaps with SoftImpute; both work on a
# float32 array, so the column labels are restored afterwards.
x_train_norm = BiScaler(verbose=False).fit_transform(
    x_train.to_numpy().astype('float32')
)
x_train_norm_soft_np = SoftImpute(verbose=False).fit_transform(x_train_norm)
x_train_norm_soft = pd.DataFrame(columns=train_cols, data=x_train_norm_soft_np)

# Put the held-out metadata columns back in period order t1..t4.
for column_name, column_frame in [
    ('t1_an', t1_an), ('t2_an', t2_an), ('t3_an', t3_an), ('t4_an', t4_an),
    ('t1_tip_activitate', t1_tip_activitate), ('t2_tip_activitate', t2_tip_activitate),
    ('t3_tip_activitate', t3_tip_activitate), ('t4_tip_activitate', t4_tip_activitate),
    ('t1_cod_caen', t1_cod_caen), ('t2_cod_caen', t2_cod_caen),
    ('t3_cod_caen', t3_cod_caen), ('t4_cod_caen', t4_cod_caen),
]:
    x_train_norm_soft[column_name] = column_frame.to_numpy()

In [0]:
# Add the derived ratio columns for periods t1..t4, clamp +/-inf to the
# platform integer limits, then save features and the matching labels.
x_train_norm_soft_fe = (
    engineer_features(x_train_norm_soft, 1, 5)
    .replace([np.inf], sys.maxsize)
    .replace([-np.inf], -sys.maxsize - 1)
)
x_train_norm_soft_fe.to_csv(
    '/content/gdrive/My Drive/An 1 Masters/Big Data/project data/x_train_norm_soft_fe.csv',
    index=False,
)
y_train.to_csv(
    '/content/gdrive/My Drive/An 1 Masters/Big Data/project data/y_train_norm_soft_fe.csv',
    index=False,
)

#### Validation data

In [0]:
# Discard rows that `drop_rows_with_full_nan` flags as empty, together with
# their labels.
x_val, y_val = drop_rows_with_full_nan(x_val, y_val)

# Set aside the metadata columns of every period as single-column frames with
# a fresh 0-based index; they skip imputation and are re-attached by position.
t1_an, t2_an, t3_an, t4_an = (
    x_val[['t{}_an'.format(period)]].reset_index(drop=True)
    for period in (1, 2, 3, 4)
)
t1_tip_activitate, t2_tip_activitate, t3_tip_activitate, t4_tip_activitate = (
    x_val[['t{}_tip_activitate'.format(period)]].reset_index(drop=True)
    for period in (1, 2, 3, 4)
)
t1_cod_caen, t2_cod_caen, t3_cod_caen, t4_cod_caen = (
    x_val[['t{}_cod_caen'.format(period)]].reset_index(drop=True)
    for period in (1, 2, 3, 4)
)

# Leave only the columns that go through imputation.
x_val = x_val.drop(columns=[
    't{}_{}'.format(period, field)
    for period in (1, 2, 3, 4)
    for field in ('an', 'cod_caen', 'tip_activitate')
])

# Column labels to restore once the imputers hand back a bare array.
val_cols = x_val.columns

In [0]:
# Rescale with BiScaler, then fill the gaps with SoftImpute; both work on a
# float32 array, so the column labels are restored afterwards.
# NOTE(review): both transformers are fitted on the validation split itself --
# confirm this is intended.
x_val_norm = BiScaler(verbose=False).fit_transform(
    x_val.to_numpy().astype('float32')
)
x_val_norm_soft_np = SoftImpute(verbose=False).fit_transform(x_val_norm)
x_val_norm_soft = pd.DataFrame(columns=val_cols, data=x_val_norm_soft_np)

# Put the held-out metadata columns back in period order t1..t4.
for column_name, column_frame in [
    ('t1_an', t1_an), ('t2_an', t2_an), ('t3_an', t3_an), ('t4_an', t4_an),
    ('t1_tip_activitate', t1_tip_activitate), ('t2_tip_activitate', t2_tip_activitate),
    ('t3_tip_activitate', t3_tip_activitate), ('t4_tip_activitate', t4_tip_activitate),
    ('t1_cod_caen', t1_cod_caen), ('t2_cod_caen', t2_cod_caen),
    ('t3_cod_caen', t3_cod_caen), ('t4_cod_caen', t4_cod_caen),
]:
    x_val_norm_soft[column_name] = column_frame.to_numpy()

In [0]:
# Add the derived ratio columns for periods t1..t4, clamp +/-inf to the
# platform integer limits, then save features and the matching labels.
x_val_norm_soft_fe = (
    engineer_features(x_val_norm_soft, 1, 5)
    .replace([np.inf], sys.maxsize)
    .replace([-np.inf], -sys.maxsize - 1)
)
x_val_norm_soft_fe.to_csv(
    '/content/gdrive/My Drive/An 1 Masters/Big Data/project data/x_val_norm_soft_fe.csv',
    index=False,
)
y_val.to_csv(
    '/content/gdrive/My Drive/An 1 Masters/Big Data/project data/y_val_norm_soft_fe.csv',
    index=False,
)

#### Test data

In [0]:
# Discard rows that `drop_rows_with_full_nan` flags as empty, together with
# their labels.
x_test, y_test = drop_rows_with_full_nan(x_test, y_test)

# Set aside the metadata columns of every test period (t0..t3) as
# single-column frames with a fresh 0-based index; they skip imputation and are
# re-attached by position.
t0_an, t1_an, t2_an, t3_an = (
    x_test[['t{}_an'.format(period)]].reset_index(drop=True)
    for period in (0, 1, 2, 3)
)
t0_tip_activitate, t1_tip_activitate, t2_tip_activitate, t3_tip_activitate = (
    x_test[['t{}_tip_activitate'.format(period)]].reset_index(drop=True)
    for period in (0, 1, 2, 3)
)
t0_cod_caen, t1_cod_caen, t2_cod_caen, t3_cod_caen = (
    x_test[['t{}_cod_caen'.format(period)]].reset_index(drop=True)
    for period in (0, 1, 2, 3)
)

# Leave only the columns that go through imputation.
x_test = x_test.drop(columns=[
    't{}_{}'.format(period, field)
    for period in (1, 2, 3, 0)
    for field in ('an', 'cod_caen', 'tip_activitate')
])

# Column labels to restore once the imputers hand back a bare array.
test_cols = x_test.columns

In [0]:
# Rescale with BiScaler, then fill the gaps with SoftImpute; both work on a
# float32 array, so the column labels are restored afterwards.
# NOTE(review): both transformers are fitted on the test split itself --
# confirm this is intended.
x_test_norm = BiScaler(verbose = False).fit_transform(x_test.to_numpy().astype('float32'))
# The raw imputed array gets its own `_np` name, as in the train and validation
# cells, so `x_test_norm_soft` is only ever bound to a DataFrame.
x_test_norm_soft_np = SoftImpute(verbose = False).fit_transform(x_test_norm)
x_test_norm_soft = pd.DataFrame(data = x_test_norm_soft_np, columns = test_cols)

# Put the held-out metadata columns back in period order t0..t3.
for column_name, column_frame in [
    ('t0_an', t0_an), ('t1_an', t1_an), ('t2_an', t2_an), ('t3_an', t3_an),
    ('t0_tip_activitate', t0_tip_activitate), ('t1_tip_activitate', t1_tip_activitate),
    ('t2_tip_activitate', t2_tip_activitate), ('t3_tip_activitate', t3_tip_activitate),
    ('t0_cod_caen', t0_cod_caen), ('t1_cod_caen', t1_cod_caen),
    ('t2_cod_caen', t2_cod_caen), ('t3_cod_caen', t3_cod_caen),
]:
    x_test_norm_soft[column_name] = column_frame.to_numpy()

In [0]:
# Add the derived ratio columns for periods t0..t3, clamp +/-inf to the
# platform integer limits, then save features and the matching labels.
x_test_norm_soft_fe = (
    engineer_features(x_test_norm_soft, 0, 4)
    .replace([np.inf], sys.maxsize)
    .replace([-np.inf], -sys.maxsize - 1)
)
x_test_norm_soft_fe.to_csv(
    '/content/gdrive/My Drive/An 1 Masters/Big Data/project data/x_test_norm_soft_fe.csv',
    index=False,
)
y_test.to_csv(
    '/content/gdrive/My Drive/An 1 Masters/Big Data/project data/y_test_norm_soft_fe.csv',
    index=False,
)