### Import libraries

In [1]:
import category_encoders as ce
import pandas as pd
import numpy as np
import json

from sklearn.model_selection import train_test_split

# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

### Read datasets

In [2]:
# Load the training and test CSV files.
train, test = (pd.read_csv(path) for path in ('data/train.csv', 'data/test.csv'))

# Stack both sets (train rows first, then test rows) under a fresh 0..n-1 index
# so that the preprocessing below is applied to them together.
df = pd.concat([train, test], ignore_index=True)

### Check for features containing NaN's

In [3]:
# Names of all columns that hold at least one missing value.
nan_features = [column for column, has_nan in df.isna().any().items() if has_nan]

if nan_features:
    print("Features containing NaN's:", *nan_features)
else:
    print("There are no features containing NaN's")

Features containing NaN's: payment_delay


The dataset contains NaN's because test.csv does not contain the `payment_delay` column.

### Data preprocessing

##### constant features

In [4]:
def get_constant_features(df):
    """Return the names of the columns of ``df`` holding at most one distinct value.

    NaN is counted as a value of its own (``dropna=False``), so an all-NaN
    column is reported as constant, while a column mixing a single value with
    NaN is not.
    """
    return [
        column
        for column in df.columns
        if len(df[column].value_counts(dropna=False)) <= 1
    ]

constant_features = get_constant_features(df)

if constant_features:
    # Remove the constant columns from df in place and report how many were dropped.
    df.drop(columns=constant_features, inplace=True)
    print(f"{len(constant_features)} features were dropped")
else:
    print("There are no constant features")

There are no constant features


##### numerical features

In [5]:
# Collect the names of all columns that currently have a numeric dtype.
numeric_columns = df.select_dtypes(include=np.number).columns
numerical_features = numeric_columns.to_list()
print(f"There are {len(numerical_features)} numerical features in our dataframe")

There are 15 numerical features in our dataframe


In [6]:
# Persist the list of numerical feature names as a JSON array.
output_file = "data/numerical_features.json"

with open(output_file, mode='w') as json_file:
    json_file.write(json.dumps(numerical_features))

##### binary categorical features

In [7]:
def get_binary_features(df):
    """Return the names of the columns of ``df`` holding exactly two distinct values.

    NaN counts as a distinct value (``dropna=False``), so a column holding a
    single value plus NaN is reported as binary.
    """
    def distinct_count(column):
        # Number of distinct entries in the column, NaN included.
        return len(df[column].value_counts(dropna=False))

    return [column for column in df.columns if distinct_count(column) == 2]

binary_features = get_binary_features(df)

# Report which columns were detected as binary (exactly two distinct values).
if binary_features:
    print("Binary features:", *binary_features)
else:
    print("There are no binary features")

Binary features: international_plan voice_mail_plan


In [8]:
# Encode each binary feature as an integer flag: 'no' -> 0, any other value -> 1.
for feature in binary_features:
    is_no = df[feature] == 'no'
    df[feature] = np.where(is_no, 0, 1)

##### object features

In [9]:
def get_object_features(df):
    """Return the names of the columns of ``df`` whose dtype is ``object``."""
    return [column for column in df.columns if df[column].dtype == 'object']

object_features = get_object_features(df)
# payment_delay is the target column (string labels plus NaN's), not a feature
# to encode, so take it out of the list.
object_features.remove('payment_delay')

if object_features:
    print("Object features:", *object_features)
else:
    print("There are no object features")

Object features: state area_code


In [10]:
# area_code does not have a high cardinality, so it is one-hot encoded:
# build the indicator columns, append them to df and remove the original column.
area_code_encoded = pd.get_dummies(df['area_code'], prefix='area_code')
df = pd.concat([df, area_code_encoded], axis=1).drop(columns='area_code')

##### Split dataframes into training, validation and testing

In [11]:
# `df` is train.csv followed by test.csv under a fresh 0..n-1 index, so the
# first len(train) rows are exactly the training rows. Splitting at that
# position (instead of a hard-coded 40% fraction) keeps the two sources apart
# whatever their relative sizes are; a fixed fraction is only right when
# test.csv happens to make up exactly 40% of the combined frame.
n_train_rows = len(train)
train_df_temp = df.iloc[:n_train_rows]
# .copy() gives independent frames, so the column assignments done in later
# cells do not trigger pandas' "setting on a copy of a slice" warning.
test_df = df.iloc[n_train_rows:].copy()

# Hold out 15% of the training rows for validation, stratified on the target
# so both parts keep the same payment_delay class proportions.
train_df, valid_df = train_test_split(
    train_df_temp,
    test_size=0.15,
    shuffle=True,
    random_state=42,
    stratify=train_df_temp['payment_delay'],
)
train_df = train_df.copy()
valid_df = valid_df.copy()

# Order matters: later cells treat dfs[0] as the training split and dfs[1:]
# as the splits that are only transformed.
dfs = [train_df, valid_df, test_df]

In [12]:
# Frequency encoding for the state variable, because the cardinality of this
# variable is pretty high. With normalize=True the encoder emits relative
# frequencies instead of raw counts.
count_enc = ce.CountEncoder(normalize=True)
# NOTE: the target encoder is only instantiated here; this cell never fits or
# applies it.
target_enc = ce.TargetEncoder(cols=['state'])

# Fit the encoder on the training split only, then replace `state` by its encoding.
train_df['state_freq_encoded'] = count_enc.fit_transform(train_df['state'])
train_df.drop('state', axis=1, inplace=True)

# The validation and test splits reuse the mapping fitted on the training split.
# The loop variable is deliberately NOT called `df`: that would rebind the
# notebook-level `df` (the combined dataset) to the test split and break any
# earlier cell that is re-run. The frames are mutated in place so that
# valid_df and test_df (the same objects as the entries of dfs) are updated.
for split_df in dfs[1:]:
    split_df['state_freq_encoded'] = count_enc.transform(split_df['state'])
    split_df.drop('state', axis=1, inplace=True)

In [13]:
# Recombine the training and validation splits into one frame with a fresh 0..n-1 index.
train_set = pd.concat([train_df, valid_df], ignore_index=True)

##### Convert dependent variable to numerical

In [14]:
# Encode the target as an integer flag: 'no' -> 0, any other value -> 1.
target_is_no = train_set['payment_delay'] == 'no'
train_set['payment_delay'] = np.where(target_is_no, 0, 1)

### Export preprocessed dataframes

In [15]:
# Write both preprocessed frames to CSV without the index column.
for frame, csv_path in ((train_set, 'data/train_set.csv'), (test_df, 'data/test_set.csv')):
    frame.to_csv(csv_path, index=False)