In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split

In [2]:
# Run configuration. Alternative values kept from earlier runs:
#   SUFFIX:        ''
#   FILE_TO_LOAD:  'CHES2019_experts_recategorized_4l3c4r.csv', 'CHES2019_experts.csv'
#   FOLDER_PREFIX: 'base_data/', 'recategorized_4l3c4r/', ''
SUFFIX = '_recategorized'                            # appended to every exported file name
FILE_TO_LOAD = 'CHES2019_experts_recategorized.csv'  # input file, read from ../data/
FOLDER_PREFIX = 'recategorized/'                     # sub-folder of ../data/base_data/ for the exports

In [3]:
# Make sure the output folder for this run exists before any CSV is written.
# os.makedirs also creates missing parent folders (os.mkdir raises
# FileNotFoundError when '../data/base_data/' itself is absent), and
# exist_ok=True keeps the call safe if the folder appears between the
# existence check and the creation.
output_dir = '../data/base_data/' + FOLDER_PREFIX
if not os.path.exists(output_dir):
    os.makedirs(output_dir, exist_ok=True)
    print('New folder created')

In [4]:
# Load the CSV named by FILE_TO_LOAD from the ../data/ folder.
source_path = '../data/' + FILE_TO_LOAD
data_base = pd.read_csv(source_path)

In [5]:
# Preview the first rows of the loaded frame.
data_base.head()

Unnamed: 0,econ_interven,environment,redistribution,civlib_laworder,immigrate_policy,sociallifestyle,lrgen_factor,lrecon_factor,galtan_factor
0,5.0,5.0,8.0,6.0,7.0,5.0,c,c,c
1,0.0,5.0,1.0,7.0,5.0,8.0,,c,r
2,5.0,5.0,8.0,6.0,3.0,5.0,c,c,c
3,5.0,5.0,8.0,6.0,6.0,5.0,c,r,r
4,2.0,5.0,8.0,9.0,10.0,8.0,c,r,r


In [6]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
data_base.describe()

Unnamed: 0,econ_interven,environment,redistribution,civlib_laworder,immigrate_policy,sociallifestyle
count,3432.0,3262.0,3396.0,3405.0,3438.0,3461.0
mean,4.548951,5.188228,4.409894,5.151542,5.681792,4.473851
std,2.63232,2.647456,2.533628,2.930842,2.936724,3.266881
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,3.0,3.0,3.0,3.0,2.0
50%,5.0,5.0,4.0,5.0,6.0,4.0
75%,6.0,7.0,6.0,8.0,8.0,7.0
max,10.0,10.0,10.0,10.0,10.0,10.0


In [7]:
# Column dtypes and non-null counts, to see how many values are missing per column.
data_base.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3823 entries, 0 to 3822
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   econ_interven     3432 non-null   float64
 1   environment       3262 non-null   float64
 2   redistribution    3396 non-null   float64
 3   civlib_laworder   3405 non-null   float64
 4   immigrate_policy  3438 non-null   float64
 5   sociallifestyle   3461 non-null   float64
 6   lrgen_factor      3610 non-null   object 
 7   lrecon_factor     3554 non-null   object 
 8   galtan_factor     3596 non-null   object 
dtypes: float64(6), object(3)
memory usage: 268.9+ KB


In [8]:
# Overview of missing values: for each threshold i, count the rows that would
# remain when at least i non-NA values are required among the feature columns
# (every column except the last three).
feature_cols = list(data_base.columns[:-3])
for i in range(7):
    remaining = data_base.dropna(subset=feature_cols, thresh=i)
    print(f'{i}: {len(remaining)}')

0: 3823
1: 3649
2: 3599
3: 3535
4: 3425
5: 3274
6: 2912


# Preprocessing
Procedure:
- ~~remove all rows where NAN's occur for independent variables (0-5) # where more than 2 (>=3) NAN's occur for independent variables (0-5)~~
- Remove every row in which any of the independent variables (columns 0-5) is NaN, i.e. keep complete cases only
- Split the dataset into subsets according to to-be-predicted columns lrgen, lrecon, galtan
- In each sub-dataset, remove the rows where the to-be-predicted column is NA and drop the non-required columns (the other two to-be-predicted columns)
- Subsequently, split each dataset into a combined training/validation dataset and a test dataset
- Any NA's remaining in the independent variables will then be imputed using R (with the complete-case rule above, none are expected to remain)

In [12]:
# Keep complete cases only: a row survives when all 6 values in the feature
# columns (every column except the last three) are provided.
# (Earlier variant, no longer used: at least 4 values out of 6 must be provided.)
required_columns = list(data_base.columns[:-3])
data_base = data_base.dropna(subset=required_columns, thresh=6)

In [10]:
# Build one frame per prediction target: keep the rows where that target is
# present and drop the two other target columns, so the target ends up as the
# last column of each frame.
has_lrgen = data_base['lrgen_factor'].notna()
has_lrecon = data_base['lrecon_factor'].notna()
has_galtan = data_base['galtan_factor'].notna()

data_lrgen_base = data_base.loc[has_lrgen].drop(columns=['lrecon_factor', 'galtan_factor'])
data_lrecon_base = data_base.loc[has_lrecon].drop(columns=['lrgen_factor', 'galtan_factor'])
data_galtan_base = data_base.loc[has_galtan].drop(columns=['lrecon_factor', 'lrgen_factor'])

In [14]:
# Split parameters, both expressed as fractions of the full dataset.
# No separate validation split is made: validation and training data stay
# together because cross validation is used in the training phase.
train_size = 0.9  # share handed to train_test_split as train_size
test_size = 0.1   # share left over for the test set

In [15]:
# lrgen-data: features are every column but the last, the target is the last column.
prefix = 'data_lrgen_base_'
X, y = data_lrgen_base.iloc[:, :-1], data_lrgen_base.iloc[:, -1]

# Split into a combined training/validation part and a test part (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, random_state=34)

# Write the four parts next to each other, one CSV per part.
out_dir = '../data/base_data/' + FOLDER_PREFIX
for part_name, part in (('X_train', X_train), ('y_train', y_train),
                        ('X_test', X_test), ('y_test', y_test)):
    part.to_csv(out_dir + prefix + part_name + SUFFIX + '.csv')

In [16]:
# lrecon-data: features are every column but the last, the target is the last column.
prefix = 'data_lrecon_base_'
X, y = data_lrecon_base.iloc[:, :-1], data_lrecon_base.iloc[:, -1]

# Split into a combined training/validation part and a test part (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, random_state=34)

# Write the four parts next to each other, one CSV per part.
out_dir = '../data/base_data/' + FOLDER_PREFIX
for part_name, part in (('X_train', X_train), ('y_train', y_train),
                        ('X_test', X_test), ('y_test', y_test)):
    part.to_csv(out_dir + prefix + part_name + SUFFIX + '.csv')

In [17]:
# galtan-data: features are every column but the last, the target is the last column.
prefix = 'data_galtan_base_'
X, y = data_galtan_base.iloc[:, :-1], data_galtan_base.iloc[:, -1]

# Split into a combined training/validation part and a test part (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, random_state=34)

# Write the four parts next to each other, one CSV per part.
out_dir = '../data/base_data/' + FOLDER_PREFIX
for part_name, part in (('X_train', X_train), ('y_train', y_train),
                        ('X_test', X_test), ('y_test', y_test)):
    part.to_csv(out_dir + prefix + part_name + SUFFIX + '.csv')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2889 entries, 0 to 3816
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   econ_interven     2889 non-null   float64
 1   environment       2889 non-null   float64
 2   redistribution    2889 non-null   float64
 3   civlib_laworder   2889 non-null   float64
 4   immigrate_policy  2889 non-null   float64
 5   sociallifestyle   2889 non-null   float64
dtypes: float64(6)
memory usage: 158.0 KB
