In [1]:
!pip install pandas
!pip install openml
!pip install nbconvert
!brew install pandoc
!pip install seaborn

!pip install sklearn

You should consider upgrading via the '/Users/nikolaus/Documents/Uni/TU/Machine Learning/2_nn_implementation/venv/bin/python -m pip install --upgrade pip' command.[0m
Collecting openml
  Using cached openml-0.14.1-py3-none-any.whl
Collecting liac-arff>=2.4.0
  Using cached liac_arff-2.5.0-py3-none-any.whl
Collecting xmltodict
  Using cached xmltodict-0.13.0-py2.py3-none-any.whl (10.0 kB)
Collecting pyarrow
  Downloading pyarrow-14.0.1-cp310-cp310-macosx_10_14_x86_64.whl (26.9 MB)
     |████████████████████████████████| 26.9 MB 2.7 MB/s            
Collecting minio
  Downloading minio-7.2.0-py3-none-any.whl (83 kB)
     |████████████████████████████████| 83 kB 983 kB/s            
Collecting pycryptodome
  Downloading pycryptodome-3.19.0-cp35-abi3-macosx_10_9_x86_64.whl (1.6 MB)
     |████████████████████████████████| 1.6 MB 3.2 MB/s            
Installing collected packages: pycryptodome, xmltodict, pyarrow, minio, liac-arff, openml
Successfully installed liac-arff-2.

In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.preprocessing import minmax_scale, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split



# Preprocessing

In [2]:
# general preprocessor
# with MinMaxScaler
preprocessor = Pipeline(
   steps=[
       #("imputation", KNNImputer(n_neighbors=2, weights="uniform")),
       ("scaler", MinMaxScaler())
    ]
)
# without scaling
preprocessor_no_scaling = Pipeline(
   steps=[
       ("imputation", KNNImputer(n_neighbors=2, weights="uniform"))
    ]
)

In [3]:
# shuffling two objects in the same way
# input: two pandas objects whose orders correspond
# output: the input objects, but with shuffled order, but their orders still correspond
def shuffle(*objects):
    np.random.seed(42)
    new_index = np.random.permutation(objects[0].index)
    objects = [obj.reindex(new_index) for obj in objects]
    if len(objects) == 1: return objects[0]
    return objects


## Dataset 1: Biomed data on muscular dystropy

In [5]:
with open('data/biomed/biomed_x.pickle', 'rb') as handle:
    x_biomed = pickle.load(handle)
with open('data/biomed/biomed_y.pickle', 'rb') as handle:
    y_biomed = pickle.load(handle)

In [6]:
print(y_biomed)
# recoding the labels to be 0 or 1
# 0: normal
# 1: carrier
y_biomed.replace({"carrier": 1, "normal": 0}, inplace=True)
y_biomed

30     1
171    0
84     0
198    0
60     1
      ..
106    0
14     1
92     0
179    0
102    0
Name: class, Length: 209, dtype: category
Categories (2, int64): [1 < 0]


30     1
171    0
84     0
198    0
60     1
      ..
106    0
14     1
92     0
179    0
102    0
Name: class, Length: 209, dtype: category
Categories (2, int64): [1 < 0]

In [16]:
x_biomed = x_biomed.drop(["Observation_number", "Hospital_identification_number_for_blood_sample", "Date_that_blood_sample_was_taken"], 
                         axis=1)
x_biomed

Unnamed: 0,Age_of_patient,ml,m2,m3,m4
0,30,167.0,89.0,25.6,364.0
1,41,104.0,81.0,26.8,245.0
2,22,30.0,108.0,8.8,284.0
3,22,44.0,104.0,17.4,172.0
4,20,65.0,87.0,23.8,198.0
...,...,...,...,...,...
204,32,41.0,78.5,10.9,191.0
205,32,43.0,87.5,6.0,136.0
206,32,30.0,90.5,15.3,136.0
207,33,30.0,85.0,11.4,176.0


In [17]:
# shuffle the data
x_biomed, y_biomed = shuffle(x_biomed, y_biomed)

In [9]:
biomed = pd.concat([x_biomed, y_biomed], axis=1)
biomed = biomed.dropna()
y_biomed = biomed.pop("class")
x_biomed = biomed
biomed

Unnamed: 0,Age_of_patient,ml,m2,m3,m4
0,0.365854,0.016496,0.700000,0.176309,0.459203
1,0.048780,0.028280,0.678571,0.090909,0.157495
2,0.146341,0.090338,0.551190,0.131313,0.275142
3,0.121951,0.017282,0.761905,0.124885,0.250474
4,0.780488,0.142969,0.684524,0.205693,0.322581
...,...,...,...,...,...
204,0.341463,0.056559,0.503571,0.029385,0.039848
205,0.170732,0.007855,0.678571,0.425161,0.271347
206,0.268293,0.018068,0.738095,0.089991,0.174573
207,0.390244,0.019639,0.460714,0.038567,0.123340


In [10]:
# in total 15 rows have NA values in either m2 or o4
# apply preprocessing, to both the test and training set
#x_biomed_preprocessed = pd.DataFrame(preprocessor.fit_transform(x_biomed))
#x_biomed_preprocessed.columns = x_biomed.columns
#x_biomed_preprocessed

with open('data/biomed/biomed_x.pickle', 'wb') as handle:
        pickle.dump(x_biomed, handle)
with open('data/biomed/biomed_y.pickle', 'wb') as handle:
        pickle.dump(y_biomed, handle)




In [12]:
print(
    "Training features: ", x_biomed_train.shape, "\n",
    "Training labels: ", y_biomed_train.shape, "\n",
    "Test features: ", x_biomed_test.shape, "\n",
    "Test labels: ", y_biomed_test.shape, "\n",
    sep=""
)

Training features: (167, 5)
Training labels: (167,)
Test features: (42, 5)
Test labels: (42,)



In [13]:
with open('data/biomed/preprocessed_biomed_data.pickle', 'wb') as handle:
        pickle.dump(x_biomed_train, handle)
        pickle.dump(x_biomed_train_preprocessed, handle)
        pickle.dump(x_biomed_train_preprocessed_unscaled, handle)
        pickle.dump(y_biomed_train, handle)
        pickle.dump(x_biomed_test, handle)
        pickle.dump(x_biomed_test_preprocessed, handle)
        pickle.dump(x_biomed_test_preprocessed_unscaled, handle)
        pickle.dump(y_biomed_test, handle)

## Dataset 2: Fertility

In [11]:
with open('data/fertility/x_fert.pickle', 'rb') as handle:
    x_fert = pickle.load(handle)
with open('data/fertility/y_fert.pickle', 'rb') as handle:
    y_fert = pickle.load(handle)

# shuffle the data, just in case
x_fert, y_fert = shuffle(x_fert, y_fert)

FileNotFoundError: [Errno 2] No such file or directory: 'data/fertility/x_fert.pickle'

In [15]:
y_fert

83    0
53    0
70    1
45    0
44    0
     ..
60    0
71    0
14    0
92    0
51    0
Name: fertility, Length: 100, dtype: category
Categories (2, int64): [0 < 1]

In [16]:
x_fert.columns

Index(['season', 'age', 'child_diseases', 'accident', 'surgery', 'high_fever',
       'alcohol', 'smoking', 'hours_sitting', 'fall', 'spring', 'summer',
       'winter', 'fever_never', 'fever_not_recent', 'fever_recent',
       'smoking_daily', 'smoking_never', 'smoking_occasionally',
       'alcohol_daily', 'alcohol_rarely_or_never', 'alcohol_several_daily',
       'alcohol_several_weekly', 'alcohol_weekly'],
      dtype='object')

In [17]:

# drop all columns that are not numeric or 1-hot
x_fert = x_fert.drop(["season", "high_fever", "alcohol", "smoking"], axis=1)

In [18]:
x_fert

Unnamed: 0,age,child_diseases,accident,surgery,hours_sitting,fall,spring,summer,winter,fever_never,fever_not_recent,fever_recent,smoking_daily,smoking_never,smoking_occasionally,alcohol_daily,alcohol_rarely_or_never,alcohol_several_daily,alcohol_several_weekly,alcohol_weekly
83,0.86,0,0,0,0.25,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0
53,0.58,0,0,0,0.19,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1
70,0.50,0,0,1,0.88,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1
45,0.53,0,1,1,0.44,0,0,0,1,1,0,0,0,0,1,0,1,0,0,0
44,0.53,0,0,1,0.31,0,0,0,1,1,0,0,0,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,0.56,1,1,1,1.00,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1
71,0.69,0,1,1,0.31,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0
14,0.94,0,0,0,0.25,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0
92,0.75,0,0,1,0.38,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1


In [19]:
x_fert.describe()

Unnamed: 0,age,child_diseases,accident,surgery,hours_sitting,fall,spring,summer,winter,fever_never,fever_not_recent,fever_recent,smoking_daily,smoking_never,smoking_occasionally,alcohol_daily,alcohol_rarely_or_never,alcohol_several_daily,alcohol_several_weekly,alcohol_weekly
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.669,0.13,0.56,0.49,0.4068,0.31,0.37,0.04,0.28,0.28,0.63,0.09,0.21,0.56,0.23,0.01,0.4,0.01,0.19,0.39
std,0.121319,0.337998,0.498888,0.502418,0.186395,0.464823,0.485237,0.196946,0.451261,0.451261,0.485237,0.287623,0.40936,0.498888,0.422953,0.1,0.492366,0.1,0.394277,0.490207
min,0.5,0.0,0.0,0.0,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.56,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.67,0.0,1.0,0.0,0.38,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.75,0.0,1.0,1.0,0.5,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


now everything is 1-hot-encoded

In [20]:
# data splitting
# first join x and y back together
x_fert["y"]=y_fert
# split the whole data into train and test set
x_fert_train, x_fert_test = train_test_split(x_fert, test_size=0.2, random_state=0, stratify=x_fert.y)
# separate the class labels to be their own series again
y_fert_train = x_fert_train.pop("y")
y_fert_test = x_fert_test.pop("y")

In [21]:
print(
    "Training features: ", x_fert_train.shape, "\n",
    "Training labels: ", y_fert_train.shape, "\n",
    "Test features: ", x_fert_test.shape, "\n",
    "Test labels: ", y_fert_test.shape, "\n",
    sep=""
)

Training features: (80, 20)
Training labels: (80,)
Test features: (20, 20)
Test labels: (20,)



In [22]:
with open('data/fertility/preprocessed_fertility_data.pickle', 'wb') as handle:
        pickle.dump(x_fert_train, handle)
        pickle.dump(x_fert_test, handle)
        pickle.dump(y_fert_train, handle)
        pickle.dump(y_fert_test, handle)

## Dataset 3: Amazon Reviews

In [23]:
reviews_train = pd.read_csv('data/reviews/amazon_review_ID.shuf.lrn.csv')  
reviews_test = pd.read_csv('data/reviews/amazon_review_ID.shuf.tes.csv')  

In [24]:
y_reviews_train = reviews_train.Class

In [25]:
y_reviews_train.nunique()

50

In [26]:
y_reviews_train.value_counts()

Class
Calvinnme      21
Walters        21
Nigam          20
Wilson         20
Davisson       19
Chachra        19
Sherwin        18
Vernon         18
Kolln          18
Power          18
Lee            17
Brody          17
Koenig         17
Neal           17
Hayes          17
Harp           17
CFH            17
Auken          17
Brown          16
Grove          16
Engineer       15
Lawyeraau      15
Shea           15
Mitchell       15
Mark           15
Cutey          15
Chell          14
Taylor         14
Peterson       14
Ashbacher      14
Cholette       14
Janson         14
Chandler       14
Mahlers2nd     14
Riley          14
Agresti        13
Messick        13
Lovitt         13
Bukowsky       12
Vision         12
Corn           12
Robert         12
Merritt        12
Blankenship    12
Dent           11
Johnson        11
Goonan         11
Morrison       11
Comdet         10
McKee           9
Name: count, dtype: int64

In [27]:
x_reviews_train = reviews_train.drop(["ID", "Class"], axis=1)
x_reviews_train

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V9991,V9992,V9993,V9994,V9995,V9996,V9997,V9998,V9999,V10000
0,14,9,8,12,7,4,5,3,2,2,...,2,0,1,0,1,0,1,1,0,0
1,12,3,8,6,5,3,2,3,2,0,...,0,0,0,0,0,0,0,0,0,0
2,17,5,7,12,8,3,0,3,2,3,...,0,0,1,0,0,0,0,1,0,0
3,9,6,10,6,4,7,7,3,3,3,...,0,0,0,0,0,2,0,0,0,0
4,8,4,5,5,1,7,0,0,0,0,...,0,0,0,3,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,9,4,3,4,6,7,2,1,0,1,...,0,1,1,1,2,0,0,0,0,0
746,13,6,1,13,2,5,7,5,3,1,...,0,0,0,0,1,0,0,0,1,0
747,19,9,12,3,9,7,5,2,5,2,...,2,1,0,0,0,0,0,0,1,0
748,5,14,16,7,2,12,9,3,7,4,...,0,0,0,0,0,0,0,0,3,0


In [28]:
x_reviews_train.isna().sum().sum()

0

In [29]:
reviews_test_ID = reviews_test['ID']
reviews_test = reviews_test.drop("ID", axis=1)
reviews_test

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V9991,V9992,V9993,V9994,V9995,V9996,V9997,V9998,V9999,V10000
0,12,8,9,10,9,4,0,3,2,1,...,0,0,0,0,0,0,0,0,0,0
1,7,6,2,5,1,7,9,5,4,11,...,4,0,0,0,0,0,0,0,0,0
2,9,7,3,7,5,3,1,2,5,1,...,0,0,0,0,0,1,0,1,0,0
3,7,7,6,6,5,2,14,0,9,8,...,0,0,0,0,0,0,0,0,1,0
4,26,13,11,15,6,4,3,5,4,1,...,0,1,0,1,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,14,5,4,11,7,4,2,3,2,0,...,0,0,0,0,0,1,0,0,0,0
746,13,13,6,3,12,5,4,2,4,10,...,0,0,1,0,0,1,0,0,2,0
747,12,9,2,4,2,4,2,8,5,1,...,0,0,0,0,0,0,0,0,0,0
748,7,5,4,7,10,2,1,0,0,2,...,0,0,0,0,0,0,0,0,0,0


all variables appear to be some sort of rating, I guess either number of views (streams/clicks) or a certain number given as a rating. its not a likert rating, as the numbers per product even go up to over 30. 
if we had any evidence that the ratings were a fixed scale we could use the numbers as is, but since we do ot have that I guess it might be wise to scale the data (but make the step optional just in case)

Im doing mix-max-scaling
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.minmax_scale.html#sklearn.preprocessing.minmax_scale

In [30]:
# shuffle the data
x_reviews_train, y_reviews_train = shuffle(x_reviews_train, y_reviews_train)


In [31]:
# preprocessing

# apply the preprocessing pipeline, to both the training and test data
x_reviews_train_preprocessed = pd.DataFrame(preprocessor.fit_transform(x_reviews_train))
# salvage the column names
x_reviews_train_preprocessed.columns = x_reviews_train.columns
x_reviews_train_preprocessed

# apply the preprocessing pipeline, to both the training and test data
x_reviews_test_preprocessed = pd.DataFrame(preprocessor.fit_transform(reviews_test))
# salvage the column names
x_reviews_test_preprocessed.columns = reviews_test.columns

#unscaled preprocessing
x_reviews_train_preprocessed_unscaled = pd.DataFrame(preprocessor_no_scaling.fit_transform(x_reviews_train))
x_reviews_train_preprocessed_unscaled.columns = x_reviews_train.columns

x_reviews_test_preprocessed_unscaled = pd.DataFrame(preprocessor_no_scaling.fit_transform(reviews_test))
x_reviews_test_preprocessed_unscaled.columns = reviews_test.columns

# preview
x_reviews_train_preprocessed

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V9991,V9992,V9993,V9994,V9995,V9996,V9997,V9998,V9999,V10000
0,0.28125,0.428571,0.40,0.08,0.291667,0.333333,0.000000,0.500000,0.230769,0.263158,...,0.00,0.000000,0.00,0.0,0.0,0.00,0.000000,0.000000,0.000000,0.0
1,0.40625,0.523810,0.25,0.52,0.166667,0.200000,0.111111,0.214286,0.153846,0.000000,...,0.25,0.000000,0.00,0.0,0.0,0.00,0.000000,0.000000,0.333333,0.0
2,0.53125,0.285714,0.20,0.44,0.458333,0.333333,0.611111,0.500000,0.692308,0.105263,...,0.00,0.000000,0.00,0.2,0.0,0.00,0.333333,0.000000,0.000000,0.0
3,0.50000,0.428571,0.90,0.68,0.083333,0.666667,0.000000,0.571429,0.384615,0.105263,...,0.00,0.000000,0.00,0.0,0.0,0.25,0.000000,0.000000,0.000000,0.0
4,0.09375,0.190476,0.15,0.04,0.208333,0.133333,0.500000,0.071429,0.307692,0.052632,...,0.25,0.000000,0.00,0.0,0.0,0.00,0.000000,0.333333,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,0.18750,0.190476,0.05,0.08,0.166667,0.066667,0.000000,0.000000,0.076923,0.052632,...,0.00,0.000000,0.00,0.0,0.0,0.00,0.000000,0.000000,0.000000,0.0
746,0.37500,0.428571,0.30,0.40,0.250000,0.400000,0.000000,0.285714,0.076923,0.105263,...,0.00,0.000000,0.25,0.0,0.0,0.00,0.333333,0.000000,0.000000,0.0
747,0.34375,0.095238,0.10,0.20,0.208333,0.133333,0.333333,0.214286,0.384615,0.052632,...,0.00,0.166667,0.00,0.0,0.0,0.25,0.000000,0.000000,0.000000,0.0
748,0.28125,0.571429,0.55,0.28,0.125000,0.533333,0.388889,0.214286,0.384615,0.210526,...,0.00,0.000000,0.00,0.0,0.0,0.00,0.000000,0.000000,0.333333,0.0


In [32]:
print(
    "Training features: ", x_reviews_train_preprocessed.shape, "\n",
    "Training labels: ", y_reviews_train.shape, "\n",
    "Test features: ", x_reviews_test_preprocessed.shape, "\n",
    sep=""
)

Training features: (750, 10000)
Training labels: (750,)
Test features: (750, 10000)



In [33]:
with open('data/reviews/preprocessed_reviews_data.pickle', 'wb') as handle:
        pickle.dump(x_reviews_train, handle)
        pickle.dump(x_reviews_train_preprocessed, handle)
        pickle.dump(x_reviews_train_preprocessed_unscaled, handle)
        pickle.dump(y_reviews_train, handle)
        pickle.dump(x_reviews_test_preprocessed, handle)
        pickle.dump(x_reviews_test_preprocessed_unscaled, handle)
        pickle.dump(reviews_test_ID, handle)

## Dataset 4: Congress

In [34]:
congress_train = pd.read_csv("data/congress/CongressionalVotingID.shuf.lrn.csv")
sample_solution = pd.read_csv("data/congress/CongressionalVotingID.shuf.sol.ex.csv")
x_congress_test = pd.read_csv("data/congress/CongressionalVotingID.shuf.tes.csv")

In [35]:
congress_train
#the test set looks the same, just smaller and no class column

Unnamed: 0,ID,class,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-crporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,249,republican,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown
1,140,democrat,y,n,y,n,n,y,y,y,y,n,n,n,n,n,y,y
2,383,democrat,y,y,n,y,y,y,n,n,n,n,y,n,y,y,n,unknown
3,201,democrat,n,n,y,n,n,n,y,y,y,n,n,n,n,y,y,y
4,297,republican,n,n,y,y,y,y,n,n,n,y,n,y,y,y,y,y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213,106,democrat,y,y,y,n,n,n,n,y,y,n,y,n,n,n,y,y
214,271,democrat,y,y,y,n,n,n,y,y,y,n,n,n,n,n,y,y
215,275,republican,y,n,n,y,y,n,y,n,n,y,n,n,n,y,y,y
216,354,republican,n,n,y,y,y,y,y,n,n,y,y,y,y,y,n,y


In [36]:
#Extracting independent variables
y_congress_train = congress_train['class']

In [37]:
#Extract dependent variable
x_congress_train = congress_train.drop(["ID", "class"], axis=1)
congress_test_ID=pd.DataFrame(x_congress_test['ID'])
x_congress_test = x_congress_test.drop(["ID"], axis=1)
congress_test_ID

Unnamed: 0,ID
0,102
1,413
2,30
3,298
4,95
...,...
212,179
213,256
214,278
215,264


In [38]:
y_congress_train.nunique()

2

In [39]:
y_congress_train.value_counts()

class
democrat      137
republican     81
Name: count, dtype: int64

In [40]:
x_congress_summary = x_congress_train.describe()
x_congress_summary

Unnamed: 0,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-crporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
count,218,218,218,218,218,218,218,218,218,218,218,218,218,218,218,218
unique,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
top,n,n,y,n,n,y,y,y,y,n,n,n,n,y,n,y
freq,110,99,131,130,110,131,126,128,105,108,134,125,103,112,108,130


In [41]:
total_unknown_rows = x_congress_train[x_congress_train.eq('unknown').any(axis=1)].shape[0]
print(total_unknown_rows)

107


In [42]:
# shuffling the data
x_congress_train, y_congress_train = shuffle(x_congress_train, y_congress_train)
x_congress_test,congress_test_ID = shuffle(x_congress_test,congress_test_ID)

# Replacing all "unknown" values with the mode of the given column
x_congress_train.replace('unknown', pd.NA, inplace=True)
x_congress_test.replace('unknown', pd.NA, inplace=True)
# Replacing 'y' and 'n' with 1 and 0
x_congress_train.replace({'y': 1, 'n': 0}, inplace=True)
x_congress_test.replace({'y': 1, 'n': 0}, inplace=True)

# I have no clue why I need to do this replacement twice
# the original data stores NAs as "<NA>", which sklearn imputator cannot handle
# np.nan it can handle though
# when I run the second line of the following two, they are not replaced for some reason
# BUT!!! if I go the other direction first, and then replace it backwards again, it works
x_congress_train.replace({np.nan: "<NA>"}, inplace=True)
x_congress_train.replace({"<NA>": np.nan}, inplace=True)
#
x_congress_test.replace({np.nan: "<NA>"}, inplace=True)
x_congress_test.replace({"<NA>": np.nan}, inplace=True)


# applying the preprocessor
x_congress_train_processed = pd.DataFrame(preprocessor.fit_transform(x_congress_train))
x_congress_test_processed = pd.DataFrame(preprocessor.fit_transform(x_congress_test))
# slavage the original column names
x_congress_train_processed.columns = x_congress_train.columns
x_congress_test_processed.columns = x_congress_test.columns
x_congress_train_processed


Unnamed: 0,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-crporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
1,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
2,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
3,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
4,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0
214,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.5,1.0
215,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
216,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0


In [43]:
x_congress_test_processed

Unnamed: 0,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-crporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
1,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
2,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0
3,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0
4,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0
213,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0
214,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
215,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.5,0.0,1.0,0.0,1.0


In [44]:
print("before")
print("NAs in training set:", x_congress_train.isna().sum().sum())
print("NAs in training set:", x_congress_test.isna().sum().sum())
print("after")
print("NAs in training set:", x_congress_train_processed.isna().sum().sum())
print("NAs in training set:", x_congress_test_processed.isna().sum().sum())

before
NAs in training set: 206
NAs in training set: 186
after
NAs in training set: 0
NAs in training set: 0


In [45]:
y_congress_train.describe()

count          218
unique           2
top       democrat
freq           137
Name: class, dtype: object

In [46]:
x_congress_summary.loc["freq"].max

<bound method Series.max of handicapped-infants                       110
water-project-cost-sharing                 99
adoption-of-the-budget-resolution         131
physician-fee-freeze                      130
el-salvador-aid                           110
religious-groups-in-schools               131
anti-satellite-test-ban                   126
aid-to-nicaraguan-contras                 128
mx-missile                                105
immigration                               108
synfuels-crporation-cutback               134
education-spending                        125
superfund-right-to-sue                    103
crime                                     112
duty-free-exports                         108
export-administration-act-south-africa    130
Name: freq, dtype: object>

In [47]:
print(
    "Training features: ", x_congress_train_processed.shape, "\n",
    "Training labels: ", y_congress_train.shape, "\n",
    "Test features: ", x_congress_test_processed.shape, "\n",
    sep=""
)

Training features: (218, 16)
Training labels: (218,)
Test features: (217, 16)



In [48]:
with open('data/congress/preprocessed_congress_data.pickle', 'wb') as handle:
        pickle.dump(x_congress_train, handle)
        pickle.dump(x_congress_train_processed, handle)
        pickle.dump(y_congress_train, handle)
        pickle.dump(x_congress_test, handle)
        pickle.dump(x_congress_test_processed, handle)
        pickle.dump(congress_test_ID, handle)