In [1]:
import arff
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


# Mushroom dataset

In [4]:
# Load the mushroom data file. header=None tells pandas not to treat the
# first row as column names, so the columns get integer labels.
df_mushroom = pd.read_csv('agaricus-lepiota.data', header=None)
# Preview the first rows.
df_mushroom.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [5]:
# Print, for every column, its label followed by the distinct values it contains.
for label, series in df_mushroom.items():
    print(label, series.unique())
    

0 ['p' 'e']
1 ['x' 'b' 's' 'f' 'k' 'c']
2 ['s' 'y' 'f' 'g']
3 ['n' 'y' 'w' 'g' 'e' 'p' 'b' 'u' 'c' 'r']
4 ['t' 'f']
5 ['p' 'a' 'l' 'n' 'f' 'c' 'y' 's' 'm']
6 ['f' 'a']
7 ['c' 'w']
8 ['n' 'b']
9 ['k' 'n' 'g' 'p' 'w' 'h' 'u' 'e' 'b' 'r' 'y' 'o']
10 ['e' 't']
11 ['e' 'c' 'b' 'r' '?']
12 ['s' 'f' 'k' 'y']
13 ['s' 'f' 'y' 'k']
14 ['w' 'g' 'p' 'n' 'b' 'e' 'o' 'c' 'y']
15 ['w' 'p' 'g' 'b' 'n' 'e' 'y' 'o' 'c']
16 ['p']
17 ['w' 'n' 'o' 'y']
18 ['o' 't' 'n']
19 ['p' 'e' 'l' 'f' 'n']
20 ['k' 'n' 'u' 'h' 'w' 'r' 'o' 'y' 'b']
21 ['s' 'n' 'a' 'v' 'y' 'c']
22 ['u' 'g' 'm' 'd' 'p' 'w' 'l']


In [6]:
# Total number of distinct values, added up over all columns.
# nunique(dropna=False) counts a missing value as a value, matching len(unique()).
print(sum(series.nunique(dropna=False) for _, series in df_mushroom.items()))

119


In [7]:
# Structural summary of the frame: index range, column dtypes and non-null counts.
df_mushroom.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       8124 non-null   object
 1   1       8124 non-null   object
 2   2       8124 non-null   object
 3   3       8124 non-null   object
 4   4       8124 non-null   object
 5   5       8124 non-null   object
 6   6       8124 non-null   object
 7   7       8124 non-null   object
 8   8       8124 non-null   object
 9   9       8124 non-null   object
 10  10      8124 non-null   object
 11  11      8124 non-null   object
 12  12      8124 non-null   object
 13  13      8124 non-null   object
 14  14      8124 non-null   object
 15  15      8124 non-null   object
 16  16      8124 non-null   object
 17  17      8124 non-null   object
 18  18      8124 non-null   object
 19  19      8124 non-null   object
 20  20      8124 non-null   object
 21  21      8124 non-null   object
 22  22      8124 non-null   

In [8]:
# Class balance: relative frequency of each label ('p' / 'e') in the class
# column, which is the first column (integer label 0).
df_mushroom[0].value_counts(normalize=True)

0
e    0.517971
p    0.482029
Name: proportion, dtype: float64

# Bank Dataset

In [9]:
# Parse the bank-marketing ARFF file. The parsed result is indexed like a
# dict below: 'data' holds the rows and 'attributes' the column descriptions.
# (`arff` and `pd` come from the notebook's top import cell.)
with open('bank-additional-ful-nominal.arff') as f:
    dataset = arff.load(f)

# Build the DataFrame from the parsed rows.
df_bank = pd.DataFrame(dataset['data'])

# Use the attribute names as column labels when the parser supplies them;
# the first element of each attribute entry is taken as the name.
if 'attributes' in dataset:
    df_bank.columns = [attr[0] for attr in dataset['attributes']]

df_bank.head()


Unnamed: 0,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome,y
0,housemaid,married,basic.4y,no,no,no,telephone,may,mon,nonexistent,no
1,services,married,high.school,unknown,no,no,telephone,may,mon,nonexistent,no
2,services,married,high.school,no,yes,no,telephone,may,mon,nonexistent,no
3,admin.,married,basic.6y,no,no,no,telephone,may,mon,nonexistent,no
4,services,married,high.school,no,no,yes,telephone,may,mon,nonexistent,no


In [10]:
# Structural summary of the bank frame: index range, column dtypes and non-null counts.
df_bank.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   job          41188 non-null  object
 1   marital      41188 non-null  object
 2   education    41188 non-null  object
 3   default      41188 non-null  object
 4   housing      41188 non-null  object
 5   loan         41188 non-null  object
 6   contact      41188 non-null  object
 7   month        41188 non-null  object
 8   day_of_week  41188 non-null  object
 9   poutcome     41188 non-null  object
 10  y            41188 non-null  object
dtypes: object(11)
memory usage: 3.5+ MB


In [11]:
# Descriptive statistics per column. With the object-dtype columns loaded here,
# describe() reports count, number of unique values, most frequent value (top)
# and its frequency (freq).
df_bank.describe()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome,y
count,41188,41188,41188,41188,41188,41188,41188,41188,41188,41188,41188
unique,12,4,8,3,3,3,2,10,5,3,2
top,admin.,married,university.degree,no,yes,no,cellular,may,thu,nonexistent,no
freq,10422,24928,12168,32588,21576,33950,26144,13769,8623,35563,36548


In [12]:
# For every column print its name, its distinct values, and how many there are.
# The distinct values are computed once per column and reused for the count.
for col in df_bank.columns:
    uniques = df_bank[col].unique()
    print(col, uniques, len(uniques))

job ['housemaid' 'services' 'admin.' 'blue-collar' 'technician' 'retired'
 'management' 'unemployed' 'self-employed' 'unknown' 'entrepreneur'
 'student'] 12
marital ['married' 'single' 'divorced' 'unknown'] 4
education ['basic.4y' 'high.school' 'basic.6y' 'basic.9y' 'professional.course'
 'unknown' 'university.degree' 'illiterate'] 8
default ['no' 'unknown' 'yes'] 3
housing ['no' 'yes' 'unknown'] 3
loan ['no' 'yes' 'unknown'] 3
contact ['telephone' 'cellular'] 2
month ['may' 'jun' 'jul' 'aug' 'oct' 'nov' 'dec' 'mar' 'apr' 'sep'] 10
day_of_week ['mon' 'tue' 'wed' 'thu' 'fri'] 5
poutcome ['nonexistent' 'failure' 'success'] 3
y ['no' 'yes'] 2


In [13]:
# Total number of distinct values, added up over all columns.
# nunique(dropna=False) counts a missing value as a value, matching len(unique()).
sum(series.nunique(dropna=False) for _, series in df_bank.items())

55

In [14]:
# Class balance of the target column 'y': share of 'no' vs 'yes' rows.
df_bank.loc[:, 'y'].value_counts(normalize=True)

y
no     0.887346
yes    0.112654
Name: proportion, dtype: float64

In [15]:
# Absolute number of rows per value ('no' / 'yes') of the target column 'y'.
df_bank.loc[:, 'y'].value_counts()

y
no     36548
yes     4640
Name: count, dtype: int64

# CMC

In [16]:
# Parse the CMC ARFF file. The parsed result is indexed like a dict below:
# 'data' holds the rows and 'attributes' the column descriptions.
# (`arff` and `pd` come from the notebook's top import cell.)
with open('cmc-nominal.arff') as f:
    dataset = arff.load(f)

# Build the DataFrame from the parsed rows.
df_cmc = pd.DataFrame(dataset['data'])

# Use the attribute names as column labels when the parser supplies them;
# the first element of each attribute entry is taken as the name.
if 'attributes' in dataset:
    df_cmc.columns = [attr[0] for attr in dataset['attributes']]

df_cmc.head()

Unnamed: 0,Wifes_education,Husbands_education,Wifes_religion,Wifes_now_working?,Husbands_occupation,Standard-of-living_index,Media_exposure,Contraceptive_method_used,class_numberofchildren
0,2,3,1,1,2,3,0,1,0
1,1,3,1,1,3,4,0,1,1
2,2,3,1,1,3,4,0,1,0
3,3,2,1,1,3,3,0,1,0
4,3,3,1,1,3,2,0,1,0


In [17]:
# Structural summary of the CMC frame: index range, column dtypes and non-null counts.
df_cmc.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1473 entries, 0 to 1472
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Wifes_education            1473 non-null   object
 1   Husbands_education         1473 non-null   object
 2   Wifes_religion             1473 non-null   object
 3   Wifes_now_working?         1473 non-null   object
 4   Husbands_occupation        1473 non-null   object
 5   Standard-of-living_index   1473 non-null   object
 6   Media_exposure             1473 non-null   object
 7   Contraceptive_method_used  1473 non-null   object
 8   class_numberofchildren     1473 non-null   object
dtypes: object(9)
memory usage: 103.7+ KB


In [18]:
# Descriptive statistics per column. With the object-dtype columns loaded here,
# describe() reports count, number of unique values, most frequent value (top)
# and its frequency (freq).
df_cmc.describe()

Unnamed: 0,Wifes_education,Husbands_education,Wifes_religion,Wifes_now_working?,Husbands_occupation,Standard-of-living_index,Media_exposure,Contraceptive_method_used,class_numberofchildren
count,1473,1473,1473,1473,1473,1473,1473,1473,1473
unique,4,4,2,2,4,4,2,3,2
top,4,4,1,1,3,4,0,1,0
freq,577,899,1253,1104,585,684,1364,629,1444


In [19]:
# For every column print its name, its distinct values, and how many there are.
# The distinct values are computed once per column and reused for the count.
for col in df_cmc.columns:
    uniques = df_cmc[col].unique()
    print(col, uniques, len(uniques))

Wifes_education ['2' '1' '3' '4'] 4
Husbands_education ['3' '2' '4' '1'] 4
Wifes_religion ['1' '0'] 2
Wifes_now_working? ['1' '0'] 2
Husbands_occupation ['2' '3' '1' '4'] 4
Standard-of-living_index ['3' '4' '2' '1'] 4
Media_exposure ['0' '1'] 2
Contraceptive_method_used ['1' '2' '3'] 3
class_numberofchildren ['0' '1'] 2


In [20]:
# Total number of distinct values, added up over all columns.
# nunique(dropna=False) counts a missing value as a value, matching len(unique()).
sum(series.nunique(dropna=False) for _, series in df_cmc.items())

27

In [21]:
# Class balance of the target column: share of rows per label ('0' / '1').
df_cmc.loc[:, 'class_numberofchildren'].value_counts(normalize=True)

class_numberofchildren
0    0.980312
1    0.019688
Name: proportion, dtype: float64

In [22]:
# Absolute number of rows per label ('0' / '1') of the target column.
df_cmc.loc[:, 'class_numberofchildren'].value_counts()

class_numberofchildren
0    1444
1      29
Name: count, dtype: int64

# Chess

In [24]:
# Parse the chess ARFF file. The parsed result is indexed like a dict below:
# 'data' holds the rows and 'attributes' the column descriptions.
# (`arff` and `pd` come from the notebook's top import cell.)
with open('chess_krkopt_zerovsall.arff') as f:
    dataset = arff.load(f)

# Build the DataFrame from the parsed rows.
df_chess = pd.DataFrame(dataset['data'])

# Use the attribute names as column labels when the parser supplies them;
# the first element of each attribute entry is taken as the name.
if 'attributes' in dataset:
    df_chess.columns = [attr[0] for attr in dataset['attributes']]

df_chess.head()

Unnamed: 0,White_King_file,White_King_rank,White_Rook_file,White_Rook_rank,Black_King_file,Black_King_rank,class
0,d,4,h,7,g,7,0
1,d,4,f,5,a,1,0
2,d,1,a,7,d,5,0
3,d,1,e,8,f,4,0
4,c,2,c,4,f,5,0


In [25]:
# Structural summary of the chess frame: index range, column dtypes and non-null counts.
df_chess.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28056 entries, 0 to 28055
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   White_King_file  28056 non-null  object
 1   White_King_rank  28056 non-null  object
 2   White_Rook_file  28056 non-null  object
 3   White_Rook_rank  28056 non-null  object
 4   Black_King_file  28056 non-null  object
 5   Black_King_rank  28056 non-null  object
 6   class            28056 non-null  object
dtypes: object(7)
memory usage: 1.5+ MB


In [26]:
# For every column print its name, its distinct values, and how many there are.
# The distinct values are computed once per column and reused for the count.
for col in df_chess.columns:
    uniques = df_chess[col].unique()
    print(col, uniques, len(uniques))

White_King_file ['d' 'c' 'a' 'b'] 4
White_King_rank ['4' '1' '2' '3'] 4
White_Rook_file ['h' 'f' 'a' 'e' 'c' 'd' 'b' 'g'] 8
White_Rook_rank ['7' '5' '8' '4' '6' '1' '2' '3'] 8
Black_King_file ['g' 'a' 'd' 'f' 'h' 'c' 'b' 'e'] 8
Black_King_rank ['7' '1' '5' '4' '3' '2' '8' '6'] 8
class ['0' '1'] 2


In [27]:
# Total number of distinct values, added up over all columns.
# nunique(dropna=False) counts a missing value as a value, matching len(unique()).
sum(series.nunique(dropna=False) for _, series in df_chess.items())

42

In [28]:
# Class balance of the 'class' column: share of rows per label ('0' / '1').
df_chess.loc[:, 'class'].value_counts(normalize=True)


class
0    0.999038
1    0.000962
Name: proportion, dtype: float64