In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification, make_blobs

#### Generate a simple dataset

In [2]:
# Build two easily separable blobs (50 samples each) described by 8 features.
# The fixed random_state keeps the generated data identical between runs.
simple_dataset = make_blobs(
    n_samples=[50, 50],
    n_features=8,
    centers=None,
    cluster_std=1.0,
    center_box=(-10.0, 10.0),
    shuffle=True,
    random_state=27,
    return_centers=False,
)


In [3]:
# Inspect the raw make_blobs result; later cells read [0] as the feature matrix and [1] as the labels
simple_dataset

(array([[-2.65311508,  6.05242053,  2.44595521,  6.74406103, -1.54022833,
          8.36122368,  7.86849814, -5.03275027],
        [ 6.24954183,  3.34479671,  7.98772044,  5.1177021 ,  3.96252559,
          6.1548466 , -7.16589078, -2.07626559],
        [-0.65069154,  6.13997597,  3.27098297,  6.87441278, -0.49929825,
         10.0085335 ,  8.48113138, -6.64880074],
        [-1.21911932,  6.05935646,  6.1049881 ,  6.43540946, -4.50606972,
          8.25307072,  5.82752198, -6.66147654],
        [ 0.8845229 ,  8.28977492,  4.83762026,  7.96596389, -4.07209046,
          9.28890093,  8.84580334, -5.31097044],
        [ 3.92450595,  1.78784006,  7.81069074,  8.04226846,  5.78317778,
          6.9429924 , -8.39388165, -5.30433917],
        [ 3.54646933,  3.20355082,  6.54444545,  5.9294005 ,  4.95390505,
          6.86519003, -6.09803449, -0.31182613],
        [-0.66929331,  6.67407702,  3.38912617,  9.84556505, -2.36827459,
          9.42615707,  7.23390512, -6.43392241],
        [ 4.5751

In [4]:
# Wrap the feature matrix in a DataFrame whose columns are named feature1 .. feature8
simple_dataset_df_features = pd.DataFrame(
    np.array(simple_dataset[0]),
    columns=[f"feature{i}" for i in range(1, 9)],
)

In [5]:
# Build the labels DataFrame separately with an explicit column name,
# so the class column is called "labels" instead of the default 0
simple_dataset_df_labels = pd.DataFrame(
    np.array(simple_dataset[1]),
    columns=["labels"],
)

In [6]:
# Preview the feature frame (columns feature1 .. feature8)
simple_dataset_df_features

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8
0,-2.653115,6.052421,2.445955,6.744061,-1.540228,8.361224,7.868498,-5.032750
1,6.249542,3.344797,7.987720,5.117702,3.962526,6.154847,-7.165891,-2.076266
2,-0.650692,6.139976,3.270983,6.874413,-0.499298,10.008534,8.481131,-6.648801
3,-1.219119,6.059356,6.104988,6.435409,-4.506070,8.253071,5.827522,-6.661477
4,0.884523,8.289775,4.837620,7.965964,-4.072090,9.288901,8.845803,-5.310970
...,...,...,...,...,...,...,...,...
95,-2.015363,7.833865,2.834384,6.231191,-3.131937,10.583076,7.969167,-6.924654
96,-1.328991,6.607484,4.445401,8.664791,-2.033190,9.963966,8.964449,-5.635555
97,4.935456,3.722993,10.201716,6.790453,5.361112,5.432218,-8.571552,-3.962914
98,1.646738,5.691306,2.368723,6.134181,-0.252837,10.182634,7.429468,-5.267134


In [7]:
# Preview the single-column "labels" frame
simple_dataset_df_labels

Unnamed: 0,labels
0,0
1,1
2,0
3,0
4,0
...,...
95,0
96,0
97,1
98,0


In [8]:
# Place features and labels side by side (column-wise) to form the full dataset
simple_dataset_full = pd.concat(
    [simple_dataset_df_features, simple_dataset_df_labels],
    axis="columns",
)
simple_dataset_full

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,labels
0,-2.653115,6.052421,2.445955,6.744061,-1.540228,8.361224,7.868498,-5.032750,0
1,6.249542,3.344797,7.987720,5.117702,3.962526,6.154847,-7.165891,-2.076266,1
2,-0.650692,6.139976,3.270983,6.874413,-0.499298,10.008534,8.481131,-6.648801,0
3,-1.219119,6.059356,6.104988,6.435409,-4.506070,8.253071,5.827522,-6.661477,0
4,0.884523,8.289775,4.837620,7.965964,-4.072090,9.288901,8.845803,-5.310970,0
...,...,...,...,...,...,...,...,...,...
95,-2.015363,7.833865,2.834384,6.231191,-3.131937,10.583076,7.969167,-6.924654,0
96,-1.328991,6.607484,4.445401,8.664791,-2.033190,9.963966,8.964449,-5.635555,0
97,4.935456,3.722993,10.201716,6.790453,5.361112,5.432218,-8.571552,-3.962914,1
98,1.646738,5.691306,2.368723,6.134181,-0.252837,10.182634,7.429468,-5.267134,0


In [9]:
# Export to csv for classification.
# Make sure the output folder exists first: to_csv does not create missing
# directories and raises OSError on a fresh checkout without data/.
Path('data').mkdir(parents=True, exist_ok=True)
simple_dataset_full.to_csv('data/simple_dataset.csv', index=False)

#### Generate a simple dataset with redundant features and randomly flipped labels (`flip_y`) to create noise

In [10]:
# Noisy variant: 4 informative + 4 redundant features, with flip_y=0.1 adding label noise.
# The fixed random_state keeps the generated data identical between runs.
simple_noisy_dataset = make_classification(
    n_samples=100,
    n_features=8,
    n_informative=4,
    n_redundant=4,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=2,
    weights=None,
    flip_y=0.1,
    class_sep=1,
    hypercube=True,
    shift=0.0,
    scale=1.0,
    shuffle=True,
    random_state=27,
)

In [11]:
# Wrap the feature matrix in a DataFrame whose columns are named feature1 .. feature8
simple_noisy_dataset_df_features = pd.DataFrame(
    np.array(simple_noisy_dataset[0]),
    columns=[f"feature{i}" for i in range(1, 9)],
)

In [12]:
# Build the labels DataFrame separately with an explicit column name,
# so the class column is called "labels" instead of the default 0
simple_noisy_dataset_df_labels = pd.DataFrame(
    np.array(simple_noisy_dataset[1]),
    columns=["labels"],
)

In [13]:
# Place features and labels side by side (column-wise) to form the full dataset
simple_noisy_dataset_full = pd.concat(
    [simple_noisy_dataset_df_features, simple_noisy_dataset_df_labels],
    axis="columns",
)
simple_noisy_dataset_full

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,labels
0,0.547218,0.117659,-0.466834,-1.214104,0.554416,-0.171233,0.076779,-0.822551,0
1,1.260256,0.869941,1.666619,-2.123211,-2.023184,-1.442386,0.556908,1.808193,0
2,-2.627469,2.225614,-1.045045,1.906242,0.446663,1.807224,-2.053173,-2.172053,1
3,-1.944252,2.032486,0.752929,1.825775,-1.656108,0.781387,-1.248109,0.492632,1
4,-0.106718,-0.179812,-0.918795,-0.986007,2.538731,0.081608,-0.999492,-1.907978,1
...,...,...,...,...,...,...,...,...,...
95,2.849848,-1.694915,-1.501459,-1.096969,-1.985549,-0.049947,3.885924,0.207834,1
96,-3.545575,1.015381,1.159851,2.450329,2.878726,0.559659,-4.222957,-0.452178,0
97,-0.465274,0.865868,1.512376,-0.479992,-0.185210,-0.751471,-1.188317,1.019099,0
98,-3.333714,2.534845,2.686473,1.622743,0.455288,-0.171383,-4.161840,1.154692,0


In [14]:
# Export csv for classification.
# Make sure the output folder exists first: to_csv does not create missing
# directories and raises OSError on a fresh checkout without data/.
Path('data').mkdir(parents=True, exist_ok=True)
simple_noisy_dataset_full.to_csv('data/simple_noisy_dataset.csv', index=False)

#### Reduce the class separation to increase the difficulty of the classification problem

In [15]:
# Harder variant: all 8 features informative, but class_sep lowered to 0.1
# (flip_y=0.01 adds only a little label noise).
# The fixed random_state keeps the generated data identical between runs.
complex_dataset = make_classification(
    n_samples=100,
    n_features=8,
    n_informative=8,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=2,
    weights=None,
    flip_y=0.01,
    class_sep=0.1,
    hypercube=True,
    shift=0.0,
    scale=1.0,
    shuffle=True,
    random_state=27,
)


In [16]:
# Wrap the feature matrix in a DataFrame whose columns are named feature1 .. feature8
complex_dataset_df_features = pd.DataFrame(
    np.array(complex_dataset[0]),
    columns=[f"feature{i}" for i in range(1, 9)],
)

In [17]:
# Build the labels DataFrame separately with an explicit column name,
# so the class column is called "labels" instead of the default 0
complex_dataset_df_labels = pd.DataFrame(
    np.array(complex_dataset[1]),
    columns=["labels"],
)

In [18]:
# Place features and labels side by side (column-wise) to form the full dataset
complex_dataset_full = pd.concat(
    [complex_dataset_df_features, complex_dataset_df_labels],
    axis="columns",
)
complex_dataset_full

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,labels
0,0.195351,2.100090,-2.420573,0.629284,-1.047139,0.771754,-1.540232,-2.490717,1
1,-2.645260,1.235467,2.170054,-0.338245,0.113611,1.022120,-0.761678,-0.101435,0
2,3.539747,0.591097,3.358954,-5.904391,0.309390,-2.308706,-1.099218,4.250243,1
3,-0.911676,-0.617754,-2.928706,1.629786,-3.080454,0.074552,0.784917,-0.217459,1
4,0.099748,0.421693,-0.167761,-0.427883,-1.182734,-0.166955,0.623783,-1.494289,1
...,...,...,...,...,...,...,...,...,...
95,-3.294570,0.748689,-4.831375,3.024696,-0.993260,2.214474,1.950820,-0.379414,1
96,0.989697,-1.799076,1.411661,0.579070,1.633876,-0.615343,1.165504,2.135408,0
97,-1.694247,-2.955080,-0.615661,-0.634212,-0.040323,-0.662981,1.563497,2.499553,0
98,-0.471845,-2.123524,-1.252569,-0.773259,-0.242619,-4.524305,-1.251529,-1.997483,0


In [19]:
# Export to csv for classification.
# Make sure the output folder exists first: to_csv does not create missing
# directories and raises OSError on a fresh checkout without data/.
Path('data').mkdir(parents=True, exist_ok=True)
complex_dataset_full.to_csv('data/complex_dataset.csv', index=False)