In [1]:
# Display the value of every top-level expression in a cell, not only the last one.
# Cells below rely on this, e.g. `raw_data.head()` followed by a `print` statement.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Auto-Cleaning Dirty Data: the Data Encoding Bot

[2IMM00] Seminar Data Mining
<br>
Angelo Majoor - 1030843
<br>
A.R.Majoor@student.tue.nl

Supervisor: dr. ir. J. (Joaquin) Vanschoren

Eindhoven University of Technology
<br>
Department of Mathematics and Computer Science
<br>
Data Mining Research Group

In [2]:
# Import all relevant libraries
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random

from nltk.corpus import wordnet as wn

### The three different steps

As stated in the report, the process of creating the data encoding bot consists of three different steps:

 - Auto-detecting data types per feature (column)
 - Auto-detecting numeric, ordinal, categorical (integer) features
 - Auto-selecting encoding techniques for all features

## Step 0: Import the Data

In [3]:
# Load the raw data set. To analyse your own data, put its file name in the
# line below, uncomment it, and remove the example line that follows.
#raw_data = pd.read_csv("ENTER_YOUR_CSV_FILE_NAME_HERE.csv")
raw_data = pd.read_csv("kickstarter.csv") #Example

# Now simply run the entire Jupyter notebook!

# Preview the first rows of the loaded data
raw_data.head()

# Report the size of the data set: shape[0] = rows, shape[1] = columns (features)
print ("The provided data set consists of",raw_data.shape[0],"rows and",raw_data.shape[1],"columns (features.")

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09 11:36:00,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01 03:43:57,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26 00:20:50,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16 04:24:11,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29 01:00:00,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0


The provided data set consists of 378661 rows and 13 columns (features.


## Step 1: Auto-Detecting Data Types

In [4]:
def autoInferObject(raw_data_feature):
    """Infer the data type of a feature that pandas loaded with the 'object' dtype.

    Data types that can be inferred: bool, date, float64, int64, string.

    A feature with exactly two unique values is inferred as "bool". Otherwise up
    to 100 randomly chosen values are inspected and every value votes for one
    data type; the type with the most votes wins (ties are resolved in the order
    date, float64, int64, string).

    Voting rules per sampled value:
      - a sized value (e.g. a string) of at most 10 characters with '-' or '/'
        at positions 2 and 5 (e.g. 09-10-2015) or at positions 4 and 7
        (e.g. 2015-10-09) votes "date";
      - any other sized value votes "string";
      - an unsized value that int() accepts votes "float64" when its string
        form contains a '.', and "int64" otherwise;
      - every remaining value (NaN, None, infinity, ...) votes "string".

    Values are read by position, so the index of the feature does not matter.
    An empty feature is classified as "string".

    Input: Pandas Series consisting of one single feature (so n*1 in size)
    Output: Data type of the feature (in string format)
    """
    types = ["date","float64","int64","string"]
    weights = [0,0,0,0] #Votes corresponding to the data types above
    separators = ('-','/')

    featureLength = len(raw_data_feature)

    numberOfIndices = 100 #Number of different values to check in a feature
    randomIndices = random.sample(range(0,featureLength), min(numberOfIndices,featureLength))

    # Without any value there is nothing to vote on; fall back to the catch-all type
    if featureLength == 0:
        return "string"

    # If the feature only contains two different unique values, then infer it as boolean
    if len(pd.unique(raw_data_feature)) == 2:
        return "bool"

    # Positional access: plain `feature[i]` is label-based on a Series and fails
    # (or reads the wrong row) as soon as the index is not 0..n-1
    values = raw_data_feature.iloc if isinstance(raw_data_feature, pd.Series) else raw_data_feature

    for i in randomIndices:
        value = values[i]
        try:
            looksLikeDate = (len(value) <= 10
                and ((value[2:3] in separators and value[5:6] in separators)
                or (value[4:5] in separators and value[7:8] in separators)))
        except (TypeError,ValueError,IndexError):
            # The value has no length, so it is not a string: test whether it is numeric
            try:
                int(value)
            except (TypeError,ValueError,IndexError,OverflowError):
                weights[3] += 1 #String (NaN, None and infinity end up here)
            else:
                if ('.' in str(value)):
                    weights[1] += 1 #Float
                else:
                    weights[2] += 1 #Integer
        else:
            if looksLikeDate:
                weights[0] += 1 #Date
            else:
                weights[3] += 1 #String

    #print ("Date: {}, Float64: {}, Int64: {}, String: {}".format(weights[0],weights[1],weights[2],weights[3])) #For debugging purposes
    return types[weights.index(max(weights))]

In [5]:
def autoDetectDataTypes(raw_data):
    """Infer a data type label for every single feature in a raw data set.

    Features are visited by position, so duplicate column names are supported.
    The label of a feature is derived from its pandas dtype:
      - object / string dtype: decided by autoInferObject;
      - boolean dtype: "bool";
      - any integer dtype: "bool" when there are exactly two unique values,
        "int64" otherwise;
      - any datetime dtype: "date";
      - every remaining dtype is treated as "float64".

    Input: Pandas Dataframe created directly from the raw data with the pd.read_csv function
    Output: List of data types, one data type for each feature
    """
    dtypeChecks = pd.api.types
    result = []

    for position in range(raw_data.shape[1]):
        feature = raw_data.iloc[:,position]
        dtype = feature.dtype

        if (dtypeChecks.is_object_dtype(dtype) or dtypeChecks.is_string_dtype(dtype)):
            #print ("Trying to automatically infer the data type of feature",position,"...") #For debugging purposes
            result.append(autoInferObject(feature))
        elif (dtypeChecks.is_bool_dtype(dtype)):
            # pd.read_csv parses True/False columns straight into the bool dtype
            result.append("bool")
        elif (dtypeChecks.is_integer_dtype(dtype)):
            if (len(pd.unique(feature)) == 2):
                result.append("bool")
            else:
                result.append("int64")
        elif (dtypeChecks.is_datetime64_any_dtype(dtype)):
            result.append("date")
        else:
            # Remaining dtypes are handled as 'float64', which needs no special treatment
            result.append("float64")

    return result

In [6]:
# Run step 1 on the loaded data set; `predicted` holds one data type label per feature
predicted = autoDetectDataTypes(raw_data)
print ("\nPredicted data types:\n",predicted)


Predicted data types:
 ['int64', 'string', 'string', 'string', 'string', 'string', 'float64', 'string', 'float64', 'string', 'int64', 'string', 'float64']


### Testing the accuracy of the implemented solution

In [7]:
# Manually check for the ground truth, since a dirty data set has no ground truth included.
# Provide one label per feature, in column order: `score` compares this list with
# `predicted` position by position.

#ground_truth = ["ENTER_GROUND_TRUTH_DATA_TYPES_FOR_THE_PROVIDED_DATA_SET"]
ground_truth = ['int64', 'string', 'string', 'string', 'string', 'string', 'float64', \
                'string', 'float64', 'string', 'int64', 'string', 'float64'] #Example

print ("Provided ground truth:\n",ground_truth)

Provided ground truth:
 ['int64', 'string', 'string', 'string', 'string', 'string', 'float64', 'string', 'float64', 'string', 'int64', 'string', 'float64']


In [8]:
def score(ground_truth, predicted):
    """Calculate an accuracy score for the implemented solution.

    Both arrays are compared position by position, over the length of the
    ground truth.

    Input: Array containing the (self-made) ground truth
           Array containing the predicted data types
    Output: Accuracy score: the number of correct predictions divided by the
            number of entries in the ground truth
    """
    total = len(ground_truth)
    correctPredictions = sum(1 for position in range(total)
                             if ground_truth[position] == predicted[position])
    return correctPredictions / total

In [9]:
# Accuracy of step 1 as a percentage, together with the number of features compared
print ("Accuracy:",score(ground_truth, predicted)*100,"%\nNumber of features checked:",len(predicted))

Accuracy: 100.0 %
Number of features checked: 13


### Results

In [10]:
# Hard-coded step 1 results per data set: (name, accuracy in %, features checked, source).
# These values are literals; this cell does not recompute them.
scoreResults = [("Solar Panel Energy Production Eindhoven",100.00,7,"https://canvas.tue.nl/files/508283"), \
                ("Weather Measurements Eindhoven Airport",100.00,9,"https://canvas.tue.nl/files/508283"), \
                ("TED Talks",100.00,17,"https://www.kaggle.com/rounakbanik/ted-talks"), \
                ("Wine Reviews",100.00,11,"https://www.kaggle.com/zynicide/wine-reviews"), \
                ("Fake News",100.00,20,"https://www.kaggle.com/mrisdal/fake-news"), \
                ("Electronic Music",100.00,14,"https://www.kaggle.com/marcschroeder/17-years-of-resident-advisor-reviews"), \
                ("Crypto Currency",100.00,13,"https://www.kaggle.com/jessevent/all-crypto-currencies"), \
                ("Google Job Skills",100.00,7,"https://www.kaggle.com/niyamatalmass/google-job-skills"), \
                ("Spotify Song Ranking",100.00,7,"https://www.kaggle.com/edumucelli/spotifys-worldwide-daily-song-ranking"), \
                ("Kickstarter Projects",100.00,13,"https://www.kaggle.com/kemical/kickstarter-projects")]
# Render the results as a table
pd.DataFrame(scoreResults, columns=["Data Set", "Accuracy", "Features Checked", "Retrieved From"])

print ("Results for step 1, data type detection.")

Unnamed: 0,Data Set,Accuracy,Features Checked,Retrieved From
0,Solar Panel Energy Production Eindhoven,100.0,7,https://canvas.tue.nl/files/508283
1,Weather Measurements Eindhoven Airport,100.0,9,https://canvas.tue.nl/files/508283
2,TED Talks,100.0,17,https://www.kaggle.com/rounakbanik/ted-talks
3,Wine Reviews,100.0,11,https://www.kaggle.com/zynicide/wine-reviews
4,Fake News,100.0,20,https://www.kaggle.com/mrisdal/fake-news
5,Electronic Music,100.0,14,https://www.kaggle.com/marcschroeder/17-years-...
6,Crypto Currency,100.0,13,https://www.kaggle.com/jessevent/all-crypto-cu...
7,Google Job Skills,100.0,7,https://www.kaggle.com/niyamatalmass/google-jo...
8,Spotify Song Ranking,100.0,7,https://www.kaggle.com/edumucelli/spotifys-wor...
9,Kickstarter Projects,100.0,13,https://www.kaggle.com/kemical/kickstarter-pro...


Results for step 1, data type detection.


### Flaws

 - Dates that are represented as integers cannot be detected, since it is unknown whether a value represents a date, or an actual number.
 - Dates in which a day or month below 10 is written as a single digit (e.g. `1-2-2015`) are not detected; such values have to be zero-padded to two digits (`01-02-2015`) before they can be recognised as dates.

## Step 2: Numeric, Categorical or Ordinal Data?

In [11]:
def autoCheckCategoricalString(raw_data_feature, k=100, maxAlwaysCategorical=25, similarityThreshold=0.50):
    """Check if a feature contains categorical data, specifically concerning strings.

    If a feature contains at most maxAlwaysCategorical unique elements, it is
    always considered categorical. If it contains more than that but at most k
    unique values, the WordNet Wu-Palmer similarity is calculated between the
    first synset of every pair of unique values. The feature is categorical if
    the mean of these similarity scores is higher than similarityThreshold
    (since the values then at least have some relationship).

    Values without a WordNet synset (unknown words, missing values, non-string
    values) do not contribute a score. When no score can be calculated at all,
    the feature is not considered categorical.

    Input: Pandas Dataframe consisting of one single feature (so n*1 in size)
           A user-determined value k (the critical value: more than k unique values cannot be categorical)
           [Optional] Number of unique values up to which a feature is always categorical (Default is 25)
           [Optional] Critical mean similarity value (Default is 0.50)
    Output: A boolean stating whether the supplied feature is categorical or not
    """
    allWords = pd.unique(raw_data_feature) #All unique strings in the feature
    numberOfWords = len(allWords)

    if (numberOfWords <= maxAlwaysCategorical):
        #print ("Few unique strings (in this case",numberOfWords,")") #For debugging purposes
        return True

    if (numberOfWords > k): #Too many unique strings to be categorical
        return False

    # Look every unique value up exactly once instead of once per pair.
    # None marks a value that cannot take part in the comparison.
    firstSynsets = []
    for word in allWords:
        synsets = wn.synsets(word) if isinstance(word, str) else []
        firstSynsets.append(synsets[0] if synsets else None)

    similarityScores = []
    for i in range(0,numberOfWords-1):
        if (firstSynsets[i] is None):
            continue
        for j in range(i+1,numberOfWords):
            if (firstSynsets[j] is None):
                continue
            similarity = wn.wup_similarity(firstSynsets[i], firstSynsets[j])
            #print ("Similarity between",firstSynsets[i],"and",firstSynsets[j],":",similarity) #For debugging purposes
            if (not pd.isnull(similarity)):
                similarityScores.append(similarity)

    #print ("Similarity Scores:\n",similarityScores) #For debugging purposes

    # The mean of an empty list is NaN (and triggers a warning), so decide explicitly
    if (not similarityScores):
        return False

    return bool(np.mean(similarityScores) > similarityThreshold)

In [12]:
def autoCheckCategoricalInt(raw_data_feature, k=100):
    """Check if a feature contains categorical data, specifically concerning integers.

    If a feature contains at most 10 unique elements, it is always considered
    categorical. If it contains between 11 and k unique values, the absolute
    distance between every pair of unique values is calculated. The feature is
    categorical if the mean of these distances is smaller than the mean of the
    unique values themselves (the values then lie relatively close together).

    Input: Pandas Dataframe consisting of one single feature (so n*1 in size)
           A user-determined value k (the critical value: more than k unique values cannot be categorical)
    Output: A boolean stating whether the supplied feature is categorical or not
    """
    uniqueValues = pd.unique(raw_data_feature) #All unique integers in the feature
    count = len(uniqueValues)

    # Few unique integers: always categorical
    if count <= 10:
        return True

    # More than k unique integers: never categorical
    if count > k:
        return False

    # Absolute distance for every unordered pair of unique integers
    pairwiseDistances = [abs(uniqueValues[first] - uniqueValues[second])
                         for first in range(0, count - 1)
                         for second in range(first + 1, count)]

    if np.mean(pairwiseDistances) < np.mean(uniqueValues):
        return True
    return False

In [13]:
def autoCheckCategoricals(raw_data,predicted,k=100):
    """Decide for every single feature in a raw data set whether it is categorical.

    Features labelled 'int64' are checked with autoCheckCategoricalInt and
    relabelled 'cat_int64' when categorical; features labelled 'string' are
    checked with autoCheckCategoricalString and relabelled 'cat_string'.
    All other labels are passed through unchanged.

    The supplied list is not modified; a new list is returned, so the result of
    step 1 stays available to the caller.

    Input: Pandas Dataframe created directly from the raw data with the pd.read_csv function
           Array of data types, which is the output of step 1: autoDetectDataTypes
           [Optional] Integer specifying the critical value for which features will be checked
              on categoricals. That is, if the unique number of elements in a feature is higher
              than the critical value, a feature cannot be categorical. (Default value is 100)
    Output: List of data types, one data type for each feature
    """
    #print ("Checking if any of the features has categorical data...") #For debugging purposes

    refined = list(predicted) #Work on a copy so the caller's list is left untouched

    for position, dataType in enumerate(refined):
        if (dataType == 'int64'):
            if (autoCheckCategoricalInt(raw_data.iloc[:,position],k)):
                refined[position] = 'cat_int64'
        elif (dataType == 'string'):
            if (autoCheckCategoricalString(raw_data.iloc[:,position],k)):
                refined[position] = 'cat_string'

    return refined

In [14]:
# Run step 2 with k = 100; `predicted` is rebound to the refined labels, so from
# here on it no longer holds the plain step 1 result
predicted = autoCheckCategoricals(raw_data,predicted,100)
print ("\nPredicted data types:\n",predicted)


Predicted data types:
 ['int64', 'string', 'string', 'cat_string', 'cat_string', 'string', 'float64', 'string', 'float64', 'cat_string', 'int64', 'cat_string', 'float64']


### Testing the accuracy of the implemented solution

In [15]:
# Manually check for the ground truth, since a dirty data set has no ground truth included.
# Same layout as for step 1 (one label per feature, in column order), but categorical
# features now use the 'cat_string' / 'cat_int64' labels.

#ground_truth = ["ENTER_GROUND_TRUTH_DATA_TYPES_FOR_THE_PROVIDED_DATA_SET"]
ground_truth = ['int64', 'string', 'cat_string', 'cat_string', 'cat_string', 'string', 'float64', \
                'string', 'float64', 'cat_string', 'int64', 'cat_string', 'float64'] #Example

print ("Provided ground truth:\n",ground_truth)

Provided ground truth:
 ['int64', 'string', 'cat_string', 'cat_string', 'cat_string', 'string', 'float64', 'string', 'float64', 'cat_string', 'int64', 'cat_string', 'float64']


In [16]:
# Optional: uncomment the six lines below to leave 'bool', 'date' and 'float64'
# features out of the score; step 2 only ever relabels 'int64' and 'string' features.
# NOTE(review): both lists are filtered independently, so they only stay aligned
# when they carry these three labels at exactly the same positions.
#ground_truth[:] = [x for x in ground_truth if x != 'bool']
#ground_truth[:] = [x for x in ground_truth if x != 'date']
#ground_truth[:] = [x for x in ground_truth if x != 'float64']

#predicted[:] = [x for x in predicted if x != 'bool']
#predicted[:] = [x for x in predicted if x != 'date']
#predicted[:] = [x for x in predicted if x != 'float64']

# Accuracy of step 2 as a percentage, together with the number of features compared
print ("Accuracy:",score(ground_truth, predicted)*100,"%\nNumber of features checked:",len(predicted))

Accuracy: 92.3076923076923 %
Number of features checked: 13


### Results

In [17]:
# Hard-coded step 2 results per data set: (name, accuracy in %, features checked, source).
# These values are literals; this cell does not recompute them.
# NOTE(review): the "Features Checked" counts are lower than in the step 1 table,
# presumably because bool/date/float64 features were filtered out before scoring - confirm.
scoreResults = [("Solar Panel Energy Production Eindhoven",100.00,3,"https://canvas.tue.nl/files/508283"), \
                ("Weather Measurements Eindhoven Airport",100.00,2,"https://canvas.tue.nl/files/508283"), \
                ("TED Talks",88.24,17,"https://www.kaggle.com/rounakbanik/ted-talks"), \
                ("Wine Reviews",70.00,10,"https://www.kaggle.com/zynicide/wine-reviews"), \
                ("Fake News",94.44,18,"https://www.kaggle.com/mrisdal/fake-news"), \
                ("Electronic Music",90.91,11,"https://www.kaggle.com/marcschroeder/17-years-of-resident-advisor-reviews"), \
                ("Crypto Currency",100.00,6,"https://www.kaggle.com/jessevent/all-crypto-currencies"), \
                ("Job Skills",100.00,6,"https://www.kaggle.com/niyamatalmass/google-job-skills"), \
                ("Spotify Song Ranking",83.33,6,"https://www.kaggle.com/edumucelli/spotifys-worldwide-daily-song-ranking"), \
                ("Kickstarter Projects",90.00,10,"https://www.kaggle.com/kemical/kickstarter-projects")]
# Render the results as a table
pd.DataFrame(scoreResults, columns=["Data Set", "Accuracy", "Features Checked", "Retrieved From"])

print ("Results for step 2, categorical data checking, based on a user-specified k value of 100.")

Unnamed: 0,Data Set,Accuracy,Features Checked,Retrieved From
0,Solar Panel Energy Production Eindhoven,100.0,3,https://canvas.tue.nl/files/508283
1,Weather Measurements Eindhoven Airport,100.0,2,https://canvas.tue.nl/files/508283
2,TED Talks,88.24,17,https://www.kaggle.com/rounakbanik/ted-talks
3,Wine Reviews,70.0,10,https://www.kaggle.com/zynicide/wine-reviews
4,Fake News,94.44,18,https://www.kaggle.com/mrisdal/fake-news
5,Electronic Music,90.91,11,https://www.kaggle.com/marcschroeder/17-years-...
6,Crypto Currency,100.0,6,https://www.kaggle.com/jessevent/all-crypto-cu...
7,Job Skills,100.0,6,https://www.kaggle.com/niyamatalmass/google-jo...
8,Spotify Song Ranking,83.33,6,https://www.kaggle.com/edumucelli/spotifys-wor...
9,Kickstarter Projects,90.0,10,https://www.kaggle.com/kemical/kickstarter-pro...


Results for step 2, categorical data checking, based on a user-specified k value of 100.


### Flaws

 - Determining the ground_truth of a data set can already be hard for human beings in a lot of cases, so testing for an accuracy score becomes quite subjective (since the ground_truth is my personal idea of what the ground_truth should be).
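 - Because we are dealing with raw data, the approach used for strings fails when a feature contains words or strings that are not part of common language (for example abbreviations or codes): such values are unknown to WordNet, so no similarity score can be calculated for them.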
 - Determining the critical values for which strings and integers should always be categorical is again very subjective and as such, no guarantees for these values can be given (thus, needs further research).
 - Finding an order in raw data is even harder for human beings, and unfortunately, ordinal data is not being checked for in the used approach.

## Step 3: Auto-Selecting Encoding Techniques

In [18]:
def autoEncodeFeature(raw_data_feature, numberOfOccurrences=np.inf, other=False):
    """Automatically One Hot Encode a single feature from a raw data set.

    Input: Pandas Dataframe consisting of one single feature (so n*1 in size)
           [Optional] A user-determined value numberOfOccurrences, which states the
              minimum number of occurrences for a value in the provided feature. For
              example, when numberOfOccurrences equals 3, all unique values that
              occur less than three times will not be One Hot Encoded and are removed
              from the data set. (Default is infinite, which disables the filter,
              so all unique values are OHE.)
           [Optional] A boolean stating whether the removed values should be represented
              in a One Hot Encoded feature 'Other'. (Default is False)
    Output: Pandas Dataframe representing the One Hot Encoded feature. It keeps the
            index of the input; the 'Other' feature, if requested, is the last column
            and holds 1 for every row whose value was removed and 0 otherwise.
    """
    oheFeature = pd.get_dummies(raw_data_feature)

    # The default disables the filter: every unique value keeps its own column
    if (numberOfOccurrences == np.inf):
        return oheFeature

    # Summing a dummy column counts its occurrences, for 0/1 as well as for
    # True/False dummies
    occurrences = oheFeature.sum(axis=0)
    tooRare = (occurrences < numberOfOccurrences).to_numpy()

    encoded = oheFeature.loc[:, ~tooRare]

    if (other):
        # A row belongs to 'Other' when any of its removed dummy columns is set.
        # Assigning plain values keeps the rows in the order (and index) of the input.
        inRemovedValue = oheFeature.loc[:, tooRare].any(axis=1)
        encoded = encoded.assign(Other=inRemovedValue.astype(int).to_numpy())

    #print ("Number of features after One Hot Encoding:",len(encoded.columns)) #For debugging purposes

    return encoded

In [19]:
def autoEncodeFeatures(raw_data, predicted, numberOfOccurrences=np.inf, other=False):
    """Automatically encode all categorical features in a raw data set.

    Every feature labelled 'cat_string' or 'cat_int64' is replaced by its One
    Hot Encoding, which is appended after the remaining features. All other
    features keep their original values. The supplied data set is not modified.

    Input: Pandas Dataframe created directly from the raw data with the pd.read_csv function
           Array of data types, which is the output of step 2: autoCheckCategoricals
           [Optional] A user-determined value numberOfOccurrences, which states the
              minimum number of occurrences for a value in the provided feature. For
              example, when numberOfOccurrences equals 3, all unique values that
              occur less than three times will not be One Hot Encoded and are removed
              from the data set. (Default is infinite, so all unique values are OHE.)
           [Optional] A boolean stating whether the removed values should be represented
              in a One Hot Encoded feature 'other'. (Default is False)
    Output: Pandas Dataframe with all categorical features One Hot Encoded
    """
    categoricalPositions = [position for position, dataType in enumerate(predicted)
                            if dataType == 'cat_string' or dataType == 'cat_int64']
    categoricalSet = set(categoricalPositions)

    # Select the remaining features by position: dropping by label would also
    # remove any encoded column that happens to share its name with a dropped feature
    keptPositions = [position for position in range(raw_data.shape[1])
                     if position not in categoricalSet]

    encodedFeatures = [autoEncodeFeature(raw_data.iloc[:,position],numberOfOccurrences,other)
                       for position in categoricalPositions]

    # One single concatenation instead of copying the whole data set per feature
    return pd.concat([raw_data.iloc[:,keptPositions]] + encodedFeatures, axis=1)

In [20]:
def autoEncodeFeatures2(raw_data, predicted, numberOfOccurrences=np.inf, other=False):
    """Automatically encode all features in a raw data set.

    Similar to 'autoEncodeFeatures', with the only difference that this version
    also factorizes the features labelled 'string', 'bool' or 'date' (every
    unique value is replaced by an integer code), whilst the previous version
    keeps the original values within these features.

    Features labelled 'cat_string' or 'cat_int64' are replaced by their One Hot
    Encoding, which is appended after the remaining features. The supplied data
    set is not modified, so the function can be called repeatedly on the same data.

    Input: Pandas Dataframe created directly from the raw data with the pd.read_csv function
           Array of data types, which is the output of step 2: autoCheckCategoricals
           [Optional] Minimum number of occurrences for a value to be One Hot Encoded
              (Default is infinite, so all unique values are OHE.)
           [Optional] A boolean stating whether the removed values should be represented
              in a One Hot Encoded feature 'other'. (Default is False)
    Output: Pandas Dataframe with all categorical features One Hot Encoded and all
            non-categorical string, boolean and date features factorized
    """
    categoricalPositions = [position for position, dataType in enumerate(predicted)
                            if dataType == 'cat_string' or dataType == 'cat_int64']
    categoricalSet = set(categoricalPositions)
    factorizedTypes = ('string', 'bool', 'date')

    encodedFeatures = [autoEncodeFeature(raw_data.iloc[:,position],numberOfOccurrences,other)
                       for position in categoricalPositions]

    # Build the remaining features as new columns. Writing the codes back with
    # raw_data.iloc[:,i] = ... would overwrite the caller's data.
    keptFeatures = []
    for position in range(raw_data.shape[1]):
        if (position in categoricalSet):
            continue
        feature = raw_data.iloc[:,position]
        if (position < len(predicted) and predicted[position] in factorizedTypes):
            codes = pd.factorize(feature)[0]
            feature = pd.Series(codes, index=feature.index, name=feature.name)
        keptFeatures.append(feature)

    pieces = keptFeatures + encodedFeatures
    if (not pieces): #A data set without any feature: nothing to encode
        return raw_data.copy()

    return pd.concat(pieces, axis=1)

### Flaws

 - Determining an optimal encoding for a raw data set can be very hard, since the data encoding bot has no knowledge of the actual semantic meaning of the features; this knowledge normally comes from the person who performs the data cleaning. As a result, the bot has to rely on general rules that might give undesired results. (For example, the 'date' feature of the 'Energy' data set is factorized on the assumption that this date will be used as an input to a Machine Learning algorithm, whilst the feature might simply serve as an identifier within the data set.)

## Using the Auto-Encoding Bot

In [21]:
def theAutoEncodingBot(raw_data, categoricalUniques=100, numberOfOccurrences=np.inf, other=False, version2=True):
    """Create an encoding for an input data set by running all three steps in order.

    Step 1 (autoDetectDataTypes) and step 2 (autoCheckCategoricals) produce the
    data type labels; step 3 encodes the data set according to these labels.

    Input: Pandas Dataframe created directly from the raw data with the pd.read_csv function
           [Optional] Integer specifying the critical value for which features will be checked
              on categoricals. That is, if the unique number of elements in a feature is higher
              than the critical value, a feature cannot be categorical. (Default value is 100)
           [Optional] A user-determined value numberOfOccurrences, which states the
              minimum number of occurrences for a value in the provided feature to be
              One Hot Encoded. For example, when numberOfOccurrences equals 3, all
              unique values that occur less than three times will not be One Hot
              Encoded and are removed from the data set. (Default is infinite)
           [Optional] A boolean stating whether the removed values should be represented
              in a One Hot Encoded feature 'other'. (Default is False)
           [Optional] A boolean stating whether to use version1 or version2 for the encoding
              step. Version2 (autoEncodeFeatures2) factorizes all non-categorical string,
              boolean and date columns, while version1 (autoEncodeFeatures) leaves these
              features as they are. (Default is True, so version2)
    Output: Pandas Dataframe with all categorical features One Hot Encoded
    """
    # Steps 1 and 2: one data type label per feature
    dataTypes = autoCheckCategoricals(raw_data, autoDetectDataTypes(raw_data), categoricalUniques)

    # Step 3: version1 is only selected when version2 compares equal to False
    encoder = autoEncodeFeatures if version2 == False else autoEncodeFeatures2

    return encoder(raw_data, dataTypes, numberOfOccurrences, other)

In [22]:
# Run the complete bot with its default settings. The returned encoding is only
# displayed here (as the cell's last expression); it is not stored in a variable.
print ("Encoding for the provided data set:")
theAutoEncodingBot(raw_data)

Encoding for the provided data set:


Unnamed: 0,ID,name,category,deadline,goal,launched,pledged,backers,usd pledged,Art,...,JP,LU,MX,"N,0""",NL,NO,NZ,SE,SG,US
0,1000002330,0,0,0,1000.0,0,0.00,0,0.000000,0,...,0,0,0,0,0,0,0,0,0,0
1,1000003930,1,1,1,30000.0,1,2421.00,15,100.000000,0,...,0,0,0,0,0,0,0,0,0,1
2,1000004038,2,1,2,45000.0,2,220.00,3,220.000000,0,...,0,0,0,0,0,0,0,0,0,1
3,1000007540,3,2,3,5000.0,3,1.00,1,1.000000,0,...,0,0,0,0,0,0,0,0,0,1
4,1000011046,4,3,4,19500.0,4,1283.00,14,1283.000000,0,...,0,0,0,0,0,0,0,0,0,1
5,1000014025,5,4,5,50000.0,5,52375.00,224,52375.000000,0,...,0,0,0,0,0,0,0,0,0,1
6,1000023410,6,5,6,1000.0,6,1205.00,16,1205.000000,0,...,0,0,0,0,0,0,0,0,0,1
7,1000030581,7,6,7,25000.0,7,453.00,40,453.000000,0,...,0,0,0,0,0,0,0,0,0,1
8,1000034518,8,7,8,125000.0,8,8233.00,58,8233.000000,0,...,0,0,0,0,0,0,0,0,0,1
9,100004195,9,8,9,65000.0,9,6240.57,43,6240.570000,0,...,0,0,0,0,0,0,0,0,0,1
