In [40]:
spark

In [None]:
# %load ../src/genesis/dataset.py
#!/usr/bin/env python

import os
import pandas as pd
import numpy as np
from names import namesFileParser

class dataset(object):
    """Representation of a dataset

    Each row contains a feature vector.  Continuous and discrete features are
    handled within the vector.  The class labels may also be discrete or 
    continuous, representative of classification or regression, respectively.

    The dataset is read in by processing a CSV file and a .names file
    describing the CSV file contains.  Alternatively, manual specification of
    the .names file contents through arguments to __init__ may be provided.

    Feature vectors containing missing data is also handled.
    """

    def __init__(self, dataFilename, namesFilename=None, attributeNames=None, classNames=None):

        self.hasClassLabels = False  # Not all datasets must have class labels (test set)
        self.attributeNames = attributeNames  # List of attribute names
        self.classNames = classNames  # List of class names
        self.classes = None  # Dataframe of class labels
        self.data = None  # Dataframe of dataset

        # Determine if this is a regression problem
        self.regression = namesFileParser.isRegression(self.classNames)

        self.readData(dataFilename, namesFilename)

    def readData(self, dataFilename, namesFilename):
        """Loads a dataset into memory.

        Loads a dataset consisting of a CSV and names file.  The names file
        describes the CSV, and its specification is found in another module.

        A dataset is suitable for either classification or regression tasks.
        """

        # If the attribute and class names are not present, then they must be
        # read from the names file
        if self.attributeNames is None or self.classNames is None:
            names = namesFileParser(namesFilename)
            self.attributeNames = names.attributes
            self.classNames = names.classes
            self.regression = names.regression

        # load data
        data = self.readCSV(dataFilename)

        # Set the category values for discrete attributes according to the names
        for att in self.attributeNames:

            if att['values'] == 'continuous' or att['values'] == 'ignore':
                continue

            attributeName = att['name']

            data[attributeName].cat.set_categories(att['values'], inplace=True)

        # Categorical data has had its missing values filled in with NaNs.
        # These should be put back to question marks
        for col in data.select_dtypes(include=['category']).columns:
            data[col].fillna(value='?', inplace=True)

        # Record the classes and attributes
        if self.hasClassLabels:
            self.classes = data['target']
            data.drop('target', axis=1, inplace=True)

        # We are now done with the dataset
        self.data = data

    def readCSV(self, dataFilename):
        """Perform the actual loading of the CSV into dataframe.

        Loads data from disk into a pandas dataframe.  The dataframe must
        be supplied with several options in order to successfully read the
        CSV.  These options are described by other functions.

        There are two modes of operation depending on whether or not class
        labels are included with the data.  For training, there must be a
        target variable.  Testing however does not require this.  If data
        fails to be read with a class label, it will retry reading without
        the class label.

        Args:
            dataFilename (str): The CSV file to be read.

        Returns:
            The raw pandas dataframe
        """

        # try to load the data with the class labels
        try:
            data = pd.read_csv(
                self.findFirstFile(dataFilename, ['gz', 'bz2', 'xz', None]),
                header=None,
                names=[x['name'] for x in self.attributeNames] + ['target'],
                dtype=self.getColumnTypes(),
                na_values='?',
                usecols=self.getUsableColumns()
            )

            self.hasClassLabels = True

            return data

        except pd.io.common.CParserError:
            pass

        # try to load the data without the class labels
        data = pd.read_csv(
            self.findFirstFile(dataFilename, ['gz', 'bz2', 'xz', None]),
            header=None,
            names=[x['name'] for x in self.attributeNames],
            dtype=self.getColumnTypes(readClass=False),
            na_values='?',
            usecols=self.getUsableColumns(readClass=False)
        )
        self.hasClassLabels = False
        return data

    def getUsableColumns(self, readClass=True):
        """Returns the column names to read into dataframe.

        If a column is marked ignore in the names file, it can be excluded
        from ever being read into the dataframe.  This function returns
        the names of all columns not marked ignore.

        As test datasets do not strictly require class labels, the decision
        to read the class variable is controlled by the readClass argument.

        Args:
            readClass (bool): True for reading class label, false otherwise.

        Returns:
            A list of attribute names
        """

        # Do not read columns marked ignore
        useCols = list()
        for attribute in self.attributeNames:
            if attribute['values'] != 'ignore':
                useCols.append(attribute['name'])

        # Class labels are not strictly necessary
        if readClass:
            useCols.append('target')

        return useCols

    def getColumnTypes(self, readClass=True):
        """Returns a mapping of attribute name to attribute type.

        Pandas can assign the data type of features upon reading them in.
        This function is used to calculate and assign the feature type based
        upon the information in the names file.

        As test datasets do not strictly require class labels, the decision
        to read the class variable is controlled by the readClass argument.

        Args:
            readClass (bool): True for reading class label, false otherwise.

        Returns:
            A dict indexed by attribute name and containing the pandas column
            type category.
        """
        # Read the attribute type and assign the column type based on
        # discrete or continuous type.  Do not assign anything for columns
        # marked ignore
        dtypeSpecifier = dict()
        for attribute in self.attributeNames:
            attributeName = attribute['name']
            if attribute['values'] == 'continuous':
                dtypeSpecifier[attributeName] = np.float64
            elif attribute['values'] != 'ignore':
                dtypeSpecifier[attributeName] = 'category'

        # Read the class label and assign the column type based on regression
        # or classification
        if readClass:
            if self.regression:
                dtypeSpecifier['target'] = np.float64
            else:
                dtypeSpecifier['target'] = str

        return dtypeSpecifier

    @staticmethod
    def findFirstFile(filestem, extensions):
        """Finds the first file of the name filestem.[extensions]

        Searches disk for a file beginning with the prefix `filestem`, followed
        by a period, and succeeded by one of the extensions in the argument
        extensions.  The extensions are searched in the order they are 
        presented

        Args:
            extensions (list(str)): A list of file extensions to search through

        Returns:
            The name of the file
        """
        for ext in extensions:
            if ext is not None:
                if os.path.exists(filestem + '.' + ext):
                    return filestem + '.' + ext
            else:
                if os.path.exists(filestem):
                    return filestem
        return filestem


In [21]:
    trainFilename = './krk/krk.data'
namesFilename = './krk/krk.names'

In [99]:
trainingSet = dataset(trainFilename, namesFilename)

In [79]:
trainingSet.data.columns

['_c0', '_c1', '_c2', '_c3', '_c4', '_c5', '_c6']

In [49]:
df = reduce(lambda trainingSet.data, idx: trainingSet.data.withColumnRenamed(oldColumns[idx], newColumns[idx]), xrange(len(oldColumns)), trainingSet.data)
df.printSchema()
df.show()

SyntaxError: invalid syntax (<ipython-input-49-b64e61078a89>, line 1)

In [135]:
data = trainingSet.data.toDF(*(temp_dict[c] for c in trainingSet.data.columns))

In [137]:
data.show()
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, VectorIndexer
from pyspark.ml import Pipeline

+---------------------+---------------------+---------------+---------------+---------------+---------------+------+
|white king file (col)|white king rank (row)|white rook file|white rook rank|black king file|blank king rank|target|
+---------------------+---------------------+---------------+---------------+---------------+---------------+------+
|                    a|                    1|              b|              3|              c|              2|  draw|
|                    a|                    1|              c|              1|              c|              2|  draw|
|                    a|                    1|              c|              1|              d|              1|  draw|
|                    a|                    1|              c|              1|              d|              2|  draw|
|                    a|                    1|              c|              2|              c|              1|  draw|
|                    a|                    1|              c|   

In [101]:
type(data)

pyspark.sql.dataframe.DataFrame

In [141]:
indexer = StringIndexer(inputCol="white king file (col)", outputCol="white king file (col) numeric").fit(data)
encoder = OneHotEncoder(inputCol="white king file (col) numeric", outputCol="white king file (col) vector")
assembler = VectorAssembler(inputCols=["white king file (col) vector", "white king rank (row)", "white rook rank"], outputCol="features")
pipeline = Pipeline(stages=[indexer, encoder])
model = pipeline.fit(data)
transformed = model.transform(data)
transformed.show()

+---------------------+---------------------+---------------+---------------+---------------+---------------+------+-----------------------------+----------------------------+
|white king file (col)|white king rank (row)|white rook file|white rook rank|black king file|blank king rank|target|white king file (col) numeric|white king file (col) vector|
+---------------------+---------------------+---------------+---------------+---------------+---------------+------+-----------------------------+----------------------------+
|                    a|                    1|              b|              3|              c|              2|  draw|                          3.0|                   (3,[],[])|
|                    a|                    1|              c|              1|              c|              2|  draw|                          3.0|                   (3,[],[])|
|                    a|                    1|              c|              1|              d|              1|  draw|    

In [107]:
from pyspark.sql.functions import lit
lit(10)

Column<b'10'>

In [108]:
for att in attributeNames:
    if att['values'] == 'continuous' or att['values'] == 'ignore':
        continue
    
    attributeName = att['name']

#     data[attributeName].cat.set_categories(att['values'], inplace=True)
    new_df = data.withColumn(attributeName, lit(att['values']))

Py4JJavaError: An error occurred while calling z:org.apache.spark.sql.functions.lit.
: java.lang.RuntimeException: Unsupported literal type class java.util.ArrayList [a, b, c, d, ?]
	at org.apache.spark.sql.catalyst.expressions.Literal$.apply(literals.scala:77)
	at org.apache.spark.sql.catalyst.expressions.Literal$$anonfun$create$2.apply(literals.scala:163)
	at org.apache.spark.sql.catalyst.expressions.Literal$$anonfun$create$2.apply(literals.scala:163)
	at scala.util.Try.getOrElse(Try.scala:79)
	at org.apache.spark.sql.catalyst.expressions.Literal$.create(literals.scala:162)
	at org.apache.spark.sql.functions$.typedLit(functions.scala:112)
	at org.apache.spark.sql.functions$.lit(functions.scala:95)
	at org.apache.spark.sql.functions.lit(functions.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)


In [76]:
# %load ../src/flask-genesis/genesis/dataset.py
#!/usr/bin/env python

import os
import pandas as pd
import numpy as np
from names import namesFileParser

class dataset(object):
    """Representation of a dataset

    Each row contains a feature vector.  Continuous and discrete features are
    handled within the vector.  The class labels may also be discrete or 
    continuous, representative of classification or regression, respectively.

    The dataset is read in by processing a CSV file and a .names file
    describing the CSV file contains.  Alternatively, manual specification of
    the .names file contents through arguments to __init__ may be provided.

    Feature vectors containing missing data is also handled.
    """

    def __init__(self, dataFilename, namesFilename=None, attributeNames=None, classNames=None):

        self.hasClassLabels = False  # Not all datasets must have class labels (test set)
        self.attributeNames = attributeNames  # List of attribute names
        self.classNames = classNames  # List of class names
        self.classes = None  # Dataframe of class labels
        self.data = None  # Dataframe of dataset

        # Determine if this is a regression problem
        self.regression = namesFileParser.isRegression(self.classNames)

        self.readData(dataFilename, namesFilename)

    def readData(self, dataFilename, namesFilename):
        """Loads a dataset into memory.

        Loads a dataset consisting of a CSV and names file.  The names file
        describes the CSV, and its specification is found in another module.

        A dataset is suitable for either classification or regression tasks.
        """

        # If the attribute and class names are not present, then they must be
        # read from the names file
        if self.attributeNames is None or self.classNames is None:
            names = namesFileParser(namesFilename)
            self.attributeNames = names.attributes
            self.classNames = names.classes
            self.regression = names.regression

        # load data
        data = self.readCSV(dataFilename)

        # Set the category values for discrete attributes according to the names
#         for att in self.attributeNames:

#             if att['values'] == 'continuous' or att['values'] == 'ignore':
#                 continue

#             attributeName = att['name']

#             data[attributeName].cat.set_categories(att['values'], inplace=True)

#         # Categorical data has had its missing values filled in with NaNs.
#         # These should be put back to question marks
#         for col in data.select_dtypes(include=['category']).columns:
#             data[col].fillna(value='?', inplace=True)

#         # Record the classes and attributes
#         if self.hasClassLabels:
#             self.classes = data['target']
#             data.drop('target', axis=1, inplace=True)

        # We are now done with the dataset
        self.data = data

    def readCSV(self, dataFilename):
        """Perform the actual loading of the CSV into dataframe.

        Loads data from disk into a pandas dataframe.  The dataframe must
        be supplied with several options in order to successfully read the
        CSV.  These options are described by other functions.

        There are two modes of operation depending on whether or not class
        labels are included with the data.  For training, there must be a
        target variable.  Testing however does not require this.  If data
        fails to be read with a class label, it will retry reading without
        the class label.

        Args:
            dataFilename (str): The CSV file to be read.

        Returns:
            The raw pandas dataframe
        """

        # try to load the data with the class labels
        try:
#             data = pd.read_csv(
#                 self.findFirstFile(dataFilename, ['gz', 'bz2', 'xz', None]),
#                 header=None,
#                 names=[x['name'] for x in self.attributeNames] + ['target'],
#                 dtype=self.getColumnTypes(),
#                 na_values='?',
#                 usecols=self.getUsableColumns()
#             )

            data = spark.read.option("header", None).option("names", [x['name'] for x in self.attributeNames] + ['target']).option("dtype", self.getColumnTypes()).option("na_values", "?").option("usecols", self.getUsableColumns()).csv(trainFilename)

            self.hasClassLabels = True

            return data

        except pd.io.common.CParserError:
            pass

        # try to load the data without the class labels
#         data = pd.read_csv(
#             self.findFirstFile(dataFilename, ['gz', 'bz2', 'xz', None]),
#             header=None,
#             names=[x['name'] for x in self.attributeNames],
#             dtype=self.getColumnTypes(readClass=False),
#             na_values='?',
#             usecols=self.getUsableColumns(readClass=False)
#         )

        data = spark.read.option("header", None).option("names", [x['name'] for x in self.attributeNames]).option("dtype", self.getColumnTypes()).option("na_values", "?").option("usecols", self.getUsableColumns()).csv(trainFilename)
        
        self.hasClassLabels = False
        return data

    def getUsableColumns(self, readClass=True):
        """Returns the column names to read into dataframe.

        If a column is marked ignore in the names file, it can be excluded
        from ever being read into the dataframe.  This function returns
        the names of all columns not marked ignore.

        As test datasets do not strictly require class labels, the decision
        to read the class variable is controlled by the readClass argument.

        Args:
            readClass (bool): True for reading class label, false otherwise.

        Returns:
            A list of attribute names
        """

        # Do not read columns marked ignore
        useCols = list()
        for attribute in self.attributeNames:
            if attribute['values'] != 'ignore':
                useCols.append(attribute['name'])

        # Class labels are not strictly necessary
        if readClass:
            useCols.append('target')

        return useCols

    def getColumnTypes(self, readClass=True):
        """Returns a mapping of attribute name to attribute type.

        Pandas can assign the data type of features upon reading them in.
        This function is used to calculate and assign the feature type based
        upon the information in the names file.

        As test datasets do not strictly require class labels, the decision
        to read the class variable is controlled by the readClass argument.

        Args:
            readClass (bool): True for reading class label, false otherwise.

        Returns:
            A dict indexed by attribute name and containing the pandas column
            type category.
        """
        # Read the attribute type and assign the column type based on
        # discrete or continuous type.  Do not assign anything for columns
        # marked ignore
        dtypeSpecifier = dict()
        for attribute in self.attributeNames:
            attributeName = attribute['name']
            if attribute['values'] == 'continuous':
                dtypeSpecifier[attributeName] = np.float64
            elif attribute['values'] != 'ignore':
                dtypeSpecifier[attributeName] = 'category'

        # Read the class label and assign the column type based on regression
        # or classification
        if readClass:
            if self.regression:
                dtypeSpecifier['target'] = np.float64
            else:
                dtypeSpecifier['target'] = str

        return dtypeSpecifier

    @staticmethod
    def findFirstFile(filestem, extensions):
        """Finds the first file of the name filestem.[extensions]

        Searches disk for a file beginning with the prefix `filestem`, followed
        by a period, and succeeded by one of the extensions in the argument
        extensions.  The extensions are searched in the order they are 
        presented

        Args:
            extensions (list(str)): A list of file extensions to search through

        Returns:
            The name of the file
        """
        for ext in extensions:
            if ext is not None:
                if os.path.exists(filestem + '.' + ext):
                    return filestem + '.' + ext
            else:
                if os.path.exists(filestem):
                    return filestem
        return filestem


In [28]:
pyspark_df = spark.read.option("header", None).csv(trainFilename)

In [31]:
names = namesFileParser(namesFilename)

In [33]:
attributeNames = names.attributes
classNames = names.classes
regression = names.regression

In [87]:
temp_names = []
for att in attributeNames:
    temp_names.append(att['name'])
temp_names.append('target')

In [109]:
attributeNames

[{'name': 'white king file (col)', 'values': ['a', 'b', 'c', 'd', '?']},
 {'name': 'white king rank (row)', 'values': ['1', '2', '3', '4', '?']},
 {'name': 'white rook file',
  'values': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', '?']},
 {'name': 'white rook rank',
  'values': ['1', '2', '3', '4', '5', '6', '7', '8', '?']},
 {'name': 'black king file',
  'values': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', '?']},
 {'name': 'blank king rank',
  'values': ['1', '2', '3', '4', '5', '6', '7', '8', '?']}]

In [88]:
newColumns = temp_names
oldColumns = trainingSet.data.columns

In [82]:
newColumns = newColumns.append('target')

In [89]:
newColumns

['white king file (col)',
 'white king rank (row)',
 'white rook file',
 'white rook rank',
 'black king file',
 'blank king rank',
 'target']

In [90]:
temp_dict = dict(zip(oldColumns, newColumns))

In [110]:
import pandas as pd

In [130]:
df = pd.DataFrame({'A' : np.random.randn(9), 'B' : pd.Series(list('aabbcd ab')).astype('category')})

In [131]:
df

Unnamed: 0,A,B
0,1.643247,a
1,-0.84061,a
2,-0.072254,b
3,0.341142,b
4,-1.031222,c
5,1.388554,d
6,0.86091,
7,2.222105,a
8,1.570079,b


In [134]:
df.B.cat.set_categories(['a', 'b', 'c', 'd', '?'])

0      a
1      a
2      b
3      b
4      c
5      d
6    NaN
7      a
8      b
dtype: category
Categories (5, object): [a, b, c, d, ?]