In [1]:
# Echo the `spark` object so the cell output shows whether a session is available.
# NOTE(review): nothing in this notebook defines `spark` before this cell --
# presumably the kernel (e.g. a PySpark kernel/startup script) injects it; confirm,
# otherwise this cell raises NameError on a fresh kernel.
spark

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline
import numpy as np
from dataset import dataset
from dummyEncoder import dummyEncoder

if __name__ == "__main__":
    # Train and evaluate a random-forest classifier on the dummy-encoded KRK
    # data set: 70/30 train/test split, 5-fold cross-validation on the training
    # part, then print the error rate measured on the held-out part.
    #
    # The former `len(sys.argv) != 2` guard was dropped on purpose: it came from
    # the word-count template, `sys` was never imported (NameError on a fresh
    # kernel), and no command-line argument is read anywhere below -- the input
    # paths are fixed.
    from pyspark.ml.linalg import Vectors

    # One seed for the split, the forest and the CV folds so that
    # Restart & Run All reproduces the same numbers.
    seed = 42

    # App name is kept as-is (inherited from the word-count template).
    spark = (
        SparkSession.builder
        .appName("PythonWordCount")
        .getOrCreate()
    )

    try:
        # ---- Load -----------------------------------------------------------
        sampleData = 'data/krk/krk'
        trainFilename = sampleData + '.data'
        namesFilename = sampleData + '.names'

        trainingSet = dataset(trainFilename, namesFilename)

        # ---- Encode and move into Spark ---------------------------------------
        de = dummyEncoder()
        encoded = de.fit_transform(trainingSet.data)
        # Materialise the rows as a list instead of handing Spark lazy
        # map/zip iterators.
        featureVectors = [Vectors.dense(row) for row in encoded]
        data = spark.createDataFrame(
            list(zip(trainingSet.classes, featureVectors)),
            schema=["label", "features"])

        (trainingData, testData) = data.randomSplit([0.7, 0.3], seed=seed)

        # ---- Pipeline stages ------------------------------------------------
        # Both indexers are fitted on the full data set (before the split), so
        # every label value is known to the label indexer.
        # NOTE(review): this stage writes "indexedFeatures", but the forest
        # below reads "features", so the indexed column is never consumed.
        # Kept to preserve current results; either point featuresCol at it or
        # drop the stage.
        featureIndexer = VectorIndexer(
            inputCol="features", outputCol="indexedFeatures").fit(data)

        labelIndexer = StringIndexer(
            inputCol="label", outputCol="indexedLabel").fit(data)

        rf = RandomForestClassifier(
            labelCol="indexedLabel",
            featuresCol="features",
            numTrees=30,
            featureSubsetStrategy='onethird',
            maxDepth=30,
            impurity='gini',
            seed=seed)

        # Map numeric predictions back to the original label strings.
        labelConverter = IndexToString(
            inputCol="prediction", outputCol="predictedLabel",
            labels=labelIndexer.labels)

        pipeline = Pipeline(
            stages=[featureIndexer, labelIndexer, rf, labelConverter])

        # Accuracy of the predicted index against the true label index.
        evaluator = MulticlassClassificationEvaluator(
            labelCol="indexedLabel", predictionCol="prediction",
            metricName="accuracy")

        # Empty grid: no hyper-parameters are varied; cross-validation only
        # scores the single configuration above.
        paramGrid = ParamGridBuilder().build()

        crossval = CrossValidator(
            estimator=pipeline,
            estimatorParamMaps=paramGrid,
            evaluator=evaluator,
            numFolds=5,
            seed=seed)

        # ---- Train and evaluate -----------------------------------------------
        model = crossval.fit(trainingData)

        predictions = model.transform(testData)
        predictions.show()

        accuracy = evaluator.evaluate(predictions)
        print("Test Error = %g" % (1.0 - accuracy))
    finally:
        # Always release the session, including when a step above fails.
        spark.stop()

In [None]:
# !ls -al

# import numpy as np
# from dataset import dataset
# from options import getOptions
# from learningModel import learningModel
# from dummyEncoder import dummyEncoder

# trainingSet = dataset(trainFilename, namesFilename)

# trainingSet.data.head()

# import pickle
# import gzip
# import sys
# import numpy as np
# from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
# from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
# from sklearn.ensemble import BaggingClassifier, BaggingRegressor
# from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
# from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
# from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
# from sklearn.linear_model import LogisticRegression, LinearRegression
# from sklearn.calibration import calibration_curve
# import matplotlib.pyplot as plt
# from sklearn.svm import SVC, SVR
# from sklearn.svm import NuSVC, NuSVR
# from sklearn.svm import LinearSVC, LinearSVR
# from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
# from sklearn.model_selection import StratifiedKFold, LeaveOneOut, KFold
# from sklearn.metrics import confusion_matrix, accuracy_score, explained_variance_score
# from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# from sklearn.metrics import recall_score, precision_score, f1_score, log_loss
# from sklearn.preprocessing import Imputer
# # from xgboost import XGBClassifier
# from sklearn.pipeline import Pipeline
# from dummyEncoder import dummyEncoder

# classifier = GradientBoostingClassifier(n_estimators=1, \
# #                                    nthread=-1, \
#                                    random_state=None, \
#                                    max_depth=None)

# model = Pipeline([('data', dummyEncoder()), \
#                   ('imputer', Imputer(missing_values='NaN', strategy='median', axis=0, verbose=100)), \
#                   ('classifier', classifier)])

# kFold = KFold(n_splits=2, shuffle=True, random_state=None)
# groundTruth = trainingSet.classes
# datasetPredictions = [None] * len(groundTruth)
# for trainIndices, testIndices in kFold.split(trainingSet.data, trainingSet.classes):
#     trainX = trainingSet.data.ix[trainIndices, :]
#     trainY = trainingSet.classes[trainIndices]
#     testX = trainingSet.data.ix[testIndices, :]
#     testY = trainingSet.classes[testIndices]

#     # Learn the model
#     model.fit(trainX, trainY)
    
#     foldPredictions = model.predict(testX)
    
#     for i, v in enumerate(testIndices):
#         datasetPredictions[v] = foldPredictions[i]
    

# import pandas as pd
# new_df = pd.concat((trainingSet.data, trainingSet.classes), axis=1)
# new_df.head()

# type(trainingSet.classes)

# from pyspark.ml.linalg import Vectors

# de = dummyEncoder()
# temp = de.fit_transform(trainingSet.data)
# temp = map(lambda arr: Vectors.dense(arr), temp)
# data = spark.createDataFrame(zip(trainingSet.classes, temp),schema=["label", "features"])

# # data.show()

# # mydf.show()
# from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
# from pyspark.ml.classification import RandomForestClassifier
# from pyspark.ml import Pipeline

# labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
# featureIndexer =\
#     VectorIndexer(inputCol="features", outputCol="indexedFeatures").fit(data)
# (trainingData, testData) = data.randomSplit([0.8, 0.2])
# rf = RandomForestClassifier(labelCol="indexedLabel", \
#                             featuresCol="features", \
#                             numTrees=30, \
#                             featureSubsetStrategy='onethird',
#                             maxDepth=12,
#                             impurity='entropy')
# labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
#                                labels=labelIndexer.labels)

# pipeline = Pipeline(stages=[featureIndexer, labelIndexer, rf, labelConverter])
# # pipeline = Pipeline(stages=[featureIndexer, rf])


# # Train model.  This also runs the indexers.
# model = pipeline.fit(trainingData)

# # Make predictions.
# predictions = model.transform(testData)

# # Select example rows to display.
# # predictions.select("predictedLabel", "label", "features").show(5)

# # Select (prediction, true label) and compute test error
# evaluator = MulticlassClassificationEvaluator(
#     labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
# accuracy = evaluator.evaluate(predictions)
# print("Test Error = %g" % (1.0 - accuracy))



# from pyspark.ml.classification import RandomForestClassifier

# rf = RandomForestClassifier(labelCol="indexed", featuresCol="features", numTrees=100)

# from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
# labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
#                                labels=labelIndexer.labels)
# model = rf.fit()

# [type(ii) for ii in temp]

# from pyspark.ml.linalg import Vectors

# # dff = map(lambda x: Vectors.dense(x[:]), temp)
# dff = spark.sparkContext.parallelize(temp)
# dff.take(5)
# dff = dff.map(lambda x: )
# # dff.first() == temp[0]
# mydf = spark.createDataFrame(dff,schema=["features"])

# from pyspark.mllib.tree import RandomForest, RandomForestModel
# from pyspark.mllib.util import MLUtils

# # Load and parse the data file into an RDD of LabeledPoint.
# data = MLUtils.loadLibSVMFile(sc, '../../jupyter-notebooks/datasets/sample_libsvm_data.txt')
# # Split the data into training and test sets (30% held out for testing)
# (trainingData, testData) = data.randomSplit([0.7, 0.3])


# testData.map(lambda x: x.label).take(5)

# f = urllib.request.urlretrieve ("http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data.gz", "kddcup.data.gz")
# data_file = "./kddcup.data.gz"
# raw_data = sc.textFile(data_file)
# print("Train data size is {}".format(raw_data.count()))
# ft = urllib.request.urlretrieve("http://kdd.ics.uci.edu/databases/kddcup99/corrected.gz", "corrected.gz")
# test_data_file = "./corrected.gz"
# test_raw_data = sc.textFile(test_data_file)
# print ("Test data size is {}".format(test_raw_data.count()))

# from pyspark.ml.feature import VectorIndexer

# data = spark.read.format("libsvm").load("../../jupyter-notebooks/datasets/sample_libsvm_data.txt")

# indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10)
# indexerModel = indexer.fit(data)

# categoricalFeatures = indexerModel.categoryMaps
# print("Chose %d categorical features: %s" %
#       (len(categoricalFeatures), ", ".join(str(k) for k in categoricalFeatures.keys())))

# # Create new column "indexed" with categorical values transformed to indices
# indexedData = indexerModel.transform(data)
# indexedData.show()

# gc.collect()