In [1]:
import pickle
import pandas as pd
import sklearn as sk
from sklearn import metrics
from sklearn.model_selection import train_test_split

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer
from pyspark.sql.types import DoubleType

# Задача
### Как применить sklearn-модель к 1 ТБ данных?

In [4]:
# No visible cell creates `spark`, so obtain the session explicitly.
# getOrCreate() returns the already running session when one exists (e.g. a
# kernel that pre-defines `spark`), otherwise it starts a new one — this keeps
# the cell working after "Restart Kernel -> Run All".
spark = SparkSession.builder.getOrCreate()

# Load the iris dataset from Parquet into a Spark DataFrame.
df = spark.read.parquet('data/iris.parquet')

                                                                                

In [5]:
# Preview the first five rows of the loaded Spark DataFrame.
df.show(5)

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows



In [6]:
# Encode the string label column `species` as a numeric index column `type`,
# then drop the original string column.
si = StringIndexer(inputCol='species', outputCol='type')
si_model = si.fit(df)
df = si_model.transform(df).drop('species')

In [7]:
# Preview the first five rows again: `species` is gone and the numeric
# `type` column is present.
df.show(5)

+------------+-----------+------------+-----------+----+
|sepal_length|sepal_width|petal_length|petal_width|type|
+------------+-----------+------------+-----------+----+
|         5.1|        3.5|         1.4|        0.2| 0.0|
|         4.9|        3.0|         1.4|        0.2| 0.0|
|         4.7|        3.2|         1.3|        0.2| 0.0|
|         4.6|        3.1|         1.5|        0.2| 0.0|
|         5.0|        3.6|         1.4|        0.2| 0.0|
+------------+-----------+------------+-----------+----+
only showing top 5 rows



In [8]:
# Collect the Spark DataFrame into a pandas DataFrame on the driver.
# NOTE: toPandas() pulls every row into driver memory, so it is only
# appropriate for a dataset this small.
pdf = df.toPandas()

In [9]:
# Show the first ten rows of the pandas copy.
pdf.head(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,type
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0
5,5.4,3.9,1.7,0.4,0.0
6,4.6,3.4,1.4,0.3,0.0
7,5.0,3.4,1.5,0.2,0.0
8,4.4,2.9,1.4,0.2,0.0
9,4.9,3.1,1.5,0.1,0.0


# Дата-сайентист (DS) создал sklearn-модель

In [10]:
# Split the pandas frame into train and test parts; the fixed seed makes the
# split reproducible.
train, test = train_test_split(pdf, random_state=42)

In [11]:
# Names of the input feature columns, in the order they are fed to the model.
features_col = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
]

In [12]:
# Separate the feature matrix and the label column for each split.
X_train, y_train = train[features_col], train['type']
X_test, y_test = test[features_col], test['type']

In [14]:
# Inspect the Python type of the training labels.
type(y_train)

pandas.core.series.Series

In [15]:
from sklearn.tree import DecisionTreeClassifier

In [16]:
# Hyperparameters of the decision tree; the fixed random_state keeps the
# fitted tree reproducible.
tree_params = {'max_depth': 3, 'random_state': 1}
model = DecisionTreeClassifier(**tree_params)

In [17]:
# Train the tree on the training split; the result of fit() is bound back to
# `model`.
model = model.fit(X_train, y_train)

In [18]:
# Predict labels for the held-out test split and report the accuracy.
prediction = model.predict(X_test)
# accuracy_score is documented as accuracy_score(y_true, y_pred): pass the true
# labels first. Accuracy is symmetric, so the printed value does not change.
print('The accuracy of the Decision Tree is {:.3f}'.format(metrics.accuracy_score(y_test, prediction)))

The accuracy of the Decision Tree is 1.000


In [19]:
# Inspect the Python type of the fitted model object.
type(model)

sklearn.tree._classes.DecisionTreeClassifier

In [20]:
# Serialize the fitted model to `model.pickle` in the current working
# directory so it can be loaded later by the prediction UDF.
with open('model.pickle', 'wb') as model_file:
    pickle.dump(model, model_file)

# Применяем sklearn-модель к большим данным

In [21]:
import pyspark.sql.functions as F

In [22]:
# Cache of unpickled models keyed by file path, so the model file is not
# re-read and re-unpickled for every batch the UDF receives.
_MODEL_CACHE = {}


def _load_model(path='model.pickle'):
    """Return the model unpickled from `path`, reading the file on first use only.

    NOTE: a relative path is resolved against the working directory of the
    process that runs the UDF; on a multi-node cluster the file has to be
    available there as well (presumably by shipping it to the executors —
    TODO confirm for the target deployment).
    """
    if path not in _MODEL_CACHE:
        # NOTE(security): pickle.load can execute arbitrary code contained in
        # the file; only load pickles that come from a trusted source.
        with open(path, 'rb') as f:
            _MODEL_CACHE[path] = pickle.load(f)
    return _MODEL_CACHE[path]


@F.pandas_udf(returnType=DoubleType())
def predict_pandas_udf(*cols):
    """Apply the pickled scikit-learn model to a batch of feature columns.

    Parameters
    ----------
    *cols : pandas.Series
        One Series per feature column. They must be passed in the same order
        as the feature columns the model was fitted on, because the features
        are matched by position.

    Returns
    -------
    pandas.Series
        The model's prediction for every row of the batch.
    """
    import pandas as pd
    # cols is a tuple of pandas.Series here; put them side by side as columns.
    X = pd.concat(cols, axis=1)
    load_model = _load_model()
    # The incoming Series do not necessarily carry the column names used at
    # fit time. Estimators fitted on a DataFrame expose `feature_names_in_` and
    # newer scikit-learn versions warn or fail on a name mismatch, so restore
    # the fitted names positionally when they are available.
    feature_names = getattr(load_model, 'feature_names_in_', None)
    if feature_names is not None and len(feature_names) == X.shape[1]:
        X.columns = [str(name) for name in feature_names]
    return pd.Series(load_model.predict(X))

In [23]:
# Build the prediction column by applying the pandas UDF to the feature
# columns, then attach it to the Spark DataFrame under the name 'result'.
result_col = predict_pandas_udf(*features_col)
df_result = df.withColumn('result', result_col)

In [24]:
# Print the first rows of the result with the new `result` column next to the
# true `type` label (show() without an argument uses its default row limit).
df_result.show()

[Stage 6:>                                                          (0 + 1) / 1]

+------------+-----------+------------+-----------+----+------+
|sepal_length|sepal_width|petal_length|petal_width|type|result|
+------------+-----------+------------+-----------+----+------+
|         5.1|        3.5|         1.4|        0.2| 0.0|   0.0|
|         4.9|        3.0|         1.4|        0.2| 0.0|   0.0|
|         4.7|        3.2|         1.3|        0.2| 0.0|   0.0|
|         4.6|        3.1|         1.5|        0.2| 0.0|   0.0|
|         5.0|        3.6|         1.4|        0.2| 0.0|   0.0|
|         5.4|        3.9|         1.7|        0.4| 0.0|   0.0|
|         4.6|        3.4|         1.4|        0.3| 0.0|   0.0|
|         5.0|        3.4|         1.5|        0.2| 0.0|   0.0|
|         4.4|        2.9|         1.4|        0.2| 0.0|   0.0|
|         4.9|        3.1|         1.5|        0.1| 0.0|   0.0|
|         5.4|        3.7|         1.5|        0.2| 0.0|   0.0|
|         4.8|        3.4|         1.6|        0.2| 0.0|   0.0|
|         4.8|        3.0|         1.4| 

                                                                                

In [26]:
# Inspect the Python type of the result: it is still a Spark DataFrame.
type(df_result)

pyspark.sql.dataframe.DataFrame

In [27]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluator that compares the true label column `type` with the UDF output
# column `result` and reports accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol="type",
    predictionCol="result",
    metricName="accuracy",
)

In [29]:
# Score the predictions on the Spark side.
# NOTE: df_result holds every row of df, including the rows the model was
# trained on, so this is not a held-out estimate like the test-split accuracy.
accuracy = evaluator.evaluate(df_result)
error = 1.0 - accuracy
print("DecisionTreeClassifier [Accuracy] = %g" % accuracy)
print("DecisionTreeClassifier [Error] = %g " % error)

DecisionTreeClassifier [Accuracy] = 0.966667
DecisionTreeClassifier [Error] = 0.0333333 
