In [3]:
from pyspark.sql import SparkSession

# Build the notebook's SparkSession (getOrCreate hands back an existing
# session when one is already available).
spark = (
    SparkSession.builder
    .appName('Python Spark SQL basic example')
    .config('spark.some.config.option', 'some-value')
    .getOrCreate()
)

# Keep a handle to the SparkContext that backs the session.
sc = spark.sparkContext

In [13]:
import pyspark.ml.feature as ft 
from pyspark.ml import Pipeline

## 피처 추출
### NLP 관련된 피처 추출
NGram 모델은 토큰화된 단어 리스트를 입력으로 받아서 연속된 n개 단어의 묶음(n-gram)을 출력한다. 아래 예제에서는 n=2, 즉 단어 쌍을 만든다.

In [4]:
# Four short English passages, one per row, stored in a single string
# column named 'input'. The embedded newlines and indentation are part of
# the text and are left untouched.
text_data = spark.createDataFrame(
    data=[
        ['''Machine learning can be applied to a wide variety 
        of data types, such as vectors, text, images, and 
        structured data. This API adopts the DataFrame from 
        Spark SQL in order to support a variety of data types.'''],
        ['''DataFrame supports many basic and structured types; 
        see the Spark SQL datatype reference for a list of 
        supported types. In addition to the types listed in 
        the Spark SQL guide, DataFrame can use ML Vector types.'''],
        ['''A DataFrame can be created either implicitly or 
        explicitly from a regular RDD. See the code examples 
        below and the Spark SQL programming guide for examples.'''],
        ['''Columns in a DataFrame are named. The code examples 
        below use names such as "text," "features," and "label."'''],
    ],
    schema=['input'],
)

텍스트를 분리하기 위해 RegexTokenizer 사용

In [5]:
# Tokenizer stage: reads the 'input' column and writes the token list to
# 'input_arr'. The pattern matches runs of whitespace as well as the
# characters , . and " (with RegexTokenizer's default settings the pattern
# is treated as the delimiter between tokens).
#
# The pattern is written as a raw string so that `\s` reaches the regex
# engine verbatim. In a normal string literal `\s` is an unrecognized escape
# sequence, which Python only tolerates with a warning; the regex value
# itself is unchanged.
tokenizer = ft.RegexTokenizer(
    inputCol='input',
    outputCol='input_arr',
    pattern=r'\s+|[,."]')

In [9]:
# Stop-word removal stage: consumes whatever column the tokenizer emits and
# writes the filtered tokens to 'input_stop'.
stopwords = ft.StopWordsRemover(
    outputCol='input_stop',
    inputCol=tokenizer.getOutputCol(),
)

In [14]:
# N-gram stage (n=2) fed by the stop-word remover's output column; results
# land in 'nGrams'.
ngram = ft.NGram(
    n=2,
    inputCol=stopwords.getOutputCol(),
    outputCol='nGrams',
)

# Chain the stages in order: tokenize -> drop stop words -> build n-grams.
pipeline = Pipeline(
    stages=[
        tokenizer,
        stopwords,
        ngram,
    ]
)

In [15]:
# Fit the pipeline on the text frame, then apply the fitted model to the
# same frame.
data_ngram = (
    pipeline
    .fit(text_data)
    .transform(text_data)
)

# Show the n-grams produced for the first row.
data_ngram.select('nGrams').take(1)

[Row(nGrams=['machine learning', 'learning applied', 'applied wide', 'wide variety', 'variety data', 'data types', 'types vectors', 'vectors text', 'text images', 'images structured', 'structured data', 'data api', 'api adopts', 'adopts dataframe', 'dataframe spark', 'spark sql', 'sql order', 'order support', 'support variety', 'variety data', 'data types'])]

### 연속 변수 이산화하기

In [18]:
import numpy as np
import pyspark.sql.types as typ

# 100 evenly spaced points: 0, 4*pi/100, 2*(4*pi/100), ... (the endpoint
# 4*pi itself is not included).
x = np.arange(0, 100) / 100.0 * np.pi * 4

# Synthetic continuous signal evaluated at those points.
y = x * np.sin(x / 1.764) + 20.1234

# Single non-nullable double column.
schema = typ.StructType([
    typ.StructField(
        name='continuous_var',
        dataType=typ.DoubleType(),
        nullable=False,
    ),
])

# One row per sample; convert numpy floats to plain Python floats first.
data = spark.createDataFrame(
    [[float(value)] for value in y],
    schema=schema,
)


In [19]:
# Split the continuous variable into five buckets; the bucket index is
# written to the 'discretized' column.
discretizer = ft.QuantileDiscretizer(
    inputCol='continuous_var',
    outputCol='discretized',
    numBuckets=5,
)

In [20]:
# Fit the discretizer on the data, then apply the fitted model to add the
# bucket column.
data_discretized = (
    discretizer
    .fit(data)
    .transform(data)
)

### 연속 변수 표준화하기

In [22]:
# Pack the scalar column into a vector column ('continuous_vec'); the
# scaler configured in the following cell reads its input from this column.
vectorizer = ft.VectorAssembler(
    outputCol='continuous_vec',
    inputCols=['continuous_var'],
)

In [23]:
# Standardizing stage: centering (withMean) and scaling (withStd) are both
# enabled; output goes to 'normalized'.
normalizer = ft.StandardScaler(
    withMean=True,
    withStd=True,
    inputCol=vectorizer.getOutputCol(),
    outputCol='normalized',
)

# NOTE: this rebinds `pipeline`, replacing the text-processing pipeline
# defined in an earlier cell.
pipeline = Pipeline(
    stages=[
        vectorizer,
        normalizer,
    ]
)

# Fit on the data and apply the fitted model to the same frame.
data_standardized = (
    pipeline
    .fit(data)
    .transform(data)
)