# 預測網頁是暫時的或是長青的

## 資料準備

In [None]:
# Base URI of the workshop directory; data file locations below are built by
# appending a relative path to this prefix. A `global` declaration is not
# needed for a top-level assignment, so a plain assignment is used.
Path = "file:/home/spark/ntcu_workshop/"

In [None]:
# Load the tab-separated training file, treating its first line as the
# column header.
row_df = (
    sqlContext.read.format("csv")
    .option("header", "true")
    .option("delimiter", "\t")
    .load(Path + "data/train.tsv")
)
# Report how many rows were loaded. print() with a single argument behaves
# the same on Python 2 and Python 3.
print(row_df.count())

In [None]:
# Preview the first 10 raw rows for a handful of columns.
row_df.select('url','alchemy_category','alchemy_category_score','is_news','label').show(10)

In [None]:
from pyspark.sql.functions import udf
def replace_question(x):
    """Return "0" when *x* equals the "?" placeholder, otherwise *x* itself."""
    if x == "?":
        return "0"
    return x
# Wrap the plain Python function as a Spark SQL UDF. The name is rebound, so
# from here on replace_question is the UDF and is applied to Column objects.
# No return type is passed to udf(), so its default return type applies.
replace_question= udf(replace_question)

In [None]:
from pyspark.sql.functions import col  
import pyspark.sql.types 

In [None]:
# Build the cleaned DataFrame: keep 'url' and 'alchemy_category' unchanged,
# and for every column from index 4 onward replace the "?" placeholder with
# "0", cast the result to double, and keep the original column name.
df = row_df.select(
    'url',
    'alchemy_category',
    *[replace_question(col(column)).cast("double").alias(column)
      for column in row_df.columns[4:]]
)

In [None]:
# Preview the same columns as before, now taken from the cleaned DataFrame.
df.select('url','alchemy_category','alchemy_category_score','is_news','label').show(10)

In [None]:
# Randomly split the cleaned data with weights 0.7 / 0.3 into a training set
# and a test set. No seed argument is passed to randomSplit here.
train_df, test_df = df.randomSplit([0.7, 0.3])
# Mark both splits for caching; train_df is used to fit the pipeline and
# test_df is transformed more than once further down.
train_df.cache()
test_df.cache()

# pipeline前處理元件

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import  StringIndexer, OneHotEncoder,VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier

## 產生StringIndexer

In [None]:
# Indexer stage: reads the 'alchemy_category' string column and writes a
# numeric index for each value to 'alchemy_category_Index'.
categoryIndexer = StringIndexer(
    outputCol="alchemy_category_Index",
    inputCol='alchemy_category',
)

### 驗證StringIndexer的功能

In [None]:
# Fit the indexer on the full cleaned DataFrame so its label mapping can be
# inspected. This fitted model is only used for the demonstration cells; the
# pipeline fits its own indexer when it is trained on train_df.
categoryTransformer=categoryIndexer.fit(df)

In [None]:
# List every label learned by the fitted indexer next to its position in the
# labels list, one "index:label" pair per line. enumerate() replaces manual
# indexing, and print() with one argument works on Python 2 and 3.
for i, label in enumerate(categoryTransformer.labels):
    print(str(i) + ':' + label)

In [None]:
# Apply the fitted indexer to add the 'alchemy_category_Index' column.
df1=categoryTransformer.transform(df)

In [None]:
# Show the original category next to its index for the first 10 rows.
df1.select("alchemy_category","alchemy_category_Index").show(10)

## 產生 OneHotEncoder

In [None]:
# One-hot stage: reads the numeric category index and writes a vector column
# 'alchemy_category_IndexVec'. dropLast=False is passed explicitly so the
# last category is not dropped from the encoded vector.
encoder = OneHotEncoder(
    inputCol='alchemy_category_Index',
    outputCol="alchemy_category_IndexVec",
    dropLast=False,
)

### 驗證OneHotEncoder的功能

In [None]:
# Apply the encoder to the indexed DataFrame to add the one-hot vector column.
# NOTE(review): transform() is called on the encoder without a fit step; this
# presumes a Spark version where OneHotEncoder is a Transformer - confirm.
df2=encoder.transform(df1)

In [None]:
# Show the category, its index and its one-hot vector for the first 10 rows.
df2.select("alchemy_category","alchemy_category_Index",
                    "alchemy_category_IndexVec").show(10)

## VectorAssembler


In [None]:
# Input columns for the assembler: the one-hot category vector followed by
# the raw columns from index 4 up to, but not including, the last column
# (presumably the last column is the label - confirm against the TSV header).
assemblerInputs =['alchemy_category_IndexVec']  +  row_df.columns[4:-1]

In [None]:
# Assembler stage: combines every column named in assemblerInputs into one
# vector column called "features".
assembler = VectorAssembler(
    outputCol="features",
    inputCols=assemblerInputs,
)

In [None]:
# Apply the assembler to the one-hot-encoded DataFrame to add 'features'.
df3=assembler.transform(df2)

### 驗證VectorAssembler的功能

In [None]:
# Display the assembled feature column for the first 5 rows.
df3.select('features').show(5)

In [None]:
# Fetch the first row of the feature column to inspect one feature vector.
df3.select('features').take(1)

## Decision Tree

In [None]:
# Classifier stage: a decision tree that learns "label" from "features",
# using Gini impurity with maxDepth=10 and maxBins=14.
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
    impurity="gini",
    maxDepth=10,
    maxBins=14,
)

# 建立資料處理的pipeline

In [None]:
# Chain the stages in execution order: index the category, one-hot encode
# it, assemble the feature vector, then train the decision tree.
pipeline = Pipeline(stages=[
    categoryIndexer,
    encoder,
    assembler,
    dt,
])

# 使用pipeline進行訓練

In [None]:
# Fit all pipeline stages on the training split only.
pipelineModel = pipeline.fit(train_df)

In [None]:
# Show the first 1000 characters of the fitted tree's textual description.
# Index 3 is the decision tree, the fourth and last stage of the pipeline.
# print() with one argument works on both Python 2 and Python 3.
print(pipelineModel.stages[3].toDebugString[:1000])

# 使用pipeline 進行預測

In [None]:
# Run the fitted pipeline over the held-out test split.
predicted=pipelineModel.transform(test_df)

In [None]:
# List the column names of the prediction DataFrame.
predicted.columns

In [None]:
# Compare the true label with the predicted label for the first 10 rows.
predicted.select('url','label','prediction').show(10)

# 評估模型的準確率（以 AUC 衡量）

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
# Evaluator that computes the area under the ROC curve, reading the
# "rawPrediction" column and comparing it against "label".
evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    labelCol="label",
    rawPredictionCol="rawPrediction",
)

In [None]:
# Transform the test split with the fitted pipeline (the same call that
# produced `predicted` earlier) and compute the evaluator's metric on it.
predictions =pipelineModel.transform(test_df)
auc= evaluator.evaluate(predictions)
# Bare expression so the notebook displays the resulting value.
auc