# 預測網頁是暫時的或是長青的


### 暫時性網頁: 例如流行服飾資料、當日股市新聞，只有某一段時間讀者有興趣，過了這段時間後，就沒有興趣

### 長青性網頁：理財觀念、育兒知識等，不管過多久，讀者都有興趣

### 某些廣告投放公司的工作，就是負責把相關的資訊投遞給使用者，若無法區別暫時性或是長青性網頁，很有可能使用者得到的資訊是過時的，因此，可以透過機器學習的方式，來預測網站的特性。

In [None]:
import numpy as np
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree

In [None]:
# Human-readable description of each class label; PredictData looks up
# the model's prediction result in this table when printing.
DescDict = {
    0: "暫時性網頁(ephemeral)",
    1: "長青網頁(evergreen)"
}

## Note: we need some utility functions to handle RDDs

In [None]:
# Build the feature vector
def extract_features(field, categoriesMap, featureEnd):
    """Build the feature vector for one parsed row.

    The vector is a one-hot encoding of the category stored in column 3,
    followed by the numeric values of columns 4 up to (but not including)
    ``featureEnd``.

    Args:
        field: Sequence of column values for a single row.
        categoriesMap: Dict mapping a category value to its one-hot index.
        featureEnd: Exclusive end index of the numeric feature columns.

    Returns:
        1-D numpy array: one-hot category features + numeric features.
    """
    # One-hot encode the category column. A category that is absent from
    # categoriesMap (e.g. a value that only occurs in the data being
    # predicted) keeps an all-zero vector instead of raising KeyError.
    categoryFeatures = np.zeros(len(categoriesMap))
    categoryIdx = categoriesMap.get(field[3])
    if categoryIdx is not None:
        categoryFeatures[categoryIdx] = 1
    # Convert the numeric columns. The loop variable has its own name so it
    # no longer shadows (and, on Python 2, rebinds) the `field` parameter.
    numericalFeatures = [convert_float(value) for value in field[4:featureEnd]]
    # Return "category features" + "numeric features"
    return np.concatenate((categoryFeatures, numericalFeatures))

In [None]:
def convert_float(x):
    """Parse a raw column value as a number.

    The placeholder "?" yields 0; any other value is passed to float().
    """
    if x == "?":
        return 0
    return float(x)

In [None]:
def extract_label(field):
    """Return the last column of the row, converted to float, as the label."""
    return float(field[-1])

In [None]:
def PredictData(sc, model, categoriesMap):
    """Load the test file and print predictions for its first 10 rows.

    Reads ``data/test.tsv`` under the module-level ``Path``, drops the
    header line, strips double quotes, splits each line on tabs and builds
    a feature vector per row with ``extract_features``.

    Args:
        sc: Spark context; its ``textFile`` method is used to read the file.
        model: Trained model exposing ``predict(features)``.
        categoriesMap: Mapping from category value to one-hot index,
            forwarded to ``extract_features``.
    """
    print("開始匯入資料...")
    rawDataWithHeader = sc.textFile(Path+"data/test.tsv")
    header = rawDataWithHeader.first()
    # Every line equal to the header line is filtered out.
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    rData = rawData.map(lambda x: x.replace("\"", ""))
    lines = rData.map(lambda x: x.split("\t"))
    print("共計：" + str(lines.count()) + "筆")
    # Pair column 0 (printed below as the URL) with the row's features.
    # featureEnd is len(r): every column from index 4 onward is a feature.
    dataRDD = lines.map(
        lambda r: (r[0], extract_features(r, categoriesMap, len(r))))

    for data in dataRDD.take(10):
        predictResult = model.predict(data[1])
        # Single parenthesized argument: prints identically on Python 2 and
        # is valid on Python 3, matching the other print(...) calls here.
        print(" 網址：  " + str(data[0]) + "\n" +
              "             ==>預測:" + str(predictResult) +
              " 說明:" + DescDict[predictResult] + "\n")

# 準備資料

In [None]:
# Base location of the workshop files; data file names are appended to it.
# (A `global` statement is a no-op at module level, so none is needed.)
Path = "file:/home/spark/ntcu_workshop/"

In [None]:
# Read the training TSV and filter out every line equal to its header line.
print("開始匯入資料...")
rawDataWithHeader = sc.textFile(Path + "data/train.tsv")
header = rawDataWithHeader.first()
rawData = rawDataWithHeader.filter(lambda line: line != header)

In [None]:
# Strip double quotes from every line, then split each line on tabs.
rData = rawData.map(lambda text: text.replace("\"", ""))
lines = rData.map(lambda text: text.split("\t"))
print("共計：" + str(lines.count()) + "筆\n")

# Show the first parsed row as a sample of the data layout.
print("資料格式: " + str(lines.first()))

In [None]:
# Map each distinct alchemy_category value (column 3) to an integer index.
categoriesMap = (
    lines.map(lambda row: row[3])
    .distinct()
    .zipWithIndex()
    .collectAsMap()
)

In [None]:
# Display the category-to-index mapping.
categoriesMap

In [None]:
# Turn every row into a LabeledPoint: the label comes from the last column,
# the features from the columns before it (featureEnd = len(row) - 1).
labelpointRDD = lines.map(
    lambda row: LabeledPoint(
        extract_label(row),
        extract_features(row, categoriesMap, len(row) - 1)
    )
)

### map()將原本為文字欄位轉成一個特徵向量

business --> [0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]

In [None]:
# Show the first row from column 3 onward. The call form prints the same
# text on Python 2 and is also valid on Python 3.
print(lines.first()[3:])

In [None]:
# Inspect the first LabeledPoint.
labelpointRDD.take(1)

In [None]:
# Randomly split the labeled points with weights 8 : 1 : 1.
trainData, validationData, testData = labelpointRDD.randomSplit([8, 1, 1])
print(
    "將資料分trainData:" + str(trainData.count())
    + "   validationData:" + str(validationData.count())
    + "   testData:" + str(testData.count())
)


In [None]:
# Mark the three splits as persistent so later actions can reuse them.
trainData.persist()
validationData.persist()
testData.persist()

# 訓練模型

In [None]:
# Train a two-class decision tree on the training split. The empty
# categoricalFeaturesInfo declares no feature as categorical.
model = DecisionTree.trainClassifier(
    trainData,
    numClasses=2,
    categoricalFeaturesInfo={},
    impurity="entropy",
    maxDepth=5,
    maxBins=5
)

# 進行預測

In [None]:
# Print predictions for the first rows of the test file using the trained
# model and the category mapping.
print("==========預測資料===============")
PredictData(sc, model, categoriesMap)