In [1]:
import numpy as np
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree

In [2]:
# Human-readable description for each class label; PredictData looks up the
# model's predicted value in this table when printing results.
DescDict = dict([
    (0, "暫時性網頁(ephemeral)"),
    (1, "長青網頁(evergreen)"),
])

In [3]:
def extract_features(field, categoriesMap, featureEnd):
    """Build the feature vector for a single record.

    Parameters:
        field: sequence of column values for one row. ``field[3]`` is the
            category name and ``field[4:featureEnd]`` are the numeric columns.
        categoriesMap: mapping from category name to its one-hot position.
        featureEnd: exclusive end index of the numeric columns in ``field``.

    Returns:
        A 1-D numpy array holding the one-hot category encoding followed by
        the numeric columns converted with ``convert_float``.

    Raises:
        KeyError: if ``field[3]`` is not a key of ``categoriesMap``.
    """
    # Categorical part: a vector of zeros with a single 1 at the category's index.
    oneHotPosition = categoriesMap[field[3]]
    oneHot = np.zeros(len(categoriesMap))
    oneHot[oneHotPosition] = 1
    # Numeric part: the loop variable has its own name so it does not shadow
    # the ``field`` parameter.
    numericValues = [convert_float(rawValue) for rawValue in field[4:featureEnd]]
    # Result layout: [categorical features..., numeric features...]
    return np.concatenate((oneHot, numericValues))

In [4]:
def convert_float(x):
    """Convert one raw text value to a float.

    The string "?" (presumably the data set's placeholder for a missing
    value -- confirm against the data description) maps to 0.0; anything else
    is parsed with ``float``.

    Always returns a float, so callers never receive a mix of int and float.

    Raises:
        ValueError: if ``x`` is neither "?" nor parseable by ``float``.
    """
    if x == "?":
        return 0.0
    return float(x)

In [5]:
def extract_label(field):
    """Return the last element of ``field`` converted to float.

    The training cells use this value as the record's class label.
    """
    return float(field[-1])

In [6]:
def PredictData(sc, model, categoriesMap):
    """Load ``data/test.tsv`` and print predictions for its first 10 records.

    Parameters:
        sc: object whose ``textFile`` method loads the file as an RDD of lines
            (presumably the SparkContext -- it is not created in this notebook).
        model: trained model; ``model.predict(features)`` is called per record.
        categoriesMap: mapping from category name to one-hot index, passed
            through to ``extract_features``.

    Reads the module-level ``Path`` (base location of the data) and
    ``DescDict`` (label -> description). Returns nothing; results are printed.
    """
    print("開始匯入資料...")
    rawDataWithHeader = sc.textFile(Path+"data/test.tsv")
    # The first line is the header row; drop every line equal to it.
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    # Strip the double quotes around values, then split each line on tabs.
    rData = rawData.map(lambda x: x.replace("\"", ""))
    lines = rData.map(lambda x: x.split("\t"))
    print("共計：" + str(lines.count()) + "筆")
    # Pair each record's first column (printed below as the URL) with its
    # feature vector. featureEnd is len(r) here, so every column from index 4
    # onward is a feature -- presumably because test.tsv has no trailing label
    # column (the training cell uses len(r) - 1 instead).
    dataRDD = lines.map(lambda r: (r[0], extract_features(r, categoriesMap, len(r))))

    for data in dataRDD.take(10):
        predictResult = model.predict(data[1])
        # Single-argument print(...) parses under both Python 2 and Python 3,
        # matching the other print calls in this function.
        print(" 網址：  " + str(data[0]) + "\n" +
              "             ==>預測:" + str(predictResult) +
              " 說明:" + DescDict[predictResult] + "\n")

# 準備資料

In [7]:
# Base location of the workshop files; data file names are appended to it.
# A ``global`` statement is not needed here: assignment at the top level of a
# cell already creates a module-level name.
# NOTE(review): this is a machine-specific absolute path -- adjust it for your environment.
Path = "file:/home/spark/ntcu_workshop/"

In [8]:
# Load the training file and turn it into an RDD of column lists (``lines``).
# ``sc`` is not created in this notebook -- presumably the SparkContext supplied
# by the PySpark kernel; confirm before running on a fresh kernel.
print("開始匯入資料...")
rawDataWithHeader = sc.textFile(Path+"data/train.tsv")
# The first line is the header row; drop every line equal to it.
header = rawDataWithHeader.first() 
rawData = rawDataWithHeader.filter(lambda x:x !=header)    
# Strip the double quotes around values, then split each line on tabs.
rData=rawData.map(lambda x: x.replace("\"", ""))    
lines = rData.map(lambda x: x.split("\t"))
# count() runs the pipeline once to report the number of records.
print("共計：" + str(lines.count()) + "筆")

開始匯入資料...
共計：7395筆


In [9]:
# Peek at the first record from column 3 onward: column 3 is the category that
# extract_features one-hot encodes, and the columns after it are the values it
# converts to floats.
lines.first()[3:]

[u'business',
 u'0.789131',
 u'2.055555556',
 u'0.676470588',
 u'0.205882353',
 u'0.047058824',
 u'0.023529412',
 u'0.443783175',
 u'0',
 u'0',
 u'0.09077381',
 u'0',
 u'0.245831182',
 u'0.003883495',
 u'1',
 u'1',
 u'24',
 u'0',
 u'5424',
 u'170',
 u'8',
 u'0.152941176',
 u'0.079129575',
 u'0']

In [10]:
# Build a {category name: index} dictionary from the distinct values found in
# column 3; extract_features uses the index as the one-hot position.
categoriesMap = (
    lines.map(lambda fields: fields[3])
    .distinct()
    .zipWithIndex()
    .collectAsMap()
)

In [11]:
# Display the category -> index mapping built in the previous cell.
categoriesMap

{u'?': 6,
 u'arts_entertainment': 13,
 u'business': 1,
 u'computer_internet': 2,
 u'culture_politics': 3,
 u'gaming': 7,
 u'health': 5,
 u'law_crime': 4,
 u'recreation': 0,
 u'religion': 11,
 u'science_technology': 9,
 u'sports': 10,
 u'unknown': 8,
 u'weather': 12}

In [14]:
# Extract the label (last column, as a float) of every record and show the first one.
# extract_label is passed directly; wrapping it in a lambda added nothing.
labelRDD = lines.map(extract_label)
# Single-argument print(...) parses under both Python 2 and Python 3.
print(labelRDD.take(1))

[0.0]


In [15]:
# Turn every record into a LabeledPoint(label, features).
# featureEnd is len(r) - 1 so the last column, which extract_label reads as the
# label, is excluded from the feature vector.
labelpointRDD = lines.map(
    lambda r: LabeledPoint(
        extract_label(r),
        extract_features(r, categoriesMap, len(r) - 1)
    )
)

In [16]:
# Print the raw first record (from the category column onward) for comparison
# with the LabeledPoint shown in the next cell.
# Single-argument print(...) parses under both Python 2 and Python 3.
print(lines.first()[3:])

[u'business', u'0.789131', u'2.055555556', u'0.676470588', u'0.205882353', u'0.047058824', u'0.023529412', u'0.443783175', u'0', u'0', u'0.09077381', u'0', u'0.245831182', u'0.003883495', u'1', u'1', u'24', u'0', u'5424', u'170', u'8', u'0.152941176', u'0.079129575', u'0']


In [17]:
# Show the first LabeledPoint: the label, then the one-hot category vector
# followed by the numeric features.
labelpointRDD.take(1)

[LabeledPoint(0.0, [0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.789131,2.055555556,0.676470588,0.205882353,0.047058824,0.023529412,0.443783175,0.0,0.0,0.09077381,0.0,0.245831182,0.003883495,1.0,1.0,24.0,0.0,5424.0,170.0,8.0,0.152941176,0.079129575])]

In [18]:
# Split the labelled data with weights 8:1:1 into training, validation and test sets.
# A fixed seed makes the split (and everything trained on it) reproducible
# across kernel restarts; without one every run produces a different split.
SPLIT_SEED = 42
(trainData, validationData, testData) = labelpointRDD.randomSplit(
    [8, 1, 1], seed=SPLIT_SEED)
print("將資料分trainData:" + str(trainData.count()) +
      "   validationData:" + str(validationData.count()) +
      "   testData:" + str(testData.count()))


將資料分trainData:5880   validationData:759   testData:756


In [19]:
# Mark the three splits for persistence so that later actions can reuse them
# instead of recomputing them from the source file.
# The last call is a bare expression, so the notebook displays the RDD it returns.
trainData.persist()
validationData.persist()
testData.persist()

PythonRDD[19] at RDD at PythonRDD.scala:48

# 訓練模型

In [20]:
# Train a two-class decision tree on the training split.
# categoricalFeaturesInfo is empty: the category column was already one-hot
# encoded by extract_features, so no feature is declared as categorical here.
model = DecisionTree.trainClassifier(
    trainData,
    numClasses=2,
    categoricalFeaturesInfo={},
    impurity="entropy",
    maxDepth=5,
    maxBins=5
)

# 進行預測

In [21]:
# Run the trained model on data/test.tsv and print the first 10 predictions.
# Requires ``sc``, ``model``, ``categoriesMap`` and ``Path`` from the cells above.
print("==========預測資料===============")
PredictData(sc, model, categoriesMap)

開始匯入資料...
共計：3171筆
 網址：  http://www.lynnskitchenadventures.com/2009/04/homemade-enchilada-sauce.html
             ==>預測:1.0 說明:長青網頁(evergreen)

 網址：  http://lolpics.se/18552-stun-grenade-ar
             ==>預測:0.0 說明:暫時性網頁(ephemeral)

 網址：  http://www.xcelerationfitness.com/treadmills.html
             ==>預測:0.0 說明:暫時性網頁(ephemeral)

 網址：  http://www.bloomberg.com/news/2012-02-06/syria-s-assad-deploys-tactics-of-father-to-crush-revolt-threatening-reign.html
             ==>預測:0.0 說明:暫時性網頁(ephemeral)

 網址：  http://www.wired.com/gadgetlab/2011/12/stem-turns-lemons-and-limes-into-juicy-atomizers/
             ==>預測:0.0 說明:暫時性網頁(ephemeral)

 網址：  http://www.latimes.com/health/boostershots/la-heb-fat-tax-denmark-20111013,0,2603132.story
             ==>預測:0.0 說明:暫時性網頁(ephemeral)

 網址：  http://www.howlifeworks.com/a/a?AG_ID=1186&cid=7340ci
             ==>預測:1.0 說明:長青網頁(evergreen)

 網址：  http://romancingthestoveblog.wordpress.com/2010/01/13/sweet-potato-ravioli-with-lemon-sage-brown-butter-sau