In [37]:
import glob
from os import path

In [38]:
ham_files = sc.parallelize(glob.glob("../../enron1/ham/*.txt"),10)
spam_files = sc.parallelize(glob.glob("../../enron1/spam/*.txt"),10)

In [39]:
def file_to_str(filename):
    with open(filename,'r') as f:
        return f.read()

ham = ham_files.map(lambda f: (path.basename(f), file_to_str(f)))
spam = spam_files.map(lambda f: (path.basename(f), file_to_str(f)))

In [40]:
from pyspark.sql import *
df_ham = ham.map(lambda (f,t):Row(label=0.0, filename=f, text=t.split())).toDF().cache()
df_spam = spam.map(lambda (f,t):Row(label=1.0, filename=f, text=t.split())).toDF().cache()
df_data = df_ham.union(df_spam)
df_data.show(5)

+--------------------+-----+--------------------+
|            filename|label|                text|
+--------------------+-----+--------------------+
|0001.1999-12-10.f...|  0.0|[Subject:, christ...|
|0002.1999-12-13.f...|  0.0|[Subject:, vastar...|
|0003.1999-12-14.f...|  0.0|[Subject:, calpin...|
|0004.1999-12-14.f...|  0.0|[Subject:, re, :,...|
|0005.1999-12-14.f...|  0.0|[Subject:, meter,...|
+--------------------+-----+--------------------+
only showing top 5 rows



In [41]:
from pyspark.ml.feature import HashingTF, IDF

hashingTF = HashingTF(numFeatures=5, inputCol="text", outputCol="tf")
df_data_tf = hashingTF.transform(df_data)
df_data_tf.show(5)

+--------------------+-----+--------------------+--------------------+
|            filename|label|                text|                  tf|
+--------------------+-----+--------------------+--------------------+
|0001.1999-12-10.f...|  0.0|[Subject:, christ...|(5,[1,2,3,4],[2.0...|
|0002.1999-12-13.f...|  0.0|[Subject:, vastar...|(5,[0,1,2,3,4],[1...|
|0003.1999-12-14.f...|  0.0|[Subject:, calpin...|(5,[0,1,2,4],[2.0...|
|0004.1999-12-14.f...|  0.0|[Subject:, re, :,...|(5,[0,1,2,3,4],[4...|
|0005.1999-12-14.f...|  0.0|[Subject:, meter,...|(5,[0,1,2,3,4],[4...|
+--------------------+-----+--------------------+--------------------+
only showing top 5 rows



In [42]:
idf = IDF(minDocFreq=3, inputCol="tf", outputCol="idf")
model = idf.fit(df_data_tf)
df_data_tf_idf = model.transform(df_data_tf)
df_data_tf_idf.show(5)

+--------------------+-----+--------------------+--------------------+--------------------+
|            filename|label|                text|                  tf|                 idf|
+--------------------+-----+--------------------+--------------------+--------------------+
|0001.1999-12-10.f...|  0.0|[Subject:, christ...|(5,[1,2,3,4],[2.0...|(5,[1,2,3,4],[0.0...|
|0002.1999-12-13.f...|  0.0|[Subject:, vastar...|(5,[0,1,2,3,4],[1...|(5,[0,1,2,3,4],[1...|
|0003.1999-12-14.f...|  0.0|[Subject:, calpin...|(5,[0,1,2,4],[2.0...|(5,[0,1,2,4],[0.0...|
|0004.1999-12-14.f...|  0.0|[Subject:, re, :,...|(5,[0,1,2,3,4],[4...|(5,[0,1,2,3,4],[0...|
|0005.1999-12-14.f...|  0.0|[Subject:, meter,...|(5,[0,1,2,3,4],[4...|(5,[0,1,2,3,4],[0...|
+--------------------+-----+--------------------+--------------------+--------------------+
only showing top 5 rows



In [43]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

(trainData, testData) = df_data_tf_idf.randomSplit([0.5, 0.5])

In [44]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(featuresCol = 'idf')
model = rf.fit(testData)

In [45]:
test_predictions = model.transform(testData)
test_predictions.show()

+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|            filename|label|                text|                  tf|                 idf|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|0002.1999-12-13.f...|  0.0|[Subject:, vastar...|(5,[0,1,2,3,4],[1...|(5,[0,1,2,3,4],[1...|[12.5720286448161...|[0.62860143224080...|       0.0|
|0003.1999-12-14.f...|  0.0|[Subject:, calpin...|(5,[0,1,2,4],[2.0...|(5,[0,1,2,4],[0.0...|[15.3284586218421...|[0.76642293109210...|       0.0|
|0005.1999-12-14.f...|  0.0|[Subject:, meter,...|(5,[0,1,2,3,4],[4...|(5,[0,1,2,3,4],[0...|[15.8818467289343...|[0.79409233644671...|       0.0|
|0007.1999-12-14.f...|  0.0|[Subject:, mcmull...|(5,[0,1,2,3,4],[3...|(5,[0,1,2,3,4],[0...|[13.4509241509410...|[0.67254620754705.

In [46]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(test_predictions)

0.7344332232268187