In [1]:
#Make MongoDB connection
from pymongo import MongoClient
client = MongoClient('compute-0-11', 27017)

db = client.FacebookChallenge_akar1
collection1 = db.fb_hw
collection2  = db.fb_hw_test
contentsTrain = collection1.find().limit(10000)
contentsTest = collection2.find().limit(10000)

In [2]:
#Create spark connector
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import SQLContext
from pyspark import SparkConf, SparkContext

spark = SparkSession.builder \
        .appName("Facebook_nebaditn") \
        .getOrCreate()

sc = spark.sparkContext
sqlContext = SQLContext(sc)

In [3]:

from bson import json_util, ObjectId
import json
rddSan1 = json.loads(json_util.dumps(contentsTrain))
rddSan2 = json.loads(json_util.dumps(contentsTest))

In [4]:
#Generate RDD
rddTrain = sc.parallelize(rddSan1)
rddTest = sc.parallelize(rddSan2)

In [5]:
#Create schema to generate train and test dataframe
schemaTrain = StructType([StructField("Body", StringType(), True), 
                     StructField("Id", IntegerType(), True), 
                     StructField("Tags", StringType(), True),
                     StructField("Title", StringType(), True),
                     StructField("_id", StringType(), True)])
schemaTest = StructType([StructField("Body", StringType(), True), 
                     StructField("Id", IntegerType(), True), 
                     StructField("Title", StringType(), True),
                     StructField("_id", StringType(), True)])

train = sqlContext.createDataFrame(rddTrain, schema=schemaTrain)
test = sqlContext.createDataFrame(rddTest, schema=schemaTest)

In [6]:
#Print and check the data
train.show()
test.show()

+--------------------+---+--------------------+--------------------+--------------------+
|                Body| Id|                Tags|               Title|                 _id|
+--------------------+---+--------------------+--------------------+--------------------+
|<p>I'd like to ch...|  1|php image-process...|How to check if a...|{$oid=5a047973393...|
|<p>In my favorite...|  2|             firefox|How can I prevent...|{$oid=5a047973393...|
|<p>This is probab...|  4|     c# url encoding|How do I replace ...|{$oid=5a047973393...|
|<pre><code>functi...|  5|php api file-get-...|How to modify who...|{$oid=5a047973393...|
|<p>I am using a m...|  6|proxy active-dire...|setting proxy in ...|{$oid=5a047973393...|
|<p>My image is ca...|  7|           core-plot|How to draw barpl...|{$oid=5a047973393...|
|<p>I've decided t...|  8|c# asp.net window...|How to fetch an X...|{$oid=5a047973393...|
|<p>Do you know of...|  9|.net javascript c...|.NET library for ...|{$oid=5a047973393...|
|<p>I'm us

In [7]:
# Remove HTML Tags from the Body text
#Here I have used a library beautiful soup to remove the HTML tags from the body section in train and test
from bs4 import BeautifulSoup
from pyspark.sql.functions import udf
from pyspark.sql.types import *

train = train.rdd.map(lambda x: (x[0], x[1], x[2], x[3], x[4], BeautifulSoup(x[0]).text))
train = train.toDF()
test = test.rdd.map(lambda x: (x[0], x[1], x[2], x[3], BeautifulSoup(x[0]).text))
test =  test.toDF()

In [8]:
#Check the new dataframe 
train.show()
test.show()

+--------------------+---+--------------------+--------------------+--------------------+--------------------+
|                  _1| _2|                  _3|                  _4|                  _5|                  _6|
+--------------------+---+--------------------+--------------------+--------------------+--------------------+
|<p>I'd like to ch...|  1|php image-process...|How to check if a...|{$oid=5a047973393...|I'd like to check...|
|<p>In my favorite...|  2|             firefox|How can I prevent...|{$oid=5a047973393...|In my favorite ed...|
|<p>This is probab...|  4|     c# url encoding|How do I replace ...|{$oid=5a047973393...|This is probably ...|
|<pre><code>functi...|  5|php api file-get-...|How to modify who...|{$oid=5a047973393...|function modify(....|
|<p>I am using a m...|  6|proxy active-dire...|setting proxy in ...|{$oid=5a047973393...|I am using a mach...|
|<p>My image is ca...|  7|           core-plot|How to draw barpl...|{$oid=5a047973393...|My image is canno...|
|

In [9]:
#Filter columns from the data frame
workingTrain = train.selectExpr("_2 as ID", "_6 as Body", "_3 as Tags", "_4 as Title")
workingTest = test.selectExpr("_2 as ID", "_5 as Body", "_3 as Title")

In [10]:
#Again check the data frame
workingTrain.show()
workingTest.show()

+---+--------------------+--------------------+--------------------+
| ID|                Body|                Tags|               Title|
+---+--------------------+--------------------+--------------------+
|  1|I'd like to check...|php image-process...|How to check if a...|
|  2|In my favorite ed...|             firefox|How can I prevent...|
|  4|This is probably ...|     c# url encoding|How do I replace ...|
|  5|function modify(....|php api file-get-...|How to modify who...|
|  6|I am using a mach...|proxy active-dire...|setting proxy in ...|
|  7|My image is canno...|           core-plot|How to draw barpl...|
|  8|I've decided to c...|c# asp.net window...|How to fetch an X...|
|  9|Do you know of a ...|.net javascript c...|.NET library for ...|
| 10|I'm using SQL Ser...|sql variables par...|SQL Server : proc...|
| 11|Some commercial o...|.net obfuscation ...|How do commercial...|
| 13|how can I move In...|postfix migration...|Migrate from Mdae...|
|  3|I am import matla...|r matlab

In [11]:
#Select particular columns
workingTrain = workingTrain.selectExpr("Body", "Title", "Tags")
workingTest = workingTest.selectExpr("ID", "Title", "Body")

In [12]:
workingTrain.show()
workingTest.show()

+--------------------+--------------------+--------------------+
|                Body|               Title|                Tags|
+--------------------+--------------------+--------------------+
|I'd like to check...|How to check if a...|php image-process...|
|In my favorite ed...|How can I prevent...|             firefox|
|This is probably ...|How do I replace ...|     c# url encoding|
|function modify(....|How to modify who...|php api file-get-...|
|I am using a mach...|setting proxy in ...|proxy active-dire...|
|My image is canno...|How to draw barpl...|           core-plot|
|I've decided to c...|How to fetch an X...|c# asp.net window...|
|Do you know of a ...|.NET library for ...|.net javascript c...|
|I'm using SQL Ser...|SQL Server : proc...|sql variables par...|
|Some commercial o...|How do commercial...|.net obfuscation ...|
|how can I move In...|Migrate from Mdae...|postfix migration...|
|I am import matla...|R Error Invalid t...|r matlab machine-...|
|This may sound li...|Cra

In [13]:
#Remove the stop words from the body
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
#Tokenize the body column
tokenizer = Tokenizer(inputCol = "Body" , outputCol = "tokenizedBody")
workingTrain = tokenizer.transform(workingTrain) # For the train data
workingTest = tokenizer.transform(workingTest) # For the train data
remover = StopWordsRemover(inputCol = "tokenizedBody", outputCol = "filteredCol")
workingTrain = remover.transform(workingTrain) #For Train data
workingTest = remover.transform(workingTest) #For Test data

In [14]:
workingTrain.show()
workingTest.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                Body|               Title|                Tags|       tokenizedBody|         filteredCol|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|I'd like to check...|How to check if a...|php image-process...|[i'd, like, to, c...|[like, check, upl...|
|In my favorite ed...|How can I prevent...|             firefox|[in, my, favorite...|[favorite, editor...|
|This is probably ...|How do I replace ...|     c# url encoding|[this, is, probab...|[probably, simple...|
|function modify(....|How to modify who...|php api file-get-...|[function, modify...|[function, modify...|
|I am using a mach...|setting proxy in ...|proxy active-dire...|[i, am, using, a,...|[using, machine, ...|
|My image is canno...|How to draw barpl...|           core-plot|[my, image, is, c...|[image, post, , l...|
|I've decided to c...|How to fetch an

In [15]:
#Tokenize the Tags
tagTokenizer = Tokenizer(inputCol="Tags", outputCol="tokenizedTags")
workingTrain = tagTokenizer.transform(workingTrain)
#workingTrain.show()

In [16]:
#Tokenize the Title
titleTokenizer = Tokenizer(inputCol="Title", outputCol="tokenizedTitle")
workingTrain = titleTokenizer.transform(workingTrain)
workingTest = titleTokenizer.transform(workingTest)

In [17]:
#Remove stop words from the Tokenized title
titleRemover = StopWordsRemover(inputCol = "tokenizedTitle", outputCol = "filteredTitle")
workingTrain = titleRemover.transform(workingTrain) #For Train data
workingTest = titleRemover.transform(workingTest) #For Test data

In [18]:
workingTrain = workingTrain.selectExpr("filteredCol as tokenizedBody", "filteredTitle as tokenizedTitle","tokenizedTags")
workingTest = workingTest.selectExpr("ID", "filteredCol as tokenizedBody", "filteredTitle as tokenizedTitle")

In [19]:
#Check the Dataframe
workingTrain.show()
workingTest.show()

+--------------------+--------------------+--------------------+
|       tokenizedBody|      tokenizedTitle|       tokenizedTags|
+--------------------+--------------------+--------------------+
|[like, check, upl...|[check, uploaded,...|[php, image-proce...|
|[favorite, editor...|[prevent, firefox...|           [firefox]|
|[probably, simple...|[replace, special...| [c#, url, encoding]|
|[function, modify...|[modify, whois, c...|[php, api, file-g...|
|[using, machine, ...|[setting, proxy, ...|[proxy, active-di...|
|[image, post, , l...|[draw, barplot, w...|         [core-plot]|
|[decided, convert...|[fetch, xml, feed...|[c#, asp.net, win...|
|[know, .net, libr...|[.net, library, g...|[.net, javascript...|
|[using, sql, serv...|[sql, server, :, ...|[sql, variables, ...|
|[commercial, obfu...|[commercial, obfu...|[.net, obfuscatio...|
|[move, inbox/sent...|[migrate, mdaemon...|[postfix, migrati...|
|[import, matlab, ...|[r, error, invali...|[r, matlab, machi...|
|[may, sound, like...|[cr

In [20]:
#The intuition behind applying ngrams was to try the body and tag with uni and bigram, but due to slow cluster, this step was not completing.
#Limiting it to unigram.
from pyspark.ml.feature import NGram
ngramTags = NGram(n=1, inputCol="tokenizedTags", outputCol="ngramsTags")
workingTrain = ngramTags.transform(workingTrain)

In [21]:
workingTrain.select("ngramsTags").show(1,False)

+--------------------------------------------------------+
|ngramsTags                                              |
+--------------------------------------------------------+
|[php, image-processing, file-upload, upload, mime-types]|
+--------------------------------------------------------+
only showing top 1 row



In [22]:
workingTrain = workingTrain.selectExpr("tokenizedBody", "tokenizedTitle","ngramsTags")

In [23]:
workingTrain.show()

+--------------------+--------------------+--------------------+
|       tokenizedBody|      tokenizedTitle|          ngramsTags|
+--------------------+--------------------+--------------------+
|[like, check, upl...|[check, uploaded,...|[php, image-proce...|
|[favorite, editor...|[prevent, firefox...|           [firefox]|
|[probably, simple...|[replace, special...| [c#, url, encoding]|
|[function, modify...|[modify, whois, c...|[php, api, file-g...|
|[using, machine, ...|[setting, proxy, ...|[proxy, active-di...|
|[image, post, , l...|[draw, barplot, w...|         [core-plot]|
|[decided, convert...|[fetch, xml, feed...|[c#, asp.net, win...|
|[know, .net, libr...|[.net, library, g...|[.net, javascript...|
|[using, sql, serv...|[sql, server, :, ...|[sql, variables, ...|
|[commercial, obfu...|[commercial, obfu...|[.net, obfuscatio...|
|[move, inbox/sent...|[migrate, mdaemon...|[postfix, migrati...|
|[import, matlab, ...|[r, error, invali...|[r, matlab, machi...|
|[may, sound, like...|[cr

In [24]:
#Convert Spark Dataframe to Pandas
workingTrain = workingTrain.toPandas()
workingTest = workingTest.toPandas()

In [25]:
#Display Pandas
#Converted the dataframe to Pandas to apply the scikit library
workingTrain
workingTest

Unnamed: 0,ID,tokenizedBody,tokenizedTitle
0,6034196,"[disable, site-specific, hotkeys, (and, if), a...","[getting, rid, site-specific, hotkeys]"
1,6034197,"[gateway-to-gateway, vpn, setup, linksys, rv04...","[nodes, inside, cisco, vpn., incoming, ssh, re..."
2,6034198,"[changing, vcenter, servers, recently,, old, v...","[remove, old, vcenter, servers, vmware, vspher..."
3,6034199,"[variable, lifted, contents, of,, div, page,, ...","[replace, <span>, element, var, containing, html]"
4,6034200,"[today, purchase, small, cms, system., cms, tr...","[php, included, html, content, affect, seo?]"
5,6034201,"[following, code, visual, studio, +, resharper...","[acronym, menu, visual, studio, 2010?]"
6,6034202,"[looking, way, iterate, registers, yardoc, @ma...","[iterating, registers, yardoc, `@macro`]"
7,6034203,"[importing, classic, asp, pages, new, sitefini...","[url, rewriting, winforms, console, application]"
8,6034219,"[alphabetise, (by, title), wordpress, posts?]","[alphabetised, wordpress, posts]"
9,6034204,"[working, asp.net, application, company, speci...","[create, non-culture, based, resourcemanager, ..."


In [26]:
#Reference: https://stackoverflow.com/questions/10526579/use-scikit-learn-to-classify-into-multiple-categories
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

classifier = OneVsRestClassifier(LinearSVC())
X_train = np.array(workingTrain["tokenizedBody"])
X_test = np.array(workingTest["tokenizedBody"])

In [27]:
y_train_text= np.array(workingTrain["ngramsTags"])

In [28]:
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(y_train_text)

In [29]:
classifier = Pipeline([
    ('vectorizer', CountVectorizer(tokenizer=lambda doc: doc,lowercase=False)),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC()))])

In [30]:
classifier.fit(X_train, Y)

Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
   ...lti_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=1))])

In [31]:
predicted = classifier.predict(X_train)

In [32]:
all_labels = mlb.inverse_transform(predicted)

In [33]:
#Print the predicted labels
all_labels

[(u'file-upload', u'image-processing', u'mime-types', u'php', u'upload'),
 (u'firefox',),
 (u'c#', u'encoding', u'url'),
 (u'api', u'file-get-contents', u'php'),
 (u'active-directory', u'jmeter', u'proxy'),
 (u'core-plot',),
 (u'asp.net', u'c#', u'windows-phone-7'),
 (u'.net', u'code-generation', u'javascript'),
 (u'calls', u'parameters', u'procedure', u'sql', u'variables'),
 (u'.net', u'obfuscation', u'reflector'),
 (u'mdaemon', u'migration', u'postfix'),
 (u'machine-learning', u'matlab', u'r'),
 (u'algorithm', u'language-agnostic', u'random'),
 (u'documentation', u'expl3', u'latex3'),
 (u'windows-7',),
 (u'conventions', u'php', u'url-routing'),
 (),
 (),
 (u'code-generation', u'javascript', u'minify', u'playframework'),
 (u'hash',
  u'multidimensional-array',
  u'php',
  u'simplexml-load-string',
  u'xml'),
 (u'cancer', u'healthcare', u'medical-science'),
 (u'.net', u'c#', u'linq'),
 (),
 (u'iis',),
 (u'c#',),
 (u'haskell',),
 (u'gnu', u'makefile', u'visual-studio'),
 (u'html', u'lin

In [34]:
#Reference for encoding the list to UTF-8 : https://stackoverflow.com/questions/27714750/encoding-a-list-of-tuples-with-python
decoded = [[word.encode("utf8") for word in sets] for sets in all_labels]

In [35]:
#Print the decoded list
decoded

[['file-upload', 'image-processing', 'mime-types', 'php', 'upload'],
 ['firefox'],
 ['c#', 'encoding', 'url'],
 ['api', 'file-get-contents', 'php'],
 ['active-directory', 'jmeter', 'proxy'],
 ['core-plot'],
 ['asp.net', 'c#', 'windows-phone-7'],
 ['.net', 'code-generation', 'javascript'],
 ['calls', 'parameters', 'procedure', 'sql', 'variables'],
 ['.net', 'obfuscation', 'reflector'],
 ['mdaemon', 'migration', 'postfix'],
 ['machine-learning', 'matlab', 'r'],
 ['algorithm', 'language-agnostic', 'random'],
 ['documentation', 'expl3', 'latex3'],
 ['windows-7'],
 ['conventions', 'php', 'url-routing'],
 [],
 [],
 ['code-generation', 'javascript', 'minify', 'playframework'],
 ['hash', 'multidimensional-array', 'php', 'simplexml-load-string', 'xml'],
 ['cancer', 'healthcare', 'medical-science'],
 ['.net', 'c#', 'linq'],
 [],
 ['iis'],
 ['c#'],
 ['haskell'],
 ['gnu', 'makefile', 'visual-studio'],
 ['html', 'line-breaks', 'semantic'],
 ['bounds-checker', 'c++', 'gcc', 'stl'],
 [],
 ['asp.net-m

In [44]:
from sklearn.metrics import f1_score
#The score is predicted on Train data.
#weighted: Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). 
#This alters ‘macro’ to account for label imbalance; it can result in an F-score that is not between precision and recall.
#Predict the accuracy on the train set
f1_score(predicted,Y, average='weighted')


0.91959012867433809

In [45]:
#Micro: Calculate metrics globally by counting the total true positives, false negatives and false positives.
f1_score(predicted,Y, average='micro')

0.89666515426497284

In [None]:
#Here I was trying to see the association rules in the body and Title sections. However, this section of the code did not produced 
#desired outcomes.

In [None]:
#Trying to fit association rules
#from pyspark.ml.fpm import FPGrowth
#TagsfpGrowth = FPGrowth(itemsCol="tokenizedTags")
#modelFPGrowth = TagsfpGrowth.fit(workingTrain)

In [None]:
#modelFPGrowth.associationRules.show()
#modelFPGrowth.freqItemsets.show()

In [None]:
#modelFPGrowth.transform(workingTrain).show()

In [None]:
#modelFPGrowth.associationRules.show()