## Load dataset from CSV

In [1]:
from pyspark.sql import DataFrame

# Read the training tweets; the first CSV row supplies the column names.
df = spark.read.csv("data/nepal_train.csv", header=True)
# Keep only the tweet text and the label (cast to double), then drop rows with nulls.
df = df.select("tweet_text", df["label"].cast("double").alias("label")).dropna()
# Optional export of the cleaned frame:
# df.write.csv("data/mycsv.csv")

# Register the cleaned frame as a SQL view and query the two working columns from it.
df.createOrReplaceTempView("tweets")
all_tweets = spark.sql("SELECT tweet_text, label FROM tweets")

# Feature extraction

## Tokenize

In [2]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

# Two tokenizers over the raw tweet text, both writing to a "words" column.
# Only `tokenizer` is applied in this cell; `regexTokenizer` is built but not used here.
tokenizer = Tokenizer(outputCol="words", inputCol="tweet_text")
regexTokenizer = RegexTokenizer(
    inputCol="tweet_text",
    outputCol="words",
    pattern="\\W",  # alternative: pattern="\\w+" together with gaps=False
)

# UDF returning the length of a token list as an integer column.
countTokens = udf(lambda token_list: len(token_list), IntegerType())

tokenized = tokenizer.transform(all_tweets)

# Working frame: the token lists, the label, and the per-tweet token count.
tok = (
    tokenized
    .select("words", "label")
    .withColumn("tokens", countTokens(col("words")))
)

In [4]:
# Preview the tokenized frame (first 20 rows, long cell values truncated).
tokenized.show(n=20, truncate=True)

+--------------------+-----+--------------------+
|          tweet_text|label|               words|
+--------------------+-----+--------------------+
|in pictures man i...|  0.0|[in, pictures, ma...|
|still visiting ne...|  1.0|[still, visiting,...|
|sending love to n...|  1.0|[sending, love, t...|
|devastating love ...|  0.0|[devastating, lov...|
|for so many years...|  1.0|[for, so, many, y...|
|god this is nepal...|  0.0|[god, this, is, n...|
|prayers for #nepa...|  1.0|[prayers, for, #n...|
|more than killed ...|  0.0|[more, than, kill...|
|like said these c...|  0.0|[like, said, thes...|
|earthquake of mag...|  0.0|[earthquake, of, ...|
|please tired of r...|  1.0|[please, tired, o...|
|visit this link t...|  0.0|[visit, this, lin...|
|      nepal fighting|  0.0|   [nepal, fighting]|
|may god give stre...|  1.0|[may, god, give, ...|
|#israelinnepal nu...|  1.0|[#israelinnepal, ...|
|nepal#kathmanduqu...|  1.0|[nepal#kathmanduq...|
|#nepal #uae nnepa...|  0.0|[#nepal, #uae, nn...|


## Stop words remover

In [3]:
from pyspark.ml.feature import StopWordsRemover

# Filter "words" into a "filtered" column, then keep it with the token count and label.
remover = StopWordsRemover(outputCol="filtered", inputCol="words")
filtered = remover.transform(tok).select("filtered", "tokens", "label")
# filtered.show()

## NGram

In [4]:
from pyspark.ml.feature import NGram

# Build 3-grams from the token lists; only the "ngrams" column is kept.
ngram = NGram(inputCol="words", outputCol="ngrams", n=3)
ngramDataFrame = ngram.transform(tok).select("ngrams")
# ngramDataFrame.show()

## HashingTF and IDF

In [5]:
from pyspark.ml import linalg as ml_linalg

def as_mllib(v):
    """Convert a ``pyspark.ml`` vector into its ``pyspark.mllib`` counterpart.

    Args:
        v: A ``pyspark.ml.linalg.SparseVector`` or ``DenseVector``.

    Returns:
        The equivalent ``pyspark.mllib.linalg`` vector, built with
        ``Vectors.sparse`` or ``Vectors.dense``.

    Raises:
        TypeError: If ``v`` is neither of the two supported vector types.
    """
    # Imported here so the helper does not depend on a later cell having
    # bound ``MLLibVectors`` in the notebook namespace.
    from pyspark.mllib.linalg import Vectors as MLLibVectors

    if isinstance(v, ml_linalg.SparseVector):
        return MLLibVectors.sparse(v.size, v.indices, v.values)
    if isinstance(v, ml_linalg.DenseVector):
        return MLLibVectors.dense(v.toArray())
    raise TypeError("Unsupported type: {0}".format(type(v)))

In [6]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.linalg import Vector as MLVector, Vectors as MLVectors
from pyspark.mllib.linalg import Vector as MLLibVector, Vectors as MLLibVectors
from pyspark.mllib.regression import LabeledPoint

# Term frequencies: hash the filtered tokens into a 50000-dimensional vector.
# (CountVectorizer is an alternative way to obtain term-frequency vectors.)
hashingTF = HashingTF(numFeatures=50000, inputCol="filtered", outputCol="rawFeatures")
featurizedData = hashingTF.transform(filtered)

# Fit IDF on the term frequencies and transform them into the "features" column.
idf = IDF(outputCol="features", inputCol="rawFeatures")
idfModel = idf.fit(featurizedData)
tfidfData = idfModel.transform(featurizedData)

# Result is discarded; append .show() to preview these columns.
tfidfData.select("filtered", "tokens", "features", "label")

# Convert (label, features) rows into mllib LabeledPoints for the RDD-based trainer.
pairs = tfidfData.select("label", "features").rdd
data = pairs.map(lambda row: LabeledPoint(row[0], as_mllib(row[1])))

# Train a model

## Linear SVM

In [7]:
# 60/40 train/test split using a fixed seed (1234).
splits = data.randomSplit(weights=[0.6, 0.4], seed=1234)
train, test = splits

In [8]:
from pyspark.mllib.classification import SVMWithSGD, SVMModel

# Build the model
model = SVMWithSGD.train(train, iterations=100)

# Evaluate on the training data: pair each true label with its prediction
# and compute the fraction of mismatches.
labelsAndPreds = train.map(lambda p: (p.label, model.predict(p.features)))
# Index into the (label, prediction) pair instead of unpacking it in the
# lambda signature; tuple parameters are a syntax error on Python 3.
trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(train.count())
print("Training Error = " + str(trainErr))

# Save the model (it can be reloaded with SVMModel.load on the same path).
model.save(sc, "target/tmp/pythonSVMWithSGDModel")
#sameModel = SVMModel.load(sc, "target/tmp/pythonSVMWithSGDModel")

Training Error = 0.10869968466


In [14]:
# Load an SVM model back from this path.
sameModel = SVMModel.load(sc=sc, path="target/tmp/pythonSVMWithSGDModel")

## Let's create a pipeline

In [8]:
from pyspark.ml import Pipeline
# Stage order must follow the column chain:
#   tokenizer: tweet_text  -> words
#   remover:   words       -> filtered
#   hashingTF: filtered    -> rawFeatures
#   idf:       rawFeatures -> features
# Without the stop-words remover, hashingTF's "filtered" input column is never produced.
pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf])
# Overwrite any earlier copy so re-running the cell does not fail on an existing path.
pipeline.write().overwrite().save("target/tmp/pythonHashingTFModel")