## Load dataset from CSV

In [37]:
from pyspark.sql import DataFrame

# Read the training CSV (its first row is the header) into a Spark DataFrame.
df = spark.read.option("header", "true").csv("data/nepal_train.csv")

# Keep only the tweet text and the label (cast to double), then drop rows
# with missing values.
label_as_double = df["label"].cast("double").alias("label")
df = df.select(df["tweet_text"], label_as_double).dropna()

# Expose the cleaned frame as the "tweets" temp view and read it back via SQL.
df.createOrReplaceTempView("tweets")
all_tweets = spark.sql("SELECT tweet_text, label FROM tweets")

# Feature extraction

## Tokenize

In [38]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

# Two tokenizers are configured over the tweet text; only `tokenizer` is
# applied below, `regexTokenizer` is kept available as an alternative.
tokenizer = Tokenizer(inputCol="tweet_text", outputCol="words")
regexTokenizer = RegexTokenizer(
    inputCol="tweet_text",
    outputCol="words",
    pattern="\\W",
)
# alternatively, pattern="\\w+", gaps(False)

# UDF returning the number of tokens in a word list as an integer column.
countTokens = udf(lambda word_list: len(word_list), IntegerType())

tokenized = tokenizer.transform(all_tweets)

# Keep the token list and label, and add the per-tweet token count.
tok = (
    tokenized
    .select("words", "label")
    .withColumn("tokens", countTokens(col("words")))
)

## Stop words remover

In [39]:
from pyspark.ml.feature import StopWordsRemover

# Remove stop words from the "words" column into a new "filtered" column,
# then keep only the columns needed by the later stages.
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
filtered = remover.transform(tok).select("filtered", "tokens", "label")

## NGram

In [76]:
from pyspark.ml.feature import NGram

# Build 3-grams from the "words" column of `tok` (the tokens before stop-word
# removal) and keep only the resulting "ngrams" column.
ngram = NGram(n=3, inputCol="words", outputCol="ngrams")
ngramDataFrame = ngram.transform(tok).select("ngrams")

## HashingTF and IDF

In [70]:
from pyspark.ml import linalg as ml_linalg

def as_mllib(v):
    """Convert a ``pyspark.ml.linalg`` vector to its ``pyspark.mllib.linalg`` form.

    Args:
        v: A ``pyspark.ml.linalg.SparseVector`` or ``DenseVector``.

    Returns:
        The vector built by ``pyspark.mllib.linalg.Vectors.sparse`` (from the
        size, indices and values of ``v``) or ``Vectors.dense`` (from
        ``v.toArray()``).

    Raises:
        TypeError: If ``v`` is neither a sparse nor a dense ``pyspark.ml`` vector.
    """
    # Imported locally so the function does not depend on an import performed
    # in another notebook cell having been executed first.
    from pyspark.mllib.linalg import Vectors as MLLibVectors

    if isinstance(v, ml_linalg.SparseVector):
        return MLLibVectors.sparse(v.size, v.indices, v.values)
    if isinstance(v, ml_linalg.DenseVector):
        return MLLibVectors.dense(v.toArray())
    raise TypeError("Unsupported type: {0}".format(type(v)))

In [69]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.linalg import Vector as MLVector, Vectors as MLVectors
from pyspark.mllib.linalg import Vector as MLLibVector, Vectors as MLLibVectors
from pyspark.mllib.regression import LabeledPoint

# Term frequencies: hash every stop-word-filtered token list into a
# 50000-dimensional "rawFeatures" vector.
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=50000)
featurizedData = hashingTF.transform(filtered)
# alternatively, CountVectorizer can also be used to get term frequency vectors

# Fit IDF on the term-frequency vectors and rescale them into "features".
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
tfidfData = idfModel.transform(featurizedData)

# Build the LabeledPoint RDD: each row's pyspark.ml feature vector is passed
# through as_mllib before being wrapped. Fields are read by name so the
# mapping does not depend on the column order of the select.
pairs = tfidfData.select("label", "features").rdd
training = pairs.map(
    lambda row: LabeledPoint(row["label"], as_mllib(row["features"]))
)

# Train a model

## Naive Bayes

In [73]:
from pyspark.mllib.classification import NaiveBayes 
# Train an MLlib Naive Bayes classifier on the `training` RDD of LabeledPoints,
# using the default arguments of NaiveBayes.train. No evaluation ("check") of
# the fitted model is performed here.
model = NaiveBayes.train(training)