## Import packages

In [10]:
from pyspark.sql import DataFrame
from pyspark.ml import *

## Load dataset as DataFrame from JSON

In [11]:
# A JSON dataset is pointed to by path.
# The path can be either a single text file or a directory storing text files
path = "data/nepal_pos.json"
tweetsDF = spark.read.json(path)

In [12]:
# The inferred schema can be visualized using the printSchema() method
tweetsDF.printSchema()

root
 |-- contributors: string (nullable = true)
 |-- coordinates: struct (nullable = true)
 |    |-- coordinates: array (nullable = true)
 |    |    |-- element: double (containsNull = true)
 |    |-- type: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- entities: struct (nullable = true)
 |    |-- hashtags: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- text: string (nullable = true)
 |    |-- media: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- display_url: string (nullable = true)
 |    |    |    |-- expanded_url: string (nullable = true)
 |    |    |    |-- id: long (nullable = true)
 |    |    |    |-- id_str: string (nullable = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true

In [13]:
# Creates a temporary view using the DataFrame
tweetsDF.createOrReplaceTempView("tweets")

In [15]:
# SQL statements can be run by using the sql methods provided by spark
tweetTextsDF = spark.sql("SELECT id,text FROM tweets")
tweetTextsDF.count()
#tweetTextsDF.show()

4687

## Read from text file

In [4]:
from pyspark.sql import Row

In [9]:
# Load a text file and convert each line to a Row.
lines = sc.textFile("data/nepal_pos_tot.csv")
header = lines.first() #extract header
lines = lines.filter(lambda row : row != header)   #filter out header
parts = lines.map(lambda l: l.split(","))
tweets = parts.map(lambda p: Row(tweet_id=p[0], tweet_text=p[1], label=p[2]))

# Infer the schema, and register the DataFrame as a table.
schemaTweets = spark.createDataFrame(tweets)
schemaTweets.createOrReplaceTempView("tweets")
# SQL can be run over DataFrames that have been registered as a table.
positive_tweets = spark.sql("SELECT tweet_text FROM tweets")
tweetTexts = positive_tweets.rdd.map(lambda p: "Text: " + p.tweet_text).collect()
for tweet_text in tweetTexts:
    print(tweet_text)

Text: "#indiawithnepal my thoughts with those affected in nepal quake very strong tremors in padrauna kushinagar too we re just bordering nepal"
Text: "nepal earthquake in maps tweets and pictures"
Text: "#news nepal home ministry says at least people killed in the earthquake nepal home ministry says #tu"
Text: "#earthquake helpline numbers of the indian embassy in nepal \n+ \n\n+"
Text: "more than killed in powerful nepal #earthquake say government officials and police"
Text: "#earthquakeinnepal\nhelpline numbers of indian embassy in kathmandu \n\n \n\nplease help by re tweeting"
Text: "divastating earthquake shakes the soul of the people #nepal"
Text: "#news tv dead injured in bangladesh from nepal quake tv report dead injured in banglades #tu"
Text: "pictures from kathmandu show damage of magnitude #earthquake"
Text: "tv dead injured in bangladesh from nepal quake tv report dead injured in bangladesh from eart"
Text: "#breaking at least killed in nepal earthquake home ministry"
Text

In [7]:
spark.read.option("header","true").csv("data/nepal_pos_tot.csv")

DataFrame[tweet_id: string, tweet_text: string, label: string]

## Tokenise

In [26]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

tokenizer = Tokenizer(inputCol="tweet_text", outputCol="words")
regexTokenizer = RegexTokenizer(inputCol="tweet_text", outputCol="words", pattern="\\W")
# alternatively, pattern="\\w+", gaps(False)

countTokens = udf(lambda words: len(words), IntegerType())

tokenized = tokenizer.transform(positive_tweets)
tokenized.select("tweet_text", "words")\
    .withColumn("tokens", countTokens(col("words"))).show(truncate=False)
    
regexTokenized = regexTokenizer.transform(positive_tweets)
regexTokenized.select("tweet_text", "words") \
    .withColumn("tokens", countTokens(col("words"))).show(truncate=False)    

+------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+------+
|tweet_text                                                                                                                                |words                                                                                                                                                           |tokens|
+------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+------+
|"#indiawithnepal my thoughts with those affected in nepal quake very str

## Stop words remover

In [27]:
from pyspark.ml.feature import StopWordsRemover

remover = StopWordsRemover(inputCol="words", outputCol="filtered")
remover.transform(tokenized).show(truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------+
|tweet_text                                                                                                                                |words                                                                                                                                                           |filtered                                                                                                             |
+------------------------------------------------------------------------------------------------------------------------------------------+--------------------

## NGram

In [29]:
from pyspark.ml.feature import NGram

ngram = NGram(n=3, inputCol="words", outputCol="ngrams")

ngramDataFrame = ngram.transform(tokenized)
ngramDataFrame.select("ngrams").show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|ngrams                                                                                                                                                                                                                                                                                                                                                                                 |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## HashingTF and IDF

In [32]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(tokenized)
# alternatively, CountVectorizer can also be used to get term frequency vectors

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

rescaledData.select("tweet_text", "features").show()

+--------------------+--------------------+
|          tweet_text|            features|
+--------------------+--------------------+
|"#indiawithnepal ...|(20,[0,4,5,7,8,10...|
|"nepal earthquake...|(20,[0,5,9,13,16]...|
|"#news nepal home...|(20,[5,6,8,10,16,...|
|"#earthquake help...|(20,[2,3,5,7,8,9,...|
|"more than killed...|(20,[0,1,5,8,10,1...|
|"#earthquakeinnep...|(20,[0,2,3,5,7,9,...|
|"divastating eart...|(20,[2,3,6,9,10,1...|
|"#news tv dead in...|(20,[1,5,6,8,10,1...|
|"pictures from ka...|(20,[0,1,3,15,18,...|
|"tv dead injured ...|(20,[1,5,8,9,10,1...|
|"#breaking at lea...|(20,[5,8,13,16,17...|
|"magnitude quake ...|(20,[1,3,7,8,9,10...|
|"earthquake km nn...|(20,[1,3,6,8,9,12...|
|"witness nepal qu...|(20,[0,1,7,8,9,10...|
|"people finder li...|(20,[2,7,9,10,13,...|
|"tv dead injured ...|(20,[1,5,8,9,10,1...|
|"weve just launch...|(20,[2,5,7,8,9,10...|
|"indian air force...|(20,[1,3,4,6,7,8,...|
|"may allah look a...|(20,[5,7,8,9,10,1...|
|"it is also climb...|(20,[1,5,6