# Screencast Code

The follow code is the same used in the "Text Processing" screencast. Run each code cell to see how 

In [96]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import RegexTokenizer, CountVectorizer, \
    IDF, StringIndexer, VectorAssembler, Normalizer, StandardScaler, MinMaxScaler
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType
from pyspark.ml.regression import LinearRegression


import re

In [57]:
# create a SparkSession: note this step was left out of the screencast
spark = SparkSession.builder \
    .master("local") \
    .appName("Word Count") \
    .getOrCreate()

# Read in the Data Set

In [58]:
stack_overflow_data = 'Train_onetag_small.json'

In [59]:
df = spark.read.json(stack_overflow_data)

In [60]:
df.head()

Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php')

# Tokenization

Tokenization splits strings into separate words. Spark has a [Tokenizer](https://spark.apache.org/docs/latest/ml-features.html#tokenizer) class as well as RegexTokenizer, which allows for more control over the tokenization process.

In [61]:
# split the body text into separate words
regexTokenizer = RegexTokenizer(inputCol="Body", outputCol="words", pattern="\\W")
df = regexTokenizer.transform(df)
df.head()

Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'which

# CountVectorizer

In [62]:
# find the term frequencies of the words
cv = CountVectorizer(inputCol="words", outputCol="TF", vocabSize=1000)
cvmodel = cv.fit(df)
df = cvmodel.transform(df)
df.take(1)

[Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'whic

In [63]:
# show the last 10 terms in the vocabulary
cvmodel.vocabulary[-10:]

['customer',
 'desktop',
 'buttons',
 'previous',
 'math',
 'master',
 '000',
 'blog',
 'comes',
 'wordpress']

# Inter-document Frequency

In [64]:
idf = IDF(inputCol="TF", outputCol="TFIDF")
idfModel = idf.fit(df)
df = idfModel.transform(df)
df.head()

Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'which

# StringIndexer

In [65]:
indexer = StringIndexer(inputCol="oneTag", outputCol="label")
df = indexer.fit(df).transform(df)

In [66]:
df.head()

Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'which

## Quiz

show Id = 1112 words length

In [67]:
body_length = udf(lambda x: len(x), IntegerType())
df = df.withColumn("BodyLength", body_length(df.words))

In [68]:
df.select(["BodyLength", "Id"]).where(df.Id == "1112").show()

+----------+----+
|BodyLength|  Id|
+----------+----+
|        63|1112|
+----------+----+



Create a new column that concatenates the question title and body.
Apply the same functions we used before to compute the number of words in this combined column.
What's the value in this new column for Id = 5123?

In [69]:
regexTokenizer = RegexTokenizer(inputCol="Title", outputCol="title_word", pattern="\\W")
df = regexTokenizer.transform(df)

In [70]:
title_body_length = udf(lambda x,y : len(x)+len(y), IntegerType())
df = df.withColumn("TitleBodyLength", title_body_length(df.words, df.title_word))

In [71]:
df.select(["TitleBodyLength", "Id"]).where(df.Id == "5123").show()

+---------------+----+
|TitleBodyLength|  Id|
+---------------+----+
|            135|5123|
+---------------+----+



Create a vector from the combined Title + Body length column. 
In the next few questions, you'll try different normalizer/scaler methods on this new column.
Using the Normalizer method what's the normalized value for question Id = 512?

In [72]:
number_of_paragraphs = udf(lambda x, y : len(re.findall("</p>", x+' '+y)), IntegerType())
number_of_links = udf(lambda x, y: len(re.findall("</a>", x+' '+y)), IntegerType())

In [73]:
df = df.withColumn("NumParagraphs", number_of_paragraphs(df.Body, df.Title))
df = df.withColumn("NumLinks", number_of_links(df.Body, df.Title))

In [74]:
assembler = VectorAssembler(inputCols=["TitleBodyLength", "NumParagraphs", "NumLinks"], outputCol="NumFeatures")
df = assembler.transform(df)

In [76]:
scaler = Normalizer(inputCol="NumFeatures", outputCol="ScaledNumFeatures")
df = scaler.transform(df)

In [77]:
df.select(["ScaledNumFeatures", "Id"]).where(df.Id == "512").show()

+--------------------+---+
|   ScaledNumFeatures| Id|
+--------------------+---+
|[0.99938499379176...|512|
+--------------------+---+



Using the StandardScaler method (scaling both the mean and the standard deviation) what's the normalized value for question Id = 512?

In [78]:
scaler2 = StandardScaler(inputCol="NumFeatures", outputCol="ScaledNumFeatures2", withStd=True)
scalerModel = scaler2.fit(df)
df = scalerModel.transform(df)

In [83]:
# df.select("ScaledNumFeatures2").where(df.Id == "512")
df.filter(df.Id == '512').show()

+--------------------+---+--------------------+--------------------+------+--------------------+--------------------+--------------------+-----+----------+--------------------+---------------+-------------+--------+--------------+--------------------+--------------------+
|                Body| Id|                Tags|               Title|oneTag|               words|                  TF|               TFIDF|label|BodyLength|          title_word|TitleBodyLength|NumParagraphs|NumLinks|   NumFeatures|   ScaledNumFeatures|  ScaledNumFeatures2|
+--------------------+---+--------------------+--------------------+------+--------------------+--------------------+--------------------+-----+----------+--------------------+---------------+-------------+--------+--------------+--------------------+--------------------+
|<p>I'd like to ha...|512|java optimization...|How can I see the...|  java|[p, i, d, like, t...|(1000,[0,1,2,3,4,...|(1000,[0,1,2,3,4,...|  2.0|        46|[how, can, i, see...|     

Using the MinMAxScaler method what's the normalized value for question Id = 512?

In [86]:
scaler3 = MinMaxScaler(inputCol="NumFeatures", outputCol="ScaledNumFeatures3")
scalerModel = scaler3.fit(df)
df = scalerModel.transform(df)

In [88]:
df.select("ScaledNumFeatures3").where(df.Id == "512").show()


+--------------------+
|  ScaledNumFeatures3|
+--------------------+
|[0.00624833820792...|
+--------------------+

