# Exercise 2: Advanced Analytics NLP

In [1]:
import pandas as pd
pd.set_option('max_colwidth', 800)

import pyspark.sql.functions as F

# Create a spark context that includes a 3rd party jar for NLP

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession \
        .builder \
        .getOrCreate()
spark

# Read multiple files in a dir as one Dataframe

In [3]:
dataPath = "./Data/Reddit/*.json"
df = spark.read.json(dataPath)
print(df.count())
df.printSchema()

1000
root
 |-- author: string (nullable = true)
 |-- created: double (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- title: string (nullable = true)



# Deal with Struct type to query subfields 

In [20]:
title = "data.title"
author = "data.author"
dfAuthorTitle = df.select(df.title, df.author)
dfTitle = df.select(df.title)
dfAuthorTitle.limit(5).toPandas()

Unnamed: 0,title,author
0,"I‚Äôve found a few funny memories during lockdown. This is from my 1st tour in 89, backstage in Vegas.",t2_11yd5w
1,Times Square right now,t2_cxhbp
2,Joe Biden elected president of the United States,t2_fkqop
3,"The Senate. Upvote this so that people see it when they Google ""The Senate"".",t2_nifnj
4,My cab driver tonight was so excited to share with me that he‚Äôd made the cover of the calendar. I told him I‚Äôd help let the world see,t2_aa1ng


# Try to implement the equivalent of flatMap in dataframes

In [5]:
dfWordCount = df.select(F.explode(F.split(dfAuthorTitle.title, "\\s+")) \
            .alias("word")) \
            .groupBy("word") \
            .count() \
            .orderBy(F.desc('count'))
dfWordCount.limit(10).toPandas()

Unnamed: 0,word,count
0,the,377
1,a,322
2,to,276
3,of,196
4,in,192
5,and,165
6,I,152
7,is,131
8,for,123
9,my,121


# Use an NLP libary to do Part-of-Speech Tagging

In [22]:
# Loading Libraries
from nltk.tag import DefaultTagger
  
# Defining Tag
tagging = DefaultTagger('NN')

# Transform exploded data to list of words
pd_df = pd.DataFrame(data=dfTitle.toPandas())
pd_df

# Tagging
tagging.tag(pd_df.values.tolist())

[(['I‚Äôve found a few funny memories during lockdown. This is from my 1st tour in 89, backstage in Vegas.'],
  'NN'),
 (['Times Square right now'], 'NN'),
 (['Joe Biden elected president of the United States'], 'NN'),
 (['The Senate. Upvote this so that people see it when they Google "The Senate".'],
  'NN'),
 (['My cab driver tonight was so excited to share with me that he‚Äôd made the cover of the calendar. I told him I‚Äôd help let the world see'],
  'NN'),
 (['UPVOTE so everyone sees we got SUPPORT'], 'NN'),
 (['A short story'], 'NN'),
 (['Guardians of the Front Page'], 'NN'),
 (['This is what happens when one company owns dozens of local news stations'],
  'NN'),
 (['Heat index was 110 degrees so we offered him a cold drink. He went for a full body soak instead'],
  'NN'),
 (['GME YOLO update ‚Äî Jan 28 2021'], 'NN'),
 (['If this is you: Fuck you'], 'NN'),
 (['She did her best ok?'], 'NN'),
 (['Take your time, you got this'], 'NN'),
 (['Leaked Drone footage of shackled and blindf

## Deal with Map type to query subfields

In [None]:
dfPos = # Todo

In [None]:
dfPos= # Todo

## Keep only proper nouns NNP or NNPS

In [None]:
nnpFilter = "pos.result = 'NNP' or pos.result = 'NNPS' "
dfNNP = # Todo

## Extract columns form a map in a col

In [None]:
dfWordTag = # Todo

In [None]:
from pyspark.sql.functions import desc
# Todo