# Distributed Computing: TF-IDF Pipeline and Analysis





In [1]:
# Mount Google Drive (Colab only) so the corpus folders under /content/drive can be read below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 1.Prepare Environment

### 1.1 Install Java, Pyspark and Spark NLP

In [2]:
import os

In [3]:
!apt-get update -qq
!apt-get install -y openjdk-8-jdk-headless -qq

Selecting previously unselected package openjdk-8-jre-headless:amd64.
(Reading database ... 155455 files and directories currently installed.)
Preparing to unpack .../openjdk-8-jre-headless_8u312-b07-0ubuntu1~18.04_amd64.deb ...
Unpacking openjdk-8-jre-headless:amd64 (8u312-b07-0ubuntu1~18.04) ...
Selecting previously unselected package openjdk-8-jdk-headless:amd64.
Preparing to unpack .../openjdk-8-jdk-headless_8u312-b07-0ubuntu1~18.04_amd64.deb ...
Unpacking openjdk-8-jdk-headless:amd64 (8u312-b07-0ubuntu1~18.04) ...
Setting up openjdk-8-jre-headless:amd64 (8u312-b07-0ubuntu1~18.04) ...
update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/orbd to provide /usr/bin/orbd (orbd) in auto mode
update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/servertool to provide /usr/bin/servertool (servertool) in auto mode
update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/tnameserv to provide /usr/bin/tnameserv (tnameserv) in auto mode
Setting up ope

In [4]:
# Configure Java: point JAVA_HOME at the OpenJDK 8 location installed by the previous cell
# and put its bin/ directory first on PATH so that `java` resolves to this JDK.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

openjdk version "1.8.0_312"
OpenJDK Runtime Environment (build 1.8.0_312-8u312-b07-0ubuntu1~18.04-b07)
OpenJDK 64-Bit Server VM (build 25.312-b07, mixed mode)


In [5]:
# Install Pyspark
! pip install --ignore-installed pyspark==2.4.4

# Install Spark NLP
! pip install --ignore-installed spark-nlp==2.6.2

Collecting pyspark==2.4.4
  Downloading pyspark-2.4.4.tar.gz (215.7 MB)
[K     |████████████████████████████████| 215.7 MB 59 kB/s 
[?25hCollecting py4j==0.10.7
  Downloading py4j-0.10.7-py2.py3-none-any.whl (197 kB)
[K     |████████████████████████████████| 197 kB 19.2 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-2.4.4-py2.py3-none-any.whl size=216130392 sha256=114ad51392af1d0d7ed9bba564bd157ea7c1d4c6150c833c387e81b3447e6811
  Stored in directory: /root/.cache/pip/wheels/11/48/19/c3b6b66e4575c164407a83bc065179904ddc33c9d6500846f0
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.7 pyspark-2.4.4
Collecting spark-nlp==2.6.2
  Downloading spark_nlp-2.6.2-py2.py3-none-any.whl (128 kB)
[K     |████████████████████████████████| 128 kB 5.1 MB/s 
[?25hInstalling collected packages: spark-nlp
Successfully installed

### 1.2 Start  Spark Session

In [6]:
import sparknlp
spark = sparknlp.start()

from pyspark.ml import Pipeline
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

## 2.Get Classics Corpus

### 2.1 Convert txt files into a pandas DataFrame

In [7]:
import pandas as pd
import os
import re

In [8]:
directory = "/content/drive/MyDrive/Distributed-Computing/data/classic_literature/"  # Change according to path
# directory = "data/classic_literature/"  # Change according to path
text_type = 'C'  # every text loaded by this cell is tagged as a classic

# Collect one [id, type, text] record per .txt file and build the DataFrame once at the end
# (appending with .loc inside the loop re-allocates the frame on every row).
classics_records = []
for filename in os.listdir(directory):
    # splitext never fails on names without an extension (e.g. sub-folders),
    # unlike rsplit('.', 1)[1], which raises IndexError for them.
    text_id, file_ext = os.path.splitext(filename)
    if file_ext != ".txt":
        continue
    # Read as UTF-8 explicitly so the result does not depend on the machine's default locale.
    with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
        corpus = file.read()
    corpus = corpus.replace(';', ' ')      # semicolons are replaced by spaces
    corpus = corpus.replace('Chapter', '')  # drop the literal word "Chapter"
    classics_records.append([text_id, text_type, corpus])

classics_df = pd.DataFrame(classics_records, columns=['id', 'type', 'text'])

In [9]:
classics_df

Unnamed: 0,id,type,text
0,1342,C,\n\n\n\n\nPRIDE AND PREJUDICE\n\nBy Jane Auste...
1,768,C,\n\n\n\nTranscribed from the 1910 John Murray ...
2,1260,C,\n\n\n\n\nTranscribed from the 1897 Service & ...
3,514,C,\n\n\n\nLITTLE WOMEN\n\n\nby\n\nLouisa May Alc...
4,1905,C,"\n\n\n\n\nTHE GOVERNESS \n\nOR, THE LITTLE FEM..."
5,113,C,\n\n\n\n\n\n\n\n\n\n\nIn Honor of Lisa Hart's ...
6,145,C,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nMiddlemarch\n\n\...
7,45,C,\n\n\n\n\n ANNE OF GREEN GABL...


### 2.2 Convert the pandas DataFrame into a Spark DataFrame

In [10]:
import re
import pyspark
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SQLContext

# SQLContext takes the SparkContext as its first argument; the session is passed explicitly
# as well so the context is bound to the Spark NLP session started in section 1.2.
sqlContext = SQLContext(spark.sparkContext, sparkSession=spark)

In [11]:
# Convert the pandas DataFrame of classics (id, type, text) into a PySpark SQL DataFrame.
df_Spark_classics = sqlContext.createDataFrame(classics_df) #Pyspark SQL dataframe

## 3.Get Fanfictions Corpus

### 3.1 Convert txt files into a pandas DataFrame

In [12]:
directory = "/content/drive/MyDrive/Distributed-Computing/data/fanfiction/"  # Change according to path
# directory = "data/fanfiction/"  # Change according to path
text_type = 'F'  # every text loaded by this cell is tagged as a fanfiction

# Collect one [id, type, text] record per .txt file and build the DataFrame once at the end
# (appending with .loc inside the loop re-allocates the frame on every row).
fanfiction_records = []
for filename in os.listdir(directory):
    # splitext never fails on names without an extension (e.g. sub-folders),
    # unlike rsplit('.', 1)[1], which raises IndexError for them.
    text_id, file_ext = os.path.splitext(filename)
    if file_ext != ".txt":
        continue
    # Read as UTF-8 explicitly so the result does not depend on the machine's default locale.
    with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
        corpus = file.read()
    corpus = corpus.replace('Chapter', '')  # drop the literal word "Chapter"
    corpus = corpus.replace(';', ' ')      # semicolons are replaced by spaces
    fanfiction_records.append([text_id, text_type, corpus])

fanfictions_df = pd.DataFrame(fanfiction_records, columns=['id', 'type', 'text'])

In [13]:
fanfictions_df

Unnamed: 0,id,type,text
0,fanfic_1536152,F,"1\nGoing back was the worst.I had hoped that,..."
1,fanfic_35367502,F,1\n“Lily there’s a boy at the door!”\nThe gin...
2,fanfic_33183868,F,1\n \n\n \n\nThere was something luminescent ...
3,fanfic_7441657,F,"1\nPrologue\nOctober 31, 1981\nThe view out t..."
4,fanfic_24025603,F,"1\nDisclaimer: I, by no means, claim to own a..."
5,fanfic_23824330,F,1\nAmy sits facing the window of her room. He...
6,fanfic_25042705,F,"1\n“But, really,” said Mrs. Bennet rather lou..."
7,fanfic_36819574,F,"1\nI was born in sunlight, and dragged into d..."
8,fanfic_8523001,F,1\nThere was such a cultural veil of secrecy ...


### 3.2 Convert the pandas DataFrame into a Spark DataFrame

In [14]:
# Convert the pandas DataFrame of fanfictions (id, type, text) into a PySpark SQL DataFrame.
df_Spark_fanfictions = sqlContext.createDataFrame(fanfictions_df) #Pyspark SQL dataframe

## 4 Preprocess Texts

### 4.1 Create Preprocessing Pipeline

Create pipeline to preprocess the spark dataframe texts.

Each of these classes receives an input column and creates an output column.
At the end of the pipeline, we will have a dataframe with all of the columns that are created on the fly and their results.

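The **last column** generated, in this case **token_features**, is the one that holds all the words after preprocessing (lowercasing and removal of non-letter characters). Note that the pipeline below does not include a stop-word removal stage, so stop words are still present in this column.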

In [15]:
# Stage 1 - wrap the raw "text" column into Spark NLP "document" annotations.
# Reference: https://medium.com/spark-nlp/spark-nlp-101-document-assembler-500018f5f6b5
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
    .setCleanupMode("shrink_full")  # remove new lines and tabs, plus shrinking spaces and blank lines
)

# Stage 2 - split each document into tokens.
# Reference: https://nlp.johnsnowlabs.com/api/python/reference/autosummary/sparknlp.annotator.Tokenizer.html
token = (
    Tokenizer()
    .setInputCols(['document'])
    .setOutputCol('token')
)

# Stage 3 - lowercase the tokens and strip every character that is not an ASCII letter
# (e.g. digits and punctuation).
# Reference: https://nlp.johnsnowlabs.com/docs/en/annotators
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
    .setLowercase(True)
    .setCleanupPatterns(["""[^A-Za-z]"""])
)

# Stage 4 - expose the normalized tokens as a plain array column named "token_features".
finisher = (
    Finisher()
    .setInputCols(["normalized"])
    .setOutputCols(["token_features"])
    .setOutputAsArray(True)
    .setCleanAnnotations(False)
)

In [16]:
# Chain the four stages in execution order: assemble -> tokenize -> normalize -> finish.
preprocessing_stages = [document, token, normalizer, finisher]
nlp_pipeline_lr = Pipeline(stages=preprocessing_stages)

### 4.2 Apply Pipeline to Spark Dataframes

#### 4.2.1 Classics

In [17]:
# Fit the preprocessing pipeline on the classics, then apply the fitted model to the same frame.
classics_pipeline_model = nlp_pipeline_lr.fit(df_Spark_classics)
processed_classics_df = classics_pipeline_model.transform(df_Spark_classics)

# Show Token Features column
# processed_classics_df.select("token_features").show(truncate=200)

#### 4.2.2 Fanfictions

In [18]:
# Fit the preprocessing pipeline on the fanfictions, then apply the fitted model to the same frame.
fanfictions_pipeline_model = nlp_pipeline_lr.fit(df_Spark_fanfictions)
processed_fanfictions_df = fanfictions_pipeline_model.transform(df_Spark_fanfictions)

# Show Token Features column
# processed_fanfictions_df.select("token_features").show(truncate=200)

At this point we have two preprocessed Spark Dataframes where each row belongs to one book. The column of interest is **"token_features"**, which contains all the **tokens** of the corpus.

1.   **processed_classics_df**: Contains the eight classics (one per row)
2.   **processed_fanfictions_df**: Contains the nine fanfictions (one per row)

--> From here, we can start doing additional processing like TF-IDF or other stuff to obtain the information we want.






## 5 Create a Single Spark DataFrame of all Texts

### 5.1 Combine Classics and Fanfiction 

In [19]:
# Keep only the columns needed downstream, in the same order for both corpora.
columns_to_keep = ['id', 'type', 'token_features']

# Classics Spark DataFrame
final_classics_df = processed_classics_df.select(columns_to_keep)
# Fanfictions Spark DataFrame
final_fanfictions_df = processed_fanfictions_df.select(columns_to_keep)

In [20]:
# Combine Classics and Fanfic DataFrames.
# union() matches columns by position, not by name; both inputs were selected with the
# same column order (id, type, token_features) in the previous cell.
final_df = final_classics_df.union(final_fanfictions_df)
final_df.schema

StructType(List(StructField(id,StringType,true),StructField(type,StringType,true),StructField(token_features,ArrayType(StringType,true),true)))

In [21]:
# SQL table named 'texts' from the final_df Spark dataframe
final_df.createOrReplaceTempView('texts')

### 5.2 Standardize the format of Id values

#### 5.2.1 Setup pyspark.sql

In [22]:
# Obtain a handle to the Spark session for running SQL and registering UDFs.
# NOTE(review): a session was already started by sparknlp.start() in section 1.2, so
# getOrCreate() is expected to return that existing session rather than build a new
# one — confirm. This line also rebinds the name `spark`.
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").appName("temp").getOrCreate()

In [23]:
# pyspark.sql imports
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf
from pyspark.sql.functions import concat_ws, split, col, concat, lit

#### 5.2.2 UDF to process the Id values

In [24]:
# UDF to process the Id column
@udf(returnType = StringType())
def get_id(id_string):
  '''Return the cleaned id of a document identifier.

  Everything from the first '|' onwards is dropped (the type suffix of an
  "id|type" key), then everything up to the last '_' is dropped (e.g. the
  "fanfic_" file-name prefix): "fanfic_1536152" -> "1536152", "1342|C" -> "1342".

  Args:
    id_string: raw identifier string, or None for a null column value.

  Returns:
    The cleaned id string, or None when the input is null.
  '''
  if id_string is None:
    # A null id stays null instead of failing the whole Spark job with an AttributeError.
    return None
  doc_id = id_string.split('|')[0]  # named doc_id to avoid shadowing the builtin `id`
  return doc_id.split('_')[-1]

spark.udf.register("get_id" , get_id)

<function __main__.get_id>

#### 5.2.3 Apply UDF

In [25]:
# Replace each raw id (e.g. "fanfic_1536152") with its cleaned form via the get_id UDF,
# keeping type and token_features unchanged, then re-register the result as the 'texts' view.
final_df = spark.sql('''SELECT get_id(id) as id, type, token_features FROM texts''')
final_df.createOrReplaceTempView("texts")

In [26]:
final_df.show(truncate = 100)

+--------+----+----------------------------------------------------------------------------------------------------+
|      id|type|                                                                                      token_features|
+--------+----+----------------------------------------------------------------------------------------------------+
|    1342|   C|[pride, and, prejudice, by, jane, austen, it, is, a, truth, universally, acknowledged, that, a, s...|
|     768|   C|[transcribed, from, the, john, murray, edition, by, david, price, email, ccxpglaforg, wuthering, ...|
|    1260|   C|[transcribed, from, the, service, paton, edition, by, david, price, email, ccxpglaforg, jane, eyr...|
|     514|   C|[little, women, by, louisa, may, alcott, contents, part, one, playing, pilgrims, two, a, merry, c...|
|    1905|   C|[the, governess, or, the, little, female, academy, by, sarah, fielding, there, lived, in, the, no...|
|     113|   C|[in, honor, of, lisa, harts, th, birthday, the, s

## 6. TFIDF Computation

### 6.1 Convert DataFrame to RDD

Each row will consist of tuples of the form (*id*|*type*, *token_features*). <br>

Having a single key (i.e. *id*|*type*) and a single value (i.e. a list of tokens) will make the RDD operations for TF-IDF simpler. 

In [27]:
# Build the composite key "<id>|<type>" and keep the token list as the only value column.
# Note: this rebinds final_df, so the cell cannot be re-run without re-running section 5.2.3 first.
id_type_key = concat(col('id'), lit('|'), col('type')).alias("id_type")  # id | type
final_df = final_df.select(id_type_key, 'token_features')

In [28]:
# RDD view of the DataFrame: one row per document, with fields (id_type, token_features).
final_rdd = final_df.rdd

### 6.2 Term Frequency

#### 6.2.1 Import NLTK for part-of-speech (POS) tagging

In [29]:
import nltk
# NOTE(review): no NLTK tokenizer call is visible in this notebook, so 'punkt' may be unused — confirm.
nltk.download('punkt')
# Tagger model required by the nltk.pos_tag call in the term-frequency map step below.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

#### 6.2.2. Map-Reduce Process

In [30]:
# 1. Emit one row per token occurrence in the corpus.
# Each row has the form ((id_type, (token, pos)), 1), where pos is the NLTK part-of-speech tag.
def _tagged_token_counts(row):
  '''Expand one (id_type, tokens) row into a list of ((id_type, (token, pos)), 1) pairs.'''
  doc_key, tokens = row[0], row[1]
  pairs = []
  for tagged_token in nltk.pos_tag(tokens):
    pairs.append(((doc_key, tagged_token), 1))
  return pairs

tf_map = final_rdd.flatMap(_tagged_token_counts)
tf_map.take(5)

[(('1342|C', ('pride', 'NN')), 1),
 (('1342|C', ('and', 'CC')), 1),
 (('1342|C', ('prejudice', 'NN')), 1),
 (('1342|C', ('by', 'IN')), 1),
 (('1342|C', ('jane', 'NN')), 1)]

In [31]:
# 2. Compute the number of times each tagged term appears in a document by summing the 1s per key.
# Creates rows of the form ((id_type, (token, pos)), term_frequency).
tf_reduce = tf_map.reduceByKey(lambda x, y: x + y)

In [32]:
# 3. Re-key the term frequencies so the tagged token becomes the key.
# Each row has the form ((token, pos), (id_type, term_frequency)).
def _key_by_token(record):
  '''Turn ((id_type, tagged_token), count) into (tagged_token, (id_type, count)).'''
  (doc_key, tagged_token), term_count = record
  return (tagged_token, (doc_key, term_count))

tf = tf_reduce.map(_key_by_token)
tf.take(5)

[(('austen', 'NNS'), ('1342|C', 1)),
 (('good', 'JJ'), ('1342|C', 179)),
 (('known', 'VBN'), ('1342|C', 52)),
 (('or', 'CC'), ('1342|C', 295)),
 (('this', 'DT'), ('1342|C', 446))]

### 6.3 Document Frequency

In [33]:
# 1. Creates an RDD with one row for each distinct (document, tagged token) pair.
# Creates rows of the form ((token, pos), 1); the document key is dropped.
df_map = tf_reduce.map(lambda x:(x[0][1], 1))

In [34]:
# 2. Compute the number of documents containing each tagged term.
# Creates rows of the form ((token, pos), number of documents containing that tagged token).
df_final = df_map.reduceByKey(lambda x,y:x+y)
df_final.take(5)

[(('disclaimer', 'NN'), 2),
 (('by', 'IN'), 17),
 (('means', 'NNS'), 15),
 (('remotely', 'RB'), 5),
 (('the', 'DT'), 17)]

### 6.4 Inverse Document Frequency (IDF) 

In [35]:
import math
# NOTE(review): this wildcard import rebinds Python builtins such as sum, min, max, abs and
# round to their Spark column-function counterparts for every later cell. The Spark functions
# used in the visible cells (udf, col, concat, lit) are already imported explicitly above,
# so this line looks unnecessary — confirm before removing.
from pyspark.sql.functions import *

In [36]:
# 1. Compute the inverse document frequency:
#    idf = log10(NUM_DOCUMENTS / number of documents containing the tagged term)
# Creates rows of the form ((token, pos), idf).
# NUM_DOCUMENTS is derived from the loaded corpora (one row per text) instead of being
# hard-coded, so it stays correct when files are added to or removed from the data folders.
NUM_DOCUMENTS = len(classics_df) + len(fanfictions_df)
idf = df_final.map(lambda x: (x[0], math.log10(NUM_DOCUMENTS/x[1])))
idf.take(5)

[(('disclaimer', 'NN'), 0.9294189257142927),
 (('by', 'IN'), 0.0),
 (('means', 'NNS'), 0.054357662322592676),
 (('remotely', 'RB'), 0.5314789170422551),
 (('the', 'DT'), 0.0)]

### 6.5 Term Frequency-Inverse Document Frequency (TF-IDF)

#### 6.5.1 Join the RDDs with TF and IDF computations

In [37]:
# 1. Join TF and IDF on the tagged token.
# Creates rows of the form ((token, pos), ((id_type, term_frequency), idf)).
join_df=tf.join(idf)

#### 6.5.2 Compute TF-IDF

In [38]:
# 1. Flatten each joined record and multiply the term frequency by the IDF.
# Each row has the form (id_type, (token, pos), term_frequency, idf, tf_idf).
def _to_tfidf_row(joined):
  '''Turn ((token, pos), ((id_type, tf), idf)) into (id_type, (token, pos), tf, idf, tf*idf).'''
  tagged_token, ((doc_key, term_count), idf_value) = joined
  return (doc_key, tagged_token, term_count, idf_value, term_count * idf_value)

tfidf = join_df.map(_to_tfidf_row)
tfidf.take(5)

[('24025603|F',
  ('means', 'NNS'),
  5,
  0.054357662322592676,
  0.27178831161296335),
 ('1342|C', ('means', 'NNS'), 41, 0.054357662322592676, 2.2286641552262996),
 ('1260|C', ('means', 'NNS'), 20, 0.054357662322592676, 1.0871532464518534),
 ('1905|C', ('means', 'NNS'), 14, 0.054357662322592676, 0.7610072725162975),
 ('7441657|F', ('means', 'NNS'), 9, 0.054357662322592676, 0.48921896090333405)]

### 6.6 Convert TF-IDF RDD to DataFrame

In [39]:
# Convert the TF-IDF RDD to a DataFrame with named columns.
# "DocumentId" is the "<id>|<type>" key and "Token" holds the (token, pos) pair.
rdd_df = tfidf.toDF(["DocumentId","Token","TF","IDF","TFIDF"])
rdd_df.show(5)

+----------+------------+---+--------------------+-------------------+
|DocumentId|       Token| TF|                 IDF|              TFIDF|
+----------+------------+---+--------------------+-------------------+
|24025603|F|[means, NNS]|  5|0.054357662322592676|0.27178831161296335|
|    1342|C|[means, NNS]| 41|0.054357662322592676| 2.2286641552262996|
|    1260|C|[means, NNS]| 20|0.054357662322592676| 1.0871532464518534|
|    1905|C|[means, NNS]| 14|0.054357662322592676| 0.7610072725162975|
| 7441657|F|[means, NNS]|  9|0.054357662322592676|0.48921896090333405|
+----------+------------+---+--------------------+-------------------+
only showing top 5 rows



In [40]:
rdd_df.createOrReplaceTempView("df") # create SQL table

### 6.7 Reformat TF-IDF Dataframe

#### 6.7.1 UDFs for Reformatting the TF-IDF Dataframe

In [41]:
# UDFs
# get_id is already defined and registered in section 5.2.2 (it strips the '|type' suffix
# and any '<prefix>_' part), so it is reused here rather than being defined a second time.

@udf(returnType = StringType())
def get_type(id_string):
  '''Extract the type component from an "id|type" document key, e.g. "1342|C" -> "C".'''
  doc_type = id_string.split('|')[1]
  return(doc_type)

@udf(returnType = StringType())
def get_token(pos_list):
  '''Extract the term (first element) from a (term, pos) pair.'''
  return(pos_list[0])

@udf(returnType = StringType())
def get_pos(pos_list):
  '''Extract the part-of-speech tag (second element) from a (term, pos) pair.'''
  return(pos_list[1])

# Register the new udfs for use in sql statements
spark.udf.register("get_type" , get_type)
spark.udf.register("get_token" , get_token)
spark.udf.register("get_pos" , get_pos)

<function __main__.get_pos>

#### 6.7.2 Apply the UDFs

In [42]:
# Reformat dataframe so each row is of the form id, type, token, part of speech, and tfidf value.
results = spark.sql('select get_id(DocumentId) as id, get_type(DocumentId) as type, get_token(Token) as token, get_pos(Token) as pos, TFIDF as tfidf from df')
results.show(10)

+--------+----+-----+---+--------------------+
|      id|type|token|pos|               tfidf|
+--------+----+-----+---+--------------------+
|24025603|   F|means|NNS| 0.27178831161296335|
|    1342|   C|means|NNS|  2.2286641552262996|
|    1260|   C|means|NNS|  1.0871532464518534|
|    1905|   C|means|NNS|  0.7610072725162975|
| 7441657|   F|means|NNS| 0.48921896090333405|
|     514|   C|means|NNS| 0.27178831161296335|
|     113|   C|means|NNS| 0.10871532464518535|
|     145|   C|means|NNS|  1.9025181812907437|
|23824330|   F|means|NNS|0.054357662322592676|
| 8523001|   F|means|NNS| 0.38050363625814876|
+--------+----+-----+---+--------------------+
only showing top 10 rows



In [43]:
results.createOrReplaceTempView("results")

## 7. TF-IDF Analysis and Visualizations

### 7.1 Data for Plotting

#### 7.1.1 Map of Document IDs and Titles

In [44]:
# Maps each cleaned document id (string) to the title of the work.
# Read by the get_title UDF to label documents in the plots.
id_title_dict = {
  "24025603": "Outnumbered", 
  "7441657": "Hermione Granger and the Perfectly Reasonable Explanation", 
  "23824330": "Amy's Progress Vlogs", 
  "8523001": "An Ever-Fixed Mark",
  "1536152" : "Manager", 
  "25042705": "First Impressions: A Modern Pride and Prejudice Adaptation", 
  "36819574": "Scarlet Justice", 
  "35367502": "The Bug Collector", 
  "33183868": "Tension Points",
  "1260": "Jane Eyre", 
  "113": "The Secret Garden", 
  "45": "Anne of Green Gables", 
  "768": "Wuthering Heights", 
  "1905": "The Governess; Or, The Little Female Academy", 
  "1342" : "Pride and Prejudice", 
  "145": "Middlemarch", 
  "514": "Little Women"
}

#### 7.1.2 UDFs for Transforming Data

In [45]:
@udf(returnType = StringType())
def convert_type(type_string):
  ''' Translate a one-letter type flag ('C' / 'F') to 'Classic' / 'Fanfiction'.

  The plotting query passes the document id rather than the type flag, so any value
  other than 'C' or 'F' is treated as an id and classified by its length, exactly as
  before: ids shorter than 6 characters are classics, longer ones are fanfiction.

  Args:
    type_string: the type flag 'C' / 'F', a document id string, or None.

  Returns:
    'Classic' or 'Fanfiction', or None when the input is null.
  '''
  if type_string is None:
    return None
  type_labels = {'C': 'Classic', 'F': 'Fanfiction'}
  if type_string in type_labels:
    # Real type flags are mapped directly; previously 'F' fell into the length
    # heuristic below and was mislabelled as 'Classic'.
    return type_labels[type_string]
  # Fallback heuristic for id inputs (kept for the existing convert_type(id) call).
  id_string = type_string
  return ('Classic' if len(id_string) < 6 else 'Fanfiction') 
  
@udf(returnType = StringType())
def get_title(id_string): 
  ''' Obtain the document title from the document ID, truncated to 30 characters.

  Ids that are missing from id_title_dict fall back to the id itself, so one unmapped
  document does not abort the whole Spark job with a KeyError.
  '''
  title = id_title_dict.get(id_string, id_string)
  return (title[:30] if title is not None else None)

# Register all udfs for use in sql statements
spark.udf.register("convert_type" , convert_type)
spark.udf.register("get_title" , get_title)

<function __main__.get_title>

#### 7.1.3 Obtain Dataframe for Plotting

In [46]:
# 1. Apply UDFs: look up the title from the id and derive the human-readable type label.
# Note: convert_type is called with the id column here, not with the one-letter type column.
dataplot = spark.sql('select id as ID, get_title(id) as Title, convert_type(id) as Type, Token, POS, TFIDF as TFIDF from results')
dataplot.show(10)

+--------+--------------------+----------+-----+---+--------------------+
|      ID|               Title|      Type|Token|POS|               TFIDF|
+--------+--------------------+----------+-----+---+--------------------+
|24025603|         Outnumbered|Fanfiction|means|NNS| 0.27178831161296335|
|    1342| Pride and Prejudice|   Classic|means|NNS|  2.2286641552262996|
|    1260|           Jane Eyre|   Classic|means|NNS|  1.0871532464518534|
|    1905|The Governess; Or...|   Classic|means|NNS|  0.7610072725162975|
| 7441657|Hermione Granger ...|Fanfiction|means|NNS| 0.48921896090333405|
|     514|        Little Women|   Classic|means|NNS| 0.27178831161296335|
|     113|   The Secret Garden|   Classic|means|NNS| 0.10871532464518535|
|     145|         Middlemarch|   Classic|means|NNS|  1.9025181812907437|
|23824330|Amy's Progress Vlogs|Fanfiction|means|NNS|0.054357662322592676|
| 8523001|  An Ever-Fixed Mark|Fanfiction|means|NNS| 0.38050363625814876|
+--------+--------------------+-------

In [47]:
# Create a SQL Table
dataplot.createOrReplaceTempView("dataplot")

### 7.2 Analysis and Visualizations

#### 7.2.1 Visualization: TF-IDF Bar Chart

In [48]:
import plotly.express as px

def analyze_tfidf(word): 
  '''Plot the TF-IDF of a word in every classic and fanfiction text that contains it.

  Looks the word up in the 'dataplot' table and draws a horizontal bar chart with one
  bar segment per matching row, coloured by document type.

  Args:
    word: the term to analyse. It is lowercased before the lookup because the
      preprocessing pipeline lowercases every token.

  Returns:
    None. The figure is displayed as a side effect; if the word is not found in any
    document a message is printed instead.
  '''

  color_map = {"Classic":'#636EFA', "Fanfiction":'#EF553B'}

  # Tokens are stored lowercased, so normalise the query the same way.
  token = word.lower()

  # Filter the Spark dataframe for the word of interest.
  # The DataFrame API is used instead of interpolating `word` into a SQL string, so quotes
  # or other special characters in the argument cannot break or alter the query.
  dataplot_df = spark.table("dataplot")
  word_df = (dataplot_df
             .filter(dataplot_df["Token"] == token)
             .select("ID", "Title", "Token", "Type", "TFIDF"))

  # Convert the spark dataframe to a pandas dataframe
  word_pd = word_df.toPandas()

  if word_pd.empty:
    # Nothing to plot: say so explicitly rather than rendering an empty chart.
    print(f"'{word}' does not appear in any document.")
    return

  # Generate a plotly graph of TF-IDF values.
  # The category_orders key must match the column name exactly ("Type", not "type");
  # otherwise plotly ignores the requested legend order.
  fig = px.bar(word_pd, 
               x="TFIDF", 
               y="Title", 
               color="Type", 
               title = f"Importance of '{word}' in Classics and Fanfiction",
               category_orders={"Type": ["Classic", "Fanfiction"]}, 
               color_discrete_map = color_map, 
               template='plotly_white')
  fig.update_layout(yaxis={'categoryorder':'total ascending'}, font=dict(size=14))
  fig.show()

#### 7.2.2 Analysis: Concept of Beauty

In [49]:
analyze_tfidf('beauty')

#### 7.2.3 Analysis: Character Traits

In [50]:
analyze_tfidf('brave')

#### 7.2.4 Analysis: Working women

In [51]:
analyze_tfidf('work')