# Distributed Computing - Project 1





In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the corpus directories read later live under /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 1.Prepare Environment

### 1.1 Install Java, Pyspark and Spark NLP

In [2]:
import os

In [3]:
# Refresh the apt package index, then install the headless OpenJDK 8 JDK (-qq keeps the output quiet).
!apt-get update -qq
!apt-get install -y openjdk-8-jdk-headless -qq

In [4]:
# Configure Java for this process: point JAVA_HOME at the OpenJDK 8 directory and
# put its bin/ first on PATH (the JDK itself is installed by the apt-get cell above).
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
# Print the Java version that is now picked up, as a sanity check.
! java -version

openjdk version "1.8.0_312"
OpenJDK Runtime Environment (build 1.8.0_312-8u312-b07-0ubuntu1~18.04-b07)
OpenJDK 64-Bit Server VM (build 25.312-b07, mixed mode)


In [5]:
# Install Pyspark (version pinned to 2.4.4)
! pip install --ignore-installed pyspark==2.4.4

# Install Spark NLP (version pinned to 2.6.2)
! pip install --ignore-installed spark-nlp==2.6.2

Collecting pyspark==2.4.4
  Using cached pyspark-2.4.4-py2.py3-none-any.whl
Collecting py4j==0.10.7
  Using cached py4j-0.10.7-py2.py3-none-any.whl (197 kB)
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.7 pyspark-2.4.4
Collecting spark-nlp==2.6.2
  Using cached spark_nlp-2.6.2-py2.py3-none-any.whl (128 kB)
Installing collected packages: spark-nlp
Successfully installed spark-nlp-2.6.2


### 1.2 Start  Spark Session

In [6]:
import sparknlp
# Create the Spark session through Spark NLP's start() helper; `spark` is the handle used by the later cells.
spark = sparknlp.start()

from pyspark.ml import Pipeline
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

## 2.Get Classics Corpus

### 2.1 Convert txt files into Python Dataframe

In [7]:
import pandas as pd
import os
import re

In [8]:
directory ="/content/drive/MyDrive/Distributed-Computing/data/classic_literature/" #Change according to path
# directory ="data/classic_literature/" #Change according to path
text_type = 'C'  # every text loaded by this cell is labelled as a classic

# Collect one [id, type, text] record per .txt file and build the frame once at the
# end, instead of growing the DataFrame one row at a time.
classics_records = []

for filename in os.listdir(directory):
    # splitext also copes with names that contain no '.', where the previous
    # rsplit('.', 1)[1] lookup raised IndexError.
    text_id, file_ext = os.path.splitext(filename)
    if file_ext != ".txt":
        continue
    # Explicit encoding so reading does not depend on the platform default.
    with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
        corpus = file.read()
    # ';' becomes a space and the literal word 'Chapter' is dropped from the text.
    corpus = re.sub(';', ' ', corpus)
    corpus = corpus.replace('Chapter', '')
    classics_records.append([text_id, text_type, corpus])

classics_df = pd.DataFrame(classics_records, columns=['id', 'type', 'text'])

In [9]:
# Display the classics frame (one row per loaded text file).
classics_df

Unnamed: 0,id,type,text
0,1342,C,\n\n\n\n\nPRIDE AND PREJUDICE\n\nBy Jane Auste...
1,768,C,\n\n\n\nTranscribed from the 1910 John Murray ...
2,1260,C,\n\n\n\n\nTranscribed from the 1897 Service & ...
3,514,C,\n\n\n\nLITTLE WOMEN\n\n\nby\n\nLouisa May Alc...
4,1905,C,"\n\n\n\n\nTHE GOVERNESS \n\nOR, THE LITTLE FEM..."
5,113,C,\n\n\n\n\n\n\n\n\n\n\nIn Honor of Lisa Hart's ...
6,145,C,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nMiddlemarch\n\n\...
7,45,C,\n\n\n\n\n ANNE OF GREEN GABL...


### 2.2 Convert Python Dataframe into Spark Dataframe

In [10]:
import re
import pyspark
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SQLContext

#sc =  pyspark.SparkContext("local[*]", "Test Context")
# SQLContext takes a SparkContext as its first argument, while `spark` is a SparkSession:
# pass the session's underlying context and bind the session explicitly.
sqlContext = SQLContext(spark.sparkContext, sparkSession=spark)

In [11]:
# Convert the pandas frame of classics (columns id, type, text) into a Spark DataFrame.
df_Spark_classics = sqlContext.createDataFrame(classics_df) #Pyspark SQL dataframe

## 3.Get Fanfictions Corpus

### 3.1 Convert txt files into Python Dataframe

In [12]:
directory ="/content/drive/MyDrive/Distributed-Computing/data/fanfiction/" #Change according to path
# directory ="data/fanfiction/" #Change according to path
text_type = 'F'  # every text loaded by this cell is labelled as fanfiction

# Collect one [id, type, text] record per .txt file and build the frame once at the
# end, instead of growing the DataFrame one row at a time.
fanfiction_records = []

for filename in os.listdir(directory):
    # splitext also copes with names that contain no '.', where the previous
    # rsplit('.', 1)[1] lookup raised IndexError.
    text_id, file_ext = os.path.splitext(filename)
    if file_ext != ".txt":
        continue
    # Explicit encoding so reading does not depend on the platform default.
    with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
        corpus = file.read()
    # The literal word 'Chapter' is dropped and ';' becomes a space.
    corpus = corpus.replace('Chapter', '')
    corpus = re.sub(';', ' ', corpus)
    fanfiction_records.append([text_id, text_type, corpus])

fanfictions_df = pd.DataFrame(fanfiction_records, columns=['id', 'type', 'text'])

In [13]:
# Display the fanfiction frame (one row per loaded text file).
fanfictions_df

Unnamed: 0,id,type,text
0,fanfic_1536152,F,"1\nGoing back was the worst.I had hoped that,..."
1,fanfic_35367502,F,1\n“Lily there’s a boy at the door!”\nThe gin...
2,fanfic_33183868,F,1\n \n\n \n\nThere was something luminescent ...
3,fanfic_7441657,F,"1\nPrologue\nOctober 31, 1981\nThe view out t..."
4,fanfic_24025603,F,"1\nDisclaimer: I, by no means, claim to own a..."
5,fanfic_23824330,F,1\nAmy sits facing the window of her room. He...
6,fanfic_25042705,F,"1\n“But, really,” said Mrs. Bennet rather lou..."
7,fanfic_36819574,F,"1\nI was born in sunlight, and dragged into d..."
8,fanfic_8523001,F,1\nThere was such a cultural veil of secrecy ...


### 3.2 Convert Python Dataframe into Spark Dataframe

In [14]:
# Convert the pandas frame of fanfictions (columns id, type, text) into a Spark DataFrame.
df_Spark_fanfictions = sqlContext.createDataFrame(fanfictions_df) #Pyspark SQL dataframe

## 4 Preprocess Texts

### 4.1 Create Preprocessing Pipeline

Create pipeline to preprocess the spark dataframe texts.

Each of these classes receives an input column and creates an output column.
At the end of the pipeline, we will have a dataframe with all of the columns that are created on the fly and their results.

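The **last column** generated, in this case **token_features**, is the one that has all the words after being preprocessed (tokenized, lowercased, and stripped of non-letter characters). Note that this pipeline does not include a stop-word removal stage.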

In [15]:
# Stage 1: wrap the raw "text" column into a "document" annotation.
# https://medium.com/spark-nlp/spark-nlp-101-document-assembler-500018f5f6b5
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
    .setCleanupMode("shrink_full")  # remove new lines and tabs, plus shrinking spaces and blank lines
)

# Stage 2: split each document into tokens.
# https://nlp.johnsnowlabs.com/api/python/reference/autosummary/sparknlp.annotator.Tokenizer.html
token = (
    Tokenizer()
    .setInputCols(['document'])
    .setOutputCol('token')
)

# Stage 3: lowercase the tokens and strip every character that is not an ASCII
# letter (punctuation, digits, ...).
# https://nlp.johnsnowlabs.com/docs/en/annotators
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
    .setLowercase(True)
    .setCleanupPatterns(["""[^A-Za-z]"""])
)

# Stage 4: expose the normalized tokens as a plain array column named
# "token_features", keeping the intermediate annotation columns.
finisher = (
    Finisher()
    .setInputCols(["normalized"])
    .setOutputCols(["token_features"])
    .setOutputAsArray(True)
    .setCleanAnnotations(False)
)

In [16]:
# Chain the four stages in execution order: assemble -> tokenize -> normalize -> finish.
nlp_pipeline_lr = Pipeline(stages=[document, token, normalizer, finisher])

### 4.2 Apply Pipeline to Spark Dataframes

#### 4.2.1 Classics

In [17]:
# Fit the preprocessing pipeline on the classics and apply it to the same frame;
# the result keeps the input columns and adds the columns produced by each stage.
processed_classics_df = nlp_pipeline_lr.fit(df_Spark_classics).transform(df_Spark_classics)

# Show Token Features column
# processed_classics_df.select("token_features").show(truncate=200) 

#### 4.2.2 Fanfictions

In [18]:
# Fit the preprocessing pipeline on the fanfictions and apply it to the same frame;
# the result keeps the input columns and adds the columns produced by each stage.
processed_fanfictions_df = nlp_pipeline_lr.fit(df_Spark_fanfictions).transform(df_Spark_fanfictions)

# Show Token Features column
# processed_fanfictions_df.select("token_features").show(truncate=200) 

At this point we have two preprocessed Spark Dataframes where each row belongs to one book. The column of interest is **"token_features"**, which contains all the **tokens** of the corpus.

1.   **processed_classics_df**: Contains the eight classics (one per row)
2.   **processed_fanfictions_df**: Contains the nine fanfictions (one per row)

--> From here, we can start doing additional processing like TF-IDF or other stuff to obtain the information we want.






## 5 Create a Single Spark DataFrame of all Texts

### 5.1 Combine Classics and Fanfiction 

In [19]:
# Keep just the identifying columns (id, type) and the finished tokens of each corpus.
final_classics_df = processed_classics_df.select('id', 'type', 'token_features')
final_fanfictions_df = processed_fanfictions_df.select('id', 'type', 'token_features')

In [20]:
# Combine the texts into a single Spark dataframe.
# Both inputs were selected with the same column order (id, type, token_features).
final_df = final_classics_df.union(final_fanfictions_df)
# Display the schema of the combined frame.
final_df.schema

StructType(List(StructField(id,StringType,true),StructField(type,StringType,true),StructField(token_features,ArrayType(StringType,true),true)))

In [21]:
# Register final_df as a temporary SQL view named 'texts' so it can be queried with spark.sql
final_df.createOrReplaceTempView('texts')

### 5.2 Standardize the format of Id values

#### 5.2.1 Setup pyspark.sql

In [22]:
# Obtain a PySpark session handle and rebind `spark` to it.
# NOTE(review): a session was already started by sparknlp.start() earlier in the notebook;
# getOrCreate() presumably returns that same session rather than a new one, in which case
# the master/appName settings here have little effect — confirm.
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").appName("temp").getOrCreate()

In [23]:
# pyspark.sql imports
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf
from pyspark.sql.functions import concat_ws, split, col, concat, lit

#### 5.2.2 UDF to process the Id values

In [24]:
# UDF to process the Id column
@udf(returnType = StringType())
def get_id(id_string):
  '''Return the bare id contained in a raw id value.

  Keeps the text before the first '|' (if any) and then the part after the last '_',
  e.g. 'fanfic_1536152' -> '1536152', '1342|C' -> '1342', '768' -> '768'.
  A null id is passed through as None instead of raising inside the UDF.
  '''
  if id_string is None:
    return None
  raw_id = id_string.split('|')[0]   # drop an optional '|<type>' suffix
  return raw_id.split('_')[-1]       # drop an optional '<prefix>_' part

# Make the UDF callable from SQL under the name get_id
spark.udf.register("get_id" , get_id)

<function __main__.get_id>

#### 5.2.3 Apply UDF

In [25]:
# Rewrite the id column with the get_id UDF (e.g. 'fanfic_1536152' -> '1536152'),
# keeping type and token_features, then re-register the result as the 'texts' view.
final_df = spark.sql('''SELECT get_id(id) as id, type, token_features FROM texts''')
final_df.createOrReplaceTempView("texts")

In [26]:
# Preview the combined frame, truncating each cell to 100 characters.
final_df.show(truncate = 100)

+--------+----+----------------------------------------------------------------------------------------------------+
|      id|type|                                                                                      token_features|
+--------+----+----------------------------------------------------------------------------------------------------+
|    1342|   C|[pride, and, prejudice, by, jane, austen, it, is, a, truth, universally, acknowledged, that, a, s...|
|     768|   C|[transcribed, from, the, john, murray, edition, by, david, price, email, ccxpglaforg, wuthering, ...|
|    1260|   C|[transcribed, from, the, service, paton, edition, by, david, price, email, ccxpglaforg, jane, eyr...|
|     514|   C|[little, women, by, louisa, may, alcott, contents, part, one, playing, pilgrims, two, a, merry, c...|
|    1905|   C|[the, governess, or, the, little, female, academy, by, sarah, fielding, there, lived, in, the, no...|
|     113|   C|[in, honor, of, lisa, harts, th, birthday, the, s

## 6. TFIDF Computation

Import and Setup NLTK

In [27]:
import nltk
# Download the NLTK resources used by this notebook.
# NOTE(review): 'averaged_perceptron_tagger' is presumably the model behind nltk.pos_tag
# (used in the term-frequency step); 'punkt' looks unused while word_tokenize stays commented out — confirm.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# from nltk import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

Restructure columns for ease of RDD mapping

In [28]:
# Fold id and type into one "<id>|<type>" key column so each RDD record is (key, tokens).
final_df = final_df.select(
    concat(col('id'), lit('|'), col('type')).alias("id_type"),
    'token_features',
)

In [29]:
# Switch to the RDD API; each record is a Row of (id_type, token_features).
final_rdd = final_df.rdd

Map-Reduce Process: Term Frequency

In [30]:
# Step 1 of the term count: POS-tag every document's token list and emit one
# (("id|type", (token, tag)), 1) pair per tagged token.
map1 = final_rdd.flatMap(
    lambda row: [((row[0], tagged), 1) for tagged in nltk.pos_tag(row[1])]
)
map1.take(5)

[(('1342|C', ('pride', 'NN')), 1),
 (('1342|C', ('and', 'CC')), 1),
 (('1342|C', ('prejudice', 'NN')), 1),
 (('1342|C', ('by', 'IN')), 1),
 (('1342|C', ('jane', 'NN')), 1)]

In [31]:
# Sum the 1s per (document, tagged token) key -> raw term count of each tagged token in each document.
reduce = map1.reduceByKey(lambda left, right: left + right)

In [32]:
# Peek at one ((id|type, (token, tag)), count) record.
reduce.take(1)

[(('1342|C', ('austen', 'NNS')), 1)]

In [33]:
# Re-key by tagged token: ((doc, tagged), count) -> (tagged, (doc, count)).
tf = reduce.map(lambda rec: (rec[0][1], (rec[0][0], rec[1])))

In [34]:
# Split the tagged token apart: (tagged, (doc, count)) -> (token, (tag, doc, count)).
tf_final = tf.map(
    lambda rec: (rec[0][0], (rec[0][1], rec[1][0], rec[1][1]))
)

In [35]:
# Same re-keying as tf, with a trailing 1 marking "this document contains the token":
# ((doc, tagged), count) -> (tagged, (doc, count, 1)).
map3 = reduce.map(lambda rec: (rec[0][1], (rec[0][0], rec[1], 1)))

Map-Reduce Process: Document Frequency

In [36]:
# Keep only the document marker: (tagged, (doc, count, 1)) -> (tagged, 1).
map4 = map3.map(lambda rec: (rec[0], rec[1][2]))

In [37]:
# Document frequency: add up the per-document markers for each tagged token,
# i.e. the number of documents containing it.
reduce2 = map4.reduceByKey(lambda left, right: left + right)

Map-Reduce Process: IDF

In [38]:
import math
from pyspark.sql.functions import *
# Total number of documents, counted from the data (one row per text in final_df)
# instead of a hard-coded 17, so the IDF stays correct when the corpus changes.
n = final_df.count()
# IDF(tagged token) = log10(N / number of documents containing it)
idf=reduce2.map(lambda x: (x[0],math.log10(n/x[1])))

In [39]:
# Split the tagged token apart: (tagged, idf) -> (token, (tag, idf)).
idf_final = idf.map(
    lambda rec: (rec[0][0], (rec[0][1], rec[1]))
)

Map-Reduce Process: TF-IDF computation

In [40]:
# Join term counts with IDF on the tagged token: (tagged, ((doc, count), idf)).
rdd=tf.join(idf)

In [41]:
# (tagged, ((doc, count), idf)) -> (doc, (tagged, count, idf, count * idf)).
# NOTE: this cell rebinds `rdd` to its own output, so it must run exactly once after the join.
rdd = rdd.map(
    lambda rec: (
        rec[1][0][0],
        (rec[0], rec[1][0][1], rec[1][1], rec[1][0][1] * rec[1][1]),
    )
)

In [42]:
# Flatten to one 5-tuple per record: (doc, tagged, count, idf, tfidf).
# NOTE: this cell rebinds `rdd` to its own output, so it must run exactly once after the previous step.
rdd = rdd.map(
    lambda rec: (rec[0], rec[1][0], rec[1][1], rec[1][2], rec[1][3])
)

Map-Reduce Process: Final DataFrame

In [43]:
# Name the five tuple fields; DocumentId holds the "id|type" key, Token the (token, tag) pair,
# and TF the raw term count produced by the reduceByKey step.
rdd_df = rdd.toDF(["DocumentId","Token","TF","IDF","TFIDF"])
rdd_df.createOrReplaceTempView("df") # create SQL table

In [44]:
# Preview the first five TF-IDF rows.
rdd_df.show(5)

+----------+------------+---+--------------------+-------------------+
|DocumentId|       Token| TF|                 IDF|              TFIDF|
+----------+------------+---+--------------------+-------------------+
|24025603|F|[means, NNS]|  5|0.054357662322592676|0.27178831161296335|
|    1342|C|[means, NNS]| 41|0.054357662322592676| 2.2286641552262996|
|    1260|C|[means, NNS]| 20|0.054357662322592676| 1.0871532464518534|
|    1905|C|[means, NNS]| 14|0.054357662322592676| 0.7610072725162975|
| 7441657|F|[means, NNS]|  9|0.054357662322592676|0.48921896090333405|
+----------+------------+---+--------------------+-------------------+
only showing top 5 rows



In [45]:
# UDFs that unpack the composite columns of the TF-IDF table.
# get_id is intentionally not redefined here: the identical definition from
# section 5.2.2 is still in scope and is simply registered again below.

@udf(returnType = StringType())
def get_type(id_string):
  '''Return the type code from an "id|type" key, e.g. "1342|C" -> "C" (None for a null key).'''
  if id_string is None:
    return None
  return id_string.split('|')[1]

@udf(returnType = StringType())
def get_token(pos_list):
  '''Return the token (first element) of a (token, part-of-speech) pair (None for a null pair).'''
  if pos_list is None:
    return None
  return pos_list[0]

@udf(returnType = StringType())
def get_pos(pos_list):
  '''Return the part-of-speech tag (second element) of a (token, part-of-speech) pair (None for a null pair).'''
  if pos_list is None:
    return None
  return pos_list[1]

# Make the UDFs callable from SQL under the same names
spark.udf.register("get_type" , get_type)
spark.udf.register("get_id" , get_id)
spark.udf.register("get_token" , get_token)
spark.udf.register("get_pos" , get_pos)

<function __main__.get_pos>

In [46]:
# Reformat results to have column id, type, token, part of speech, and tfidf value:
# DocumentId ("id|type") is split by get_id/get_type and Token ((token, tag)) by get_token/get_pos.
results = spark.sql('select get_id(DocumentId) as id, get_type(DocumentId) as type, get_token(Token) as token, get_pos(Token) as pos, TFIDF as tfidf from df')
# Register the reshaped table as the 'results' view for the plotting queries.
results.createOrReplaceTempView("results")

In [47]:
# Preview the first ten reshaped rows.
results.show(10)

+--------+----+-----+---+--------------------+
|      id|type|token|pos|               tfidf|
+--------+----+-----+---+--------------------+
|24025603|   F|means|NNS| 0.27178831161296335|
|    1342|   C|means|NNS|  2.2286641552262996|
|    1260|   C|means|NNS|  1.0871532464518534|
|    1905|   C|means|NNS|  0.7610072725162975|
| 7441657|   F|means|NNS| 0.48921896090333405|
|     514|   C|means|NNS| 0.27178831161296335|
|     113|   C|means|NNS| 0.10871532464518535|
|     145|   C|means|NNS|  1.9025181812907437|
|23824330|   F|means|NNS|0.054357662322592676|
| 8523001|   F|means|NNS| 0.38050363625814876|
+--------+----+-----+---+--------------------+
only showing top 10 rows



Data Plot

In [48]:
@udf(returnType = StringType())
def convert_type(type_string):
  '''Map a text to its display category, 'Classic' or 'Fanfiction'.

  Accepts the type code itself ('C' / 'F'). For backward compatibility it also accepts
  an id string, as the existing query passes the id column: ids shorter than six
  characters are treated as classics, longer ones as fanfiction.
  Returns None for a null input.
  '''
  if type_string is None:
    return None
  if type_string == 'C':
    return 'Classic'
  if type_string == 'F':
    return 'Fanfiction'
  # Not a type code: fall back to the id-length heuristic used so far.
  return ('Classic' if len(type_string) < 6 else 'Fanfiction')

@udf(returnType = StringType())
def get_title(id_string):
  '''Return the title for a text id, truncated to 30 characters.

  Unknown ids fall back to the id itself instead of raising KeyError, so a text that
  is missing from the lookup table does not abort the whole Spark job.
  Returns None for a null input.
  '''
  if id_string is None:
    return None

  id_title_dict = {
    "24025603": "Outnumbered",
    "7441657": "Hermione Granger and the Perfectly Reasonable Explanation",
    "23824330": "Amy's Progress Vlogs",
    "8523001": "An Ever-Fixed Mark",
    "1536152" : "Manager",
    "25042705": "First Impressions: A Modern Pride and Prejudice Adaptation",
    "36819574": "Scarlet Justice",
    "35367502": "The Bug Collector",
    "33183868": "Tension Points",
    "1260": "Jane Eyre",
    "113": "The Secret Garden",
    "45": "Anne of Green Gables",
    "768": "Wuthering Heights",
    "1905": "The Governess; Or, The Little Female Academy",
    "1342" : "Pride and Prejudice",
    "145": "Middlemarch",
    "514": "Little Women"
  }

  return (id_title_dict.get(id_string, id_string)[:30])

# Make both UDFs callable from SQL under the same names
spark.udf.register("convert_type" , convert_type)
spark.udf.register("get_title" , get_title)

<function __main__.get_title>

In [49]:
  # Notebook-level copy of the id -> title table that is also embedded in get_title;
  # used here only to list the titles.
  # NOTE(review): the dict literal carries a stray two-space indent while the final line
  # does not; the recorded output shows the cell ran, but the block should be dedented — confirm.
  id_title_dict = {
    "24025603": "Outnumbered", 
    "7441657": "Hermione Granger and the Perfectly Reasonable Explanation", 
    "23824330": "Amy's Progress Vlogs", 
    "8523001": "An Ever-Fixed Mark",
    "1536152" : "Manager", 
    "25042705": "First Impressions: A Modern Pride and Prejudice Adaptation", 
    "36819574": "Scarlet Justice", 
    "35367502": "The Bug Collector", 
    "33183868": "Tension Points",
    "1260": "Jane Eyre", 
    "113": "The Secret Garden", 
    "45": "Anne of Green Gables", 
    "768": "Wuthering Heights", 
    "1905": "The Governess; Or, The Little Female Academy", 
    "1342" : "Pride and Prejudice", 
    "145": "Middlemarch", 
    "514": "Little Women"
  }

# Show every title in the table.
list(id_title_dict.values())

['Outnumbered',
 'Hermione Granger and the Perfectly Reasonable Explanation',
 "Amy's Progress Vlogs",
 'An Ever-Fixed Mark',
 'Manager',
 'First Impressions: A Modern Pride and Prejudice Adaptation',
 'Scarlet Justice',
 'The Bug Collector',
 'Tension Points',
 'Jane Eyre',
 'The Secret Garden',
 'Anne of Green Gables',
 'Wuthering Heights',
 'The Governess; Or, The Little Female Academy',
 'Pride and Prejudice',
 'Middlemarch',
 'Little Women']

In [62]:
# Build the plotting table: one row per (text, tagged token) with a readable title and type.
# The display type is derived from the `type` column itself ('C' -> Classic, anything else
# -> Fanfiction) rather than being guessed from the length of the id.
dataplot = spark.sql('''
    select id as ID,
           get_title(id) as Title,
           case when type = 'C' then 'Classic' else 'Fanfiction' end as Type,
           Token,
           POS,
           TFIDF as TFIDF
    from results
''')
dataplot.show(10)
# Register the table as the 'dataplot' view queried by the analysis section.
dataplot.createOrReplaceTempView("dataplot")

+--------+--------------------+----------+-----+---+--------------------+
|      ID|               Title|      Type|Token|POS|               TFIDF|
+--------+--------------------+----------+-----+---+--------------------+
|24025603|         Outnumbered|Fanfiction|means|NNS| 0.27178831161296335|
|    1342| Pride and Prejudice|   Classic|means|NNS|  2.2286641552262996|
|    1260|           Jane Eyre|   Classic|means|NNS|  1.0871532464518534|
|    1905|The Governess; Or...|   Classic|means|NNS|  0.7610072725162975|
| 7441657|Hermione Granger ...|Fanfiction|means|NNS| 0.48921896090333405|
|     514|        Little Women|   Classic|means|NNS| 0.27178831161296335|
|     113|   The Secret Garden|   Classic|means|NNS| 0.10871532464518535|
|     145|         Middlemarch|   Classic|means|NNS|  1.9025181812907437|
|23824330|Amy's Progress Vlogs|Fanfiction|means|NNS|0.054357662322592676|
| 8523001|  An Ever-Fixed Mark|Fanfiction|means|NNS| 0.38050363625814876|
+--------+--------------------+-------

## TFIDF Analysis

In [99]:
import plotly.express as px

def analyze_tfidf(word): 
  '''Plot the TF-IDF of a word across the classics and the fanfiction texts.

  Looks the word up in the 'dataplot' view and draws a horizontal bar chart with one
  bar segment per matching (text, part-of-speech) row, coloured by text type.

  Args:
    word: the token to look up. It is stripped and lowercased before matching, because
      the preprocessing pipeline lowercases every token.

  Returns:
    None. The figure is displayed; if the word occurs in no text a message is printed instead.
  '''
  # Local import keeps the function independent of which cells imported `col` earlier.
  from pyspark.sql.functions import col

  color_map = {"Classic":'#636EFA', "Fanfiction":'#EF553B'}

  # Obtain a dataframe with all occurrences of the given word.
  # Specifies the text it was found in and the corresponding tfidf value.
  # Filtering through the DataFrame API (instead of formatting the word into a SQL
  # string) keeps words containing quotes from breaking or altering the query.
  token = word.strip().lower()
  word_df = (spark.table("dataplot")
                  .where(col("Token") == token)
                  .select("ID", "Title", "Token", "Type", "TFIDF"))
  word_pd = word_df.toPandas()

  if word_pd.empty:
    print(f"'{word}' does not occur in any of the texts.")
    return

  # Generate plotly graph.
  # category_orders is keyed by the exact column name ("Type"); the previous
  # lowercase "type" key matched no column.
  fig = px.bar(word_pd, x="TFIDF", y="Title", color="Type", 
               title = f"Importance of '{word}' in Classics and Fanfiction", #facet_col="type"
               category_orders={"Type": ["Classic", "Fanfiction"]}, color_discrete_map = color_map, template='plotly_white')
  fig.update_layout(yaxis={'categoryorder':'total ascending'}, font=dict(size=14))
  fig.show()

### Analysis of 'beauty' in the texts

In [100]:
# The importance (TF-IDF) of 'beauty' serves as a metric of how much the texts focus on physical beauty.
analyze_tfidf('beauty')

Character Trait

In [101]:
# Plot the TF-IDF of the character-trait word 'brave' across all texts.
analyze_tfidf('brave')

Working women: the modern-day woman (and person) works, unlike women of the past.

In [98]:
# Plot the TF-IDF of 'work' across all texts.
analyze_tfidf('work')