In [5]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark import SparkContext

spark = SparkSession.builder \
    .appName("project")\
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.4")\
    .config("spark.sql.broadcastTimeout", "36000")\
    .getOrCreate()

sc = spark.sparkContext
sc

In [6]:
spark

In [7]:
from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import (Tokenizer, Normalizer, 
                                LemmatizerModel, StopWordsCleaner)
from pyspark.ml import Pipeline

In [8]:
import nltk
nltk.download('stopwords')
nltk.download('words')

from nltk.corpus import stopwords

eng_stopwords = stopwords.words('english')
eng_stopwords.append('xxxx')

[nltk_data] Downloading package stopwords to /home/hadoop/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /home/hadoop/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [66]:
documentAssembler = DocumentAssembler() \
    .setInputCol('context1') \
    .setOutputCol('document')

tokenizer = Tokenizer() \
    .setInputCols(['document']) \
    .setOutputCol('token')

# note normalizer defaults to changing all words to lowercase.
# Use .setLowercase(False) to maintain input case.
normalizer = Normalizer() \
    .setInputCols(['token']) \
    .setOutputCol('normalized') \
    .setLowercase(True)

# note that lemmatizer needs a dictionary. So I used the pre-trained
# model (note that it defaults to english)
lemmatizer = LemmatizerModel.pretrained() \
    .setInputCols(['normalized']) \
    .setOutputCol('lemma') \

stopwords_cleaner = StopWordsCleaner() \
    .setInputCols(['lemma']) \
    .setOutputCol('clean_lemma') \
    .setCaseSensitive(False) \
    .setStopWords(eng_stopwords)

# finisher converts tokens to human-readable output
finisher = Finisher() \
    .setInputCols(['clean_lemma']) \
    .setCleanAnnotations(False)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [67]:
pipeline = Pipeline() \
    .setStages([
        documentAssembler,
        tokenizer,
        normalizer,
        lemmatizer,
        stopwords_cleaner,
        finisher
    ])

In [11]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

schema = StructType([
    StructField("article_type", StringType(), True),
    StructField("np1", StringType(), True),
    StructField("np2", StringType(), True),
    StructField("context", StringType(), True),
    StructField("source", StringType(), True),
    StructField("category", StringType(), True),
    StructField("location", StringType(), True),
    StructField("time", StringType(), True),])

In [12]:
df = spark.read.csv("s3://anly502project/data/part-r-00000",sep = "\t",header=False,schema=schema)
df_2 = spark.read.csv("s3://anly502project/data/part-r-00001",sep = "\t",header=False,schema=schema)

In [13]:
#### Data Schema
df.printSchema()
df_2.printSchema()

root
 |-- article_type: string (nullable = true)
 |-- np1: string (nullable = true)
 |-- np2: string (nullable = true)
 |-- context: string (nullable = true)
 |-- source: string (nullable = true)
 |-- category: string (nullable = true)
 |-- location: string (nullable = true)
 |-- time: string (nullable = true)

root
 |-- article_type: string (nullable = true)
 |-- np1: string (nullable = true)
 |-- np2: string (nullable = true)
 |-- context: string (nullable = true)
 |-- source: string (nullable = true)
 |-- category: string (nullable = true)
 |-- location: string (nullable = true)
 |-- time: string (nullable = true)



In [14]:
import functools 

def unionAll(dfs):
    return functools.reduce(lambda df1,df2: df1.union(df2.select(df1.columns)), dfs) 

In [15]:
unioned_df = unionAll([df, df_2])

In [16]:
#### show combined
unioned_df.show(10)

+------------+---------------+---+--------------------+-------------+--------------------+--------------------+-----+
|article_type|            np1|np2|             context|       source|            category|            location| time|
+------------+---------------+---+--------------------+-------------+--------------------+--------------------+-----+
|     article|    Dark Knight|  E|  arg1 and Wall arg2|             |intlnews topstor ...|      , kerala india|14299|
|     article|    Carotenoids|  E|arg1 and caroteno...|             |topstor,health,sc...|                   ,|14660|
|     article|    Communities|  E|arg1 mobilised in...|             |    politics topstor|                   ,|14026|
|     article|    Carotenoids|  E|arg1 and caroteno...|             |topstor,health,sc...|                   ,|14660|
|            |     Coast bias|  E|arg2 is for East ...|             |      sports topstor| columbus, ohio u...|13956|
|     article|Commerce office|  E|arg1 at DDD Linco...| 

In [17]:
from pyspark.sql.functions import split
split_col = split(unioned_df['category'], ',')
unioned_df = unioned_df.withColumn('category', split_col.getItem(0))

In [18]:
split_col_2 = split(unioned_df['category'], ' ')
unioned_df = unioned_df.withColumn('category', split_col_2.getItem(0))

In [19]:
split_col_3 = split(unioned_df['category'], '_')
unioned_df = unioned_df.withColumn('category', split_col_3.getItem(0))

In [20]:
split_col_4 = split(unioned_df['category'], '-')
unioned_df = unioned_df.withColumn('category', split_col_4.getItem(0))

In [21]:
#### Show data frame after filtering the category
unioned_df.show(10)

+------------+---------------+---+--------------------+-------------+---------+--------------------+-----+
|article_type|            np1|np2|             context|       source| category|            location| time|
+------------+---------------+---+--------------------+-------------+---------+--------------------+-----+
|     article|    Dark Knight|  E|  arg1 and Wall arg2|             | intlnews|      , kerala india|14299|
|     article|    Carotenoids|  E|arg1 and caroteno...|             |  topstor|                   ,|14660|
|     article|    Communities|  E|arg1 mobilised in...|             | politics|                   ,|14026|
|     article|    Carotenoids|  E|arg1 and caroteno...|             |  topstor|                   ,|14660|
|            |     Coast bias|  E|arg2 is for East ...|             |   sports| columbus, ohio u...|13956|
|     article|Commerce office|  E|arg1 at DDD Linco...|             |  topstor| canton, ohio uni...|14363|
|     article| 75-minute mark|  E|arg

In [64]:
from pyspark.sql.functions import concat, col, lit

unioned_df1 = unioned_df.select(concat(col("np1"), lit(' '),col("context")).alias('context1'))
unioned_df1.show()

+--------------------+
|            context1|
+--------------------+
|Dark Knight arg1 ...|
|Carotenoids arg1 ...|
|Communities arg1 ...|
|Carotenoids arg1 ...|
|Coast bias arg2 i...|
|Commerce office a...|
|75-minute mark ar...|
|Brigham Circle ar...|
|Brigham Circle ar...|
|Brigham Circle ar...|
|Drill Sergeant ar...|
|Brigham Circle ar...|
|Brigham Circle ar...|
|Brigham Circle ar...|
|Brigham Circle ar...|
|Cold arg2 Cape co...|
|Cook arg1 Off is ...|
|Boob arg1 How my ...|
|2002 arg2 Street ...|
|Boob arg1 How my ...|
+--------------------+
only showing top 20 rows



In [65]:
from pyspark.sql.functions import monotonically_increasing_id

df1 = unioned_df.withColumn("id", monotonically_increasing_id())

df2 = unioned_df1.withColumn("id", monotonically_increasing_id())


df3 = df2.join(df1, "id", "outer").drop("id")

df3.show()

+--------------------+------------+------------------+---+--------------------+--------------------+---------+--------------------+-----+
|            context1|article_type|               np1|np2|             context|              source| category|            location| time|
+--------------------+------------+------------------+---+--------------------+--------------------+---------+--------------------+-----+
|Dane arg1 gain In...|     article|              Dane|  E|arg1 gain In Grou...|                    | intlnews|                   ,|14223|
|Dubai arg2 arlier...|            |             Dubai|  E|arg2 arlier in th...|                    |localnews| nampa, idaho uni...|13892|
|Coffee arg1 Tycoo...|            |            Coffee|  E|arg1 Tycoon is ra...|     Blogcritics.org|  topstor|                  , |13765|
|10 years arg2 for...|     article|          10 years|  E|arg2 for Everyone...|TheCelebrityCafe.com|  topstor|    ,  united states|14669|
|100-acre parcel a...|     article

In [68]:
#### Filter out the null and empty category 
df3 =df3.filter(df3.category.isNotNull())
df3 = df3.filter(df3.category != '')

In [69]:
import pyspark.sql as sql
count_df = df3.groupBy("category").count()
count_df.createOrReplaceTempView("count_df")
count_rank_df = spark.sql("SELECT category, count FROM count_df ORDER BY count DESC LIMIT 15")

In [70]:
count_rank_df.show(10)

+-------------+--------+
|     category|   count|
+-------------+--------+
|      topstor|85027590|
|    localnews|40174779|
|       sports|22925153|
|     business|20017107|
| nationalnews| 5073191|
|     intlnews| 4252106|
|    technolog| 2546612|
|entertainment| 2359067|
|     politics| 2092784|
|     lifestle| 1785076|
+-------------+--------+
only showing top 10 rows



In [71]:
df3.createOrReplaceTempView("unioned_df")
count_rank_df.createOrReplaceTempView("count_rank_df")
df_final = spark.sql("SELECT * FROM unioned_df WHERE unioned_df.category IN (SELECT category FROM count_rank_df)")

In [None]:
df_final.show(10)

In [27]:
categories = [
 'politics',
'science',
'health',
'technolog',
'entertainment']

In [28]:
from pyspark.sql.functions import explode, col

# initialize {company: {word counts}} dictionary
category_word_counts_dict = {category: {} for category in categories}

for category in categories:
    print(category)
    # get complaint narratives
    category_df = df_final.filter(df_final['category'] == category)
    data = category_df.select('context')
    
    # process narratives into counts dictionary
    clean_data = pipeline.fit(data).transform(data)
    clean_data_exploded = clean_data.withColumn("exploded_text", explode(col("finished_clean_lemma")))
    counts = clean_data_exploded.groupby('exploded_text').count().toPandas()
    counts_dict = {counts.loc[i, 'exploded_text']: counts.loc[i, 'count'] for i in range(counts.shape[0])}
    
    # add counts to dictionary
    category_word_counts_dict[category] = counts_dict

politics
science
health
technolog
entertainment


In [29]:
category_word_counts_dict

{'politics': {'pitcher': 48,
  'lawlor': 4,
  'still': 3295,
  'lieutenant': 68,
  'reintegration': 11,
  'nunez': 8,
  'requirement': 533,
  'thwart': 249,
  'embrace': 777,
  'travel': 1723,
  'hope': 3179,
  'input': 40,
  'priority': 372,
  'fiscally': 17,
  'recognize': 927,
  'medicare': 246,
  'cramp': 16,
  'inner': 53,
  'roundabout': 3,
  'online': 509,
  'everyday': 70,
  'barrier': 94,
  'trail': 480,
  'earl': 10,
  'electrical': 191,
  'atef': 3,
  'art': 232,
  'precautionary': 8,
  'interaction': 41,
  'hanna': 16,
  'tarnish': 49,
  'ransom': 47,
  'vladimir': 118,
  'sariev': 1,
  'carnegie': 24,
  'marxism': 2,
  'elevate': 56,
  'nyayo': 2,
  'randomly': 144,
  'spoil': 80,
  'poach': 22,
  'implore': 104,
  'dts': 3,
  'nauseate': 1,
  'indigenous': 80,
  'timetable': 227,
  'peerage': 11,
  'abruptness': 1,
  'disassemble': 7,
  'transaction': 82,
  'indicator': 32,
  'crest': 10,
  'confidentiality': 50,
  'persist': 79,
  'degrade': 21,
  'likelihood': 26,
  'sh

In [30]:

def term_frequency(BoW_dict):
    tot_words = sum(BoW_dict.values())
    freq_dict = {word: BoW_dict[word]/tot_words for word in BoW_dict.keys()}
    return freq_dict

In [31]:

from math import log

def inverse_document_frequency(list_of_dicts):
    tot_docs = len(list_of_dicts)
    words = set([w for w_dict in list_of_dicts for w in w_dict.keys()])
    idf_dict = {word: log(float(tot_docs)/(1.0+ sum([1 for w_dict in list_of_dicts if word in w_dict.keys()]))) for word in words}
    return idf_dict

In [32]:
def tf_idf(list_of_dicts):
    words = set([w for w_dict in list_of_dicts for w in w_dict.keys()])
    tf_idf_dicts = []
    idfs = inverse_document_frequency(list_of_dicts)
    for w_dict in list_of_dicts:
        w_dict.update({word: 0 for word in words if word not in w_dict.keys()})
        tf = term_frequency(w_dict)
        tf_idf_dicts.append({word: tf[word]*idfs[word] for word in words})
    return tf_idf_dicts


In [33]:
list_of_word_dicts = [category_word_counts_dict[category] for category in categories]
tf_idf_by_category_list = tf_idf(list_of_word_dicts)
tf_idf_by_category_dict = {c: tf_dict for c, tf_dict in zip(categories, tf_idf_by_category_list)}

In [34]:
tf_idf_by_category_dict

{'politics': {'treason': 0.0,
  'bayartsaihan': 0.0,
  'lundeen': 1.3075067388148017e-07,
  'addiction': -4.200024191105487e-07,
  'orac': 0.0,
  'cimo': 1.1726666115596369e-07,
  'pingfederate': 0.0,
  'bartlett': -3.0333508046872964e-06,
  'realignment': 0.0,
  'txi': 0.0,
  'curbside': 0.0,
  'subrogated': 1.1726666115596369e-07,
  'merger': -1.166673386418191e-06,
  'yeast': 0.0,
  'beltre': 0.0,
  'futureheads': 0.0,
  'roadcasting': 0.0,
  'attract': -4.246691126562215e-05,
  'jaipuria': 0.0,
  'yeng': 0.0,
  'propulsion': 0.0,
  'creditable': 0.0,
  'mazzaro': 0.0,
  'smo': 0.0,
  'navigator': 0.0,
  'gutless': 0.0,
  'unknownst': 0.0,
  'thurbert': 0.0,
  'silas': 0.0,
  'journalist': -4.9000282229564025e-06,
  'cmss': 0.0,
  'namedto': 0.0,
  'allium': 0.0,
  'nasa': -6.066701609374593e-07,
  'jdm': 0.0,
  'bisphenol': 0.0,
  'xmlhttp': 0.0,
  'tompetty': 0.0,
  'aurorae': 0.0,
  'wilmerhale': 5.863333057798185e-07,
  'barrymore': 0.0,
  'spiderland': 0.0,
  'benigni': 0.0,
  