<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Description" data-toc-modified-id="Description-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Description</a></span></li><li><span><a href="#Sparknlp-setup" data-toc-modified-id="Sparknlp-setup-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Sparknlp setup</a></span></li><li><span><a href="#Useful-Functions" data-toc-modified-id="Useful-Functions-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Useful Functions</a></span></li><li><span><a href="#Load-the-libraries" data-toc-modified-id="Load-the-libraries-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Load the libraries</a></span></li><li><span><a href="#sparknlp-API" data-toc-modified-id="sparknlp-API-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>sparknlp API</a></span></li><li><span><a href="#Train-test-split" data-toc-modified-id="Train-test-split-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Train test split</a></span></li><li><span><a href="#NLP-Pipeline" data-toc-modified-id="NLP-Pipeline-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>NLP Pipeline</a></span><ul class="toc-item"><li><span><a href="#Document-assembling" data-toc-modified-id="Document-assembling-7.1"><span class="toc-item-num">7.1&nbsp;&nbsp;</span>Document assembling</a></span></li><li><span><a href="#Tokenizing" data-toc-modified-id="Tokenizing-7.2"><span class="toc-item-num">7.2&nbsp;&nbsp;</span>Tokenizing</a></span></li><li><span><a href="#Normalizer" data-toc-modified-id="Normalizer-7.3"><span class="toc-item-num">7.3&nbsp;&nbsp;</span>Normalizer</a></span></li><li><span><a href="#Remove-stopwords" data-toc-modified-id="Remove-stopwords-7.4"><span class="toc-item-num">7.4&nbsp;&nbsp;</span>Remove stopwords</a></span></li><li><span><a href="#Stemming" data-toc-modified-id="Stemming-7.5"><span class="toc-item-num">7.5&nbsp;&nbsp;</span>Stemming</a></span></li><li><span><a href="#Finisher" data-toc-modified-id="Finisher-7.6"><span 
class="toc-item-num">7.6&nbsp;&nbsp;</span>Finisher</a></span></li><li><span><a href="#TF-IDF" data-toc-modified-id="TF-IDF-7.7"><span class="toc-item-num">7.7&nbsp;&nbsp;</span>TF-IDF</a></span></li><li><span><a href="#String-Indexing" data-toc-modified-id="String-Indexing-7.8"><span class="toc-item-num">7.8&nbsp;&nbsp;</span>String Indexing</a></span></li><li><span><a href="#Classifier-Model" data-toc-modified-id="Classifier-Model-7.9"><span class="toc-item-num">7.9&nbsp;&nbsp;</span>Classifier Model</a></span></li><li><span><a href="#Create-pipeline" data-toc-modified-id="Create-pipeline-7.10"><span class="toc-item-num">7.10&nbsp;&nbsp;</span>Create pipeline</a></span></li></ul></li><li><span><a href="#Train-the-model" data-toc-modified-id="Train-the-model-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Train the model</a></span></li><li><span><a href="#Model-Predictions" data-toc-modified-id="Model-Predictions-9"><span class="toc-item-num">9&nbsp;&nbsp;</span>Model Predictions</a></span></li><li><span><a href="#Model-evaluation" data-toc-modified-id="Model-evaluation-10"><span class="toc-item-num">10&nbsp;&nbsp;</span>Model evaluation</a></span></li></ul></div>

# Description
BBC data link: https://www.kaggle.com/yufengdev/bbc-text-categorization?#Get-the-data


# Sparknlp setup

In [1]:
#=============== setup sparknlp
import os
import sys

# Machine-specific locations of the Spark distribution and the conda
# environment's interpreter, plus the driver settings, written to the process
# environment in one go.
os.environ.update({
    "SPARK_HOME": "/Users/poudel/Softwares/Spark/spark-2.4.4-bin-hadoop2.7",
    "PYSPARK_PYTHON": "/Users/poudel/opt/miniconda3/envs/mysparknlp/bin/python",
    "PYSPARK_DRIVER_PYTHON": "jupyter",
    "PYSPARK_DRIVER_PYTHON_OPTS": "notebook",
})

# Make the packages installed in the `mysparknlp` environment importable here.
sys.path.append("/Users/poudel/opt/miniconda3/envs/mysparknlp/lib/python3.7/site-packages")
#================ setup sparknlp end

# Useful Functions

In [17]:
def show_methods(obj, ncols=7,
            start=None, inside=None,exclude=None,
            caps_only=False,lower_only=False,
                            printt=False):
    """Tabulate the public attribute names of an object.

    Names returned by ``dir(obj)`` that do not start with an underscore are
    filtered by the optional criteria below and laid out in a DataFrame
    with ``ncols`` columns.

    Parameters
    -----------
    obj: object
        Any Python object, e.g. list, pd.DataFrame or a module.
    ncols: int
        Number of columns in the output table.
    start: str or tuple or list
        Keep only names that start with this prefix (or with any of the
        given prefixes).
    inside: str or tuple or list
        Keep only names that contain this substring (or any of the given
        substrings).
    exclude: str or tuple or list
        Drop names that are exactly equal to this value (or to any of the
        given values).
    caps_only: bool
        Keep only names whose first character is uppercase.
    lower_only: bool
        Keep only names whose first character is lowercase.
    printt: bool
        Also print the dataframe (useful in a terminal).

    Returns
    -------
    pandas.DataFrame
        The remaining names split over ``ncols`` columns; unused cells hold
        empty strings.
    """
    # Imported locally so the helper does not depend on the notebook's
    # import cell having been executed first.
    import numpy as np
    import pandas as pd

    names = [name for name in dir(obj) if not name.startswith('_')]

    # exclude usual imports
    usual_imports = ['np','pd','os','sys','time','psycopg2',
                    'plt','string','px',
                    're','nltk','sklearn','spacy']
    names = [name for name in names if name not in usual_imports]

    # capital only (for classes)
    if caps_only:
        names = [name for name in names if name[0].isupper()]

    # lowercase only (method attributes)
    if lower_only:
        names = [name for name in names if name[0].islower()]

    # starts with something
    if isinstance(start, str):
        names = [name for name in names if name.startswith(start)]
    elif isinstance(start, (tuple, list)):
        # str.startswith accepts a tuple of prefixes, so each name is kept at
        # most once even when it matches several prefixes.
        prefixes = tuple(start)
        names = [name for name in names if name.startswith(prefixes)]

    # inside something
    if isinstance(inside, str):
        names = [name for name in names if inside in name]
    elif isinstance(inside, (tuple, list)):
        # any() keeps a name once no matter how many substrings it contains.
        names = [name for name in names
                 if any(part in name for part in inside)]

    # exclude exact names
    if isinstance(exclude, str):
        names = [name for name in names if name != exclude]
    elif isinstance(exclude, (tuple, list)):
        names = [name for name in names if name not in exclude]

    # output dataframe: one chunk of names per column
    df = pd.DataFrame(np.array_split(names, ncols)).T.fillna('')

    # for terminal sometimes we need to print
    if printt:
        print(df)

    return df

# Load the libraries

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns

# Show up to 100 columns when displaying a DataFrame. The fully qualified key
# is used because abbreviated keys are resolved by pattern matching and can
# match more than one option on newer pandas versions.
pd.set_option('display.max_columns', 100)

import time,os,json
# Wall-clock timestamp taken when this cell runs (presumably for reporting the
# notebook's total run time; it is not read again in the cells shown here).
time_start_notebook = time.time()
# Current user's home directory.
home = os.path.expanduser('~')
# Seed passed to randomSplit for the train/test split.
SEED=100

import matplotlib.pyplot as plt
# Use the 'ggplot' style sheet for matplotlib figures.
plt.style.use('ggplot')
# IPython magics: draw figures inline in the notebook, in 'retina' format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Cell output: (name, version) of the core libraries, recorded for reproducibility.
[(x.__name__,x.__version__) for x in [np,pd,sns]]

[('numpy', '1.19.1'), ('pandas', '0.23.4'), ('seaborn', '0.10.1')]

In [28]:
import sparknlp
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

# Start Spark through Spark NLP's start() helper; the returned `spark` object
# is what the notebook uses below to read the CSV file.
spark = sparknlp.start()

In [48]:
import pyspark.sql.functions as F

In [4]:
# Where the data lives and how the CSV should be parsed.
file_location = r'bbc-text.csv'
file_type = "csv"
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# Read the file in a single load() call; the reader options are passed as
# keyword arguments instead of chained .option() calls.
sdf = spark.read.load(
    file_location,
    format=file_type,
    inferSchema=infer_schema,
    header=first_row_is_header,
    sep=delimiter,
)

# Number of rows read.
sdf.count()

2225

In [5]:
sdf.show(2)

+--------+--------------------+
|category|                text|
+--------+--------------------+
|    tech|tv future in the ...|
|business|worldcom boss  le...|
+--------+--------------------+
only showing top 2 rows



In [46]:
sdf.groupby('category').count().show()

+-------------+-----+
|     category|count|
+-------------+-----+
|        sport|  511|
|     politics|  417|
|entertainment|  386|
|     business|  510|
|         tech|  401|
+-------------+-----+



In [50]:
sdf.agg(F.countDistinct("category")).show()

+------------------------+
|count(DISTINCT category)|
+------------------------+
|                       5|
+------------------------+



# sparknlp API

In [18]:
show_methods(sparknlp)

Unnamed: 0,0,1,2,3,4,5,6
0,Chunk2Doc,Finisher,TokenAssembler,annotator,base,embeddings,start
1,Doc2Chunk,SparkSession,annotation,annotators,common,internal,version
2,DocumentAssembler,,,,,,


In [20]:
show_methods(sparknlp.base,3)

Unnamed: 0,0,1,2
0,ABC,Finisher,RecursiveEstimator
1,Annotation,HasRecursiveFit,RecursivePipeline
2,AnnotatorProperties,HasRecursiveTransform,RecursivePipelineModel
3,AnnotatorTransformer,JavaEstimator,RecursiveTransformer
4,Chunk2Doc,LightPipeline,TokenAssembler
5,Doc2Chunk,Param,Transformer
6,DocumentAssembler,Params,TypeConverters
7,EmbeddingsFinisher,Pipeline,keyword_only
8,Estimator,PipelineModel,


In [22]:
show_methods(sparknlp.common,3)

Unnamed: 0,0,1,2
0,AnnotatorApproach,HasExcludableStorage,Param
1,AnnotatorModel,HasStorage,Params
2,AnnotatorProperties,HasStorageModel,ReadAs
3,CoverageResult,HasStorageRef,RecursiveAnnotatorApproach
4,ExternalResource,JavaEstimator,RegexRule
5,HasCaseSensitiveProperties,JavaMLWritable,TypeConverters
6,HasEmbeddingsProperties,JavaModel,keyword_only


In [24]:
show_methods(sparknlp.embeddings,3)

Unnamed: 0,0,1,2
0,AlbertEmbeddings,NGramGenerator,Tokenizer
1,AnnotatorApproach,NerApproach,TokenizerModel
2,AnnotatorModel,NerConverter,TypeConverters
3,AnnotatorProperties,NerCrfApproach,TypedDependencyParserApproach
4,BertEmbeddings,NerCrfModel,TypedDependencyParserModel
5,BertSentenceEmbeddings,NerDLApproach,UniversalSentenceEncoder
6,BigTextMatcher,NerDLModel,ViveknSentimentApproach
7,BigTextMatcherModel,NerOverwriter,ViveknSentimentModel
8,ChunkEmbeddings,Normalizer,WordEmbeddings
9,ChunkTokenizer,NormalizerModel,WordEmbeddingsModel


# Train test split

In [6]:
# Random split with weights 0.7 / 0.3; the fixed SEED is meant to make the
# split repeatable for the same input data.
sdf_train, sdf_test = sdf.randomSplit([0.7, 0.3], seed=SEED)

# Row counts of the two splits (the weights give approximate, not exact, proportions).
sdf_train.count(), sdf_test.count()

(1561, 664)

# NLP Pipeline

In [7]:
from pyspark.ml.feature import HashingTF, IDF, StringIndexer, SQLTransformer,IndexToString
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

## Document assembling

In [8]:
# First Spark NLP stage: reads the raw "text" column and writes the
# "document" column that the tokenizer consumes.
document_assembler = (
    sparknlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

## Tokenizing

In [11]:
# Split each document into tokens. The Tokenizer is taken from
# sparknlp.annotator, the same namespace the Normalizer, StopWordsCleaner and
# Stemmer stages use, rather than via the sparknlp.embeddings module, which
# merely happens to expose the same class.
tokenizer = sparknlp.annotator.Tokenizer() \
  .setInputCols(["document"]) \
  .setOutputCol("token")

## Normalizer

In [12]:
 normalizer = sparknlp.annotator.Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")

## Remove stopwords

In [13]:
# Drop stop words from the normalized tokens, matching case-insensitively.
# The input columns are passed as a list, like every other stage in this
# notebook.
stopwords_cleaner = sparknlp.annotator.StopWordsCleaner()\
      .setInputCols(["normalized"])\
      .setOutputCol("cleanTokens")\
      .setCaseSensitive(False)

## Stemming

In [14]:
# Stem the remaining tokens: reads "cleanTokens", writes "stem".
stemmer = (
    sparknlp.annotator.Stemmer()
    .setInputCols(["cleanTokens"])
    .setOutputCol("stem")
)

## Finisher

In [16]:
# Turn the "stem" annotations into the array column "token_features", which
# HashingTF consumes; the intermediate annotation columns are kept
# (cleanAnnotations=False).
finisher = (
    sparknlp.base.Finisher()
    .setInputCols(["stem"])
    .setOutputCols(["token_features"])
    .setOutputAsArray(True)
    .setCleanAnnotations(False)
)

## TF-IDF

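Ref (Spark ML feature transformers, including TF-IDF; version matching the Spark 2.4.4 install used here): https://spark.apache.org/docs/2.4.4/ml-features.html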

In [29]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [33]:
# Term frequencies: hash the finished tokens into a 1000-dimensional vector.
hashingTF = HashingTF(
    numFeatures=1000,
    inputCol="token_features",
    outputCol="rawFeatures",
)

# Re-weight the raw term frequencies by inverse document frequency.
# minDocFreq=5 is the minimum number of documents a term must appear in
# (see the Spark IDF documentation for how rarer terms are treated).
idf = IDF(
    minDocFreq=5,
    inputCol="rawFeatures",
    outputCol="features",
)

## String Indexing

In [31]:
from pyspark.ml.feature import StringIndexer

In [30]:
# Encode the string "category" column as the numeric "label" column that the
# classifier and the evaluators read.
label_stringIdx = StringIndexer(outputCol="label", inputCol="category")

## Classifier Model

In [32]:
from pyspark.ml.classification import LogisticRegression

In [35]:
# Logistic-regression classifier configured through its setters:
# 10 iterations, regularization strength 0.3, elastic-net mixing 0.0
# (per the Spark ML docs, 0.0 selects a pure L2 penalty).
logreg = (
    LogisticRegression()
    .setMaxIter(10)
    .setRegParam(0.3)
    .setElasticNetParam(0.0)
)

In [36]:
# Map the numeric "label" column back to its string form in "article_class".
# NOTE(review): the input is the true label index, not the model's
# "prediction" column, so "article_class" mirrors the original category --
# confirm whether decoding the prediction was intended.
label_to_stringIdx = IndexToString(
    outputCol="article_class",
    inputCol="label",
)

## Create pipeline

In [37]:
from pyspark.ml import Pipeline

In [38]:
# Chain every stage in execution order: Spark NLP text preprocessing,
# TF-IDF features, label indexing, the classifier, and label decoding.
# The classifier stage is `logreg`, the LogisticRegression built above; no
# variable named `lr` is defined anywhere in this notebook, so referencing it
# fails with NameError on a fresh kernel.
nlp_pipeline = Pipeline(
    stages=[document_assembler,
            tokenizer,
            normalizer,
            stopwords_cleaner,
            stemmer,
            finisher,
            hashingTF,
            idf,
            label_stringIdx,
            logreg,
            label_to_stringIdx])

# Train the model

In [39]:
# Fit all pipeline stages on the training split only.
pipeline_model = nlp_pipeline.fit(sdf_train)

# Model Predictions

In [40]:
# Apply the fitted pipeline to the held-out test split; the evaluators below
# read the "label" and "prediction" columns of this result.
predictions =  pipeline_model.transform(sdf_test)

# Model evaluation

In [41]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy on the test-split predictions: compares the "prediction" column
# with the "label" column.
evaluator = (
    MulticlassClassificationEvaluator()
    .setLabelCol("label")
    .setPredictionCol("prediction")
    .setMetricName("accuracy")
)

accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g" % (accuracy))

Accuracy = 0.957831


In [42]:
# Weighted precision on the test-split predictions ("prediction" vs "label").
evaluator = (
    MulticlassClassificationEvaluator()
    .setLabelCol("label")
    .setPredictionCol("prediction")
    .setMetricName("weightedPrecision")
)

weightedPrecision = evaluator.evaluate(predictions)
print("weightedPrecision = %g" % (weightedPrecision))

weightedPrecision = 0.958096


In [43]:
# Weighted recall on the test-split predictions ("prediction" vs "label").
evaluator = (
    MulticlassClassificationEvaluator()
    .setLabelCol("label")
    .setPredictionCol("prediction")
    .setMetricName("weightedRecall")
)

weightedRecall = evaluator.evaluate(predictions)
print("weightedRecall = %g" % (weightedRecall))

weightedRecall = 0.957831
