In [1]:
import sparknlp
spark = sparknlp.start() # for GPU training >> sparknlp.start(gpu = True) # for Spark 2.3 =>> sparknlp.start(spark23 = True)
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
import pandas as pd
from allComponents import Components
from pyspark.sql import SQLContext
from pyspark import SparkContext
from functools import reduce
from pyspark.sql.functions import udf
from pyspark.ml.classification import LogisticRegression,LinearSVC
from pyspark.ml.classification import RandomForestClassifier, OneVsRest
from pyspark.ml.classification import NaiveBayes,MultilayerPerceptronClassifier
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import GBTClassifier
from pyspark.sql.functions import *
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
sc =SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

In [37]:
import pandas as pd
dataframe = pd.read_csv('../Document Categorisation.csv',engine="python")
dataframe['path'] = [i.split('\\') for i in dataframe['Location']]
dataframe.to_csv('../PathCategorisation.csv')

In [38]:
dataframe

Unnamed: 0,Document Name,Location,Category 1 (Mandatory),Category 2 (Optional),Category 3 (Optional),path
0,Complaints policy.docx,C:\Users\classifier-admin\Desktop\Data\ACQSC (...,Policy,,,"[C:, Users, classifier-admin, Desktop, Data, A..."
1,abbeyhouseagedcare2610-6.pdf,C:\Users\classifier-admin\Desktop\Data\AACQA (...,Audit,Abbey House Aged Care,,"[C:, Users, classifier-admin, Desktop, Data, A..."
2,aacqa_annual_report_2017-18.pdf,C:\Users\classifier-admin\Desktop\Data\AACQA (...,Annual Report,,,"[C:, Users, classifier-admin, Desktop, Data, A..."
3,aacqa_annual_report_accessibility_17_november_...,C:\Users\classifier-admin\Desktop\Data\AACQA (...,Annual Report,,,"[C:, Users, classifier-admin, Desktop, Data, A..."
4,aacqa_table_a_executive_remuneration.docx,C:\Users\classifier-admin\Desktop\Data\AACQA (...,Annual Report,Executive Remuneration,,"[C:, Users, classifier-admin, Desktop, Data, A..."
...,...,...,...,...,...,...
562,NFP-Principles-and-Guidance-131015.pdf,C:\Users\classifier-admin\Desktop\Data\Handbooks,Handbook,,,"[C:, Users, classifier-admin, Desktop, Data, H..."
563,Best in Care Australia - Actions.pdf,C:\Users\classifier-admin\Desktop\Data,Other,,,"[C:, Users, classifier-admin, Desktop, Data]"
564,Bupa Seaforth - Actions.pdf,C:\Users\classifier-admin\Desktop\Data,Other,,,"[C:, Users, classifier-admin, Desktop, Data]"
565,PCA018_Guiding-Principles-for-PC-Aged-Care_W03...,C:\Users\classifier-admin\Desktop\Data,Other,,,"[C:, Users, classifier-admin, Desktop, Data]"


In [2]:
def create_dataset():
    data = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('../dataset.csv')
    data1 = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('../PathCategorisation.csv')
    df = data.join(data1, (data['filename'] == data1['Document Name']))
    oldColumns = ['filename','filepath','filetype','filesize','filetext','translatedtext','Document Name','Location','Category 1 (Mandatory)','Category 2 (Optional)','Category 3 (Optional)']
    newColumns = ['filename','filepath','filetype','filesize','filetext','translatedtext','DocumentName','Location','Category1(Mandatory)','Category2(Optional)','Category3(Optional)']
    df = reduce(lambda data, idx: data.withColumnRenamed(oldColumns[idx], newColumns[idx]), range(len(oldColumns)), df)
    drop_list = ['filename', 'filepath','filetext','Category3(Optional)']
    result = df.select([column for column in df.columns if column not in drop_list])
    #result.show(5)
    return result

df = create_dataset()

In [3]:
df.show(5)

+--------+--------+--------------------+---+--------------------+--------------------+--------------------+-------------------+--------------------+
|filetype|filesize|      translatedtext|_c0|        DocumentName|            Location|Category1(Mandatory)|Category2(Optional)|                path|
+--------+--------+--------------------+---+--------------------+--------------------+--------------------+-------------------+--------------------+
|     pdf|  110537|Sector performanc...|566|Sector performanc...|C:\Users\classifi...|               Other|               null|['C:', 'Users', '...|
|     pdf|   61331|Bupa Seaforth   T...|564|Bupa Seaforth - A...|C:\Users\classifi...|               Other|               null|['C:', 'Users', '...|
|     pdf|  123920|Best in Care Aust...|563|Best in Care Aust...|C:\Users\classifi...|               Other|               null|['C:', 'Users', '...|
|     pdf|  626923|Principles for Pa...|565|PCA018_Guiding-Pr...|C:\Users\classifi...|               Other

In [4]:
df = df.na.fill("NotSpecified")#Fill empty
df.show(1)

+--------+--------+--------------------+---+--------------------+--------------------+--------------------+-------------------+--------------------+
|filetype|filesize|      translatedtext|_c0|        DocumentName|            Location|Category1(Mandatory)|Category2(Optional)|                path|
+--------+--------+--------------------+---+--------------------+--------------------+--------------------+-------------------+--------------------+
|     pdf|  110537|Sector performanc...|566|Sector performanc...|C:\Users\classifi...|               Other|       NotSpecified|['C:', 'Users', '...|
+--------+--------+--------------------+---+--------------------+--------------------+--------------------+-------------------+--------------------+
only showing top 1 row



In [5]:
def train_test_split(dataframe):
    (trainingData, testData) = dataframe.randomSplit([0.8, 0.2], seed = 100)
    return trainingData, testData

In [6]:
def process_train_test_data(trainingData,testData,outputCol):
    @udf("long")
    def num_nonzeros(v):
        return v.numNonzeros()
    testData = testData.where(num_nonzeros(outputCol) != 0)
    trainingData = trainingData.where(num_nonzeros(outputCol) != 0)
    return trainingData,testData

In [7]:
def get_classification_report(dataframe,inputCol,outputCol):
    evaluator = MulticlassClassificationEvaluator(predictionCol=outputCol)
    print("MulticlassEvaluator score: ",evaluator.evaluate(dataframe))
    df = dataframe.select(inputCol,outputCol,"prediction").toPandas()
    print(classification_report(df.label, df.prediction))
    print(accuracy_score(df.label, df.prediction))

In [8]:
def logistic_regression(trainingData,testData):
    lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0)
    return lr.fit(trainingData).transform(testData)

In [9]:
def oneRest(trainingData,testData):
    lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)
    ovr = OneVsRest(classifier=lr)
    return ovr.fit(trainingData).transform(testData)

In [10]:
def random_forest_classifier(trainingData,testData,inputCol,outputCol,bins):
    rf = RandomForestClassifier(labelCol=outputCol, \
                            featuresCol=inputCol, \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = bins)

    # Train model with Training Data
    return rf.fit(trainingData).transform(testData)

In [11]:
def naive_bayes(trainingData,testData):
    nb = NaiveBayes(smoothing=1)
    return nb.fit(trainingData).transform(testData)

In [12]:
def get_pipeline1(choice,outCol,*inputCol):
    c = Components()
    allStages = [c.getDocumentAssembler(inputCol[0],"document"),c.getTokenizer("document","tokens"),\
                  c.getNormalizer("tokens","normalized"),\
                c.getFinisher("normalized","finished")]
    if choice==0:#CountVectorizer
        allStages.extend([c.getCountVectorizer("finished","locFeature"),c.getStringIndexer(inputCol[1],"typeFeature"),\
                          c.getVectorAssembler(["locFeature","typeFeature",inputCol[2]],"features"),\
                          c.getStringIndexer(outCol,"label")])
        return Pipeline(stages=allStages)
    elif choice==1:#Tf-idf
        allStages.extend([c.getTf("finished","tf"),c.getIdf("tf","locFeature"),\
                          c.getStringIndexer(inputCol[1],"typeFeature"),\
                          c.getVectorAssembler(["locFeature","typeFeature",inputCol[2]],"features"),\
                          c.getStringIndexer(outCol,"label")])
        return Pipeline(stages=allStages)

# For Label 2

Pipeline 1: Using CountVectorizer

In [13]:
inputCol = "path"
outputCol = "Category2(Optional)"
pipeline = get_pipeline1(0,outputCol,inputCol,"filetype","filesize")
processed_df = pipeline.fit(df).transform(df)
trainingData,testData = train_test_split(processed_df)
bins = len(df.select(outputCol).distinct().collect())

In [14]:
{c.name: c.metadata["ml_attr"]["vals"] for c in processed_df.schema.fields if c.name.endswith("label")}

{'label': ['NotSpecified',
  'Feedback',
  'Storyboard',
  'Poster',
  'Booklet',
  'Communique',
  'A Little Yarn',
  'Acacia Living Group Meadow Springs Aged',
  'Fact sheet',
  'ACDMA Aged Hostel',
  'Report',
  'ACH Group Residential Care',
  'Abernethy Nursing Home',
  'Mingarra Hostel',
  'Abel Tasman Village',
  'A.G.Eastwood Hostel',
  'Abbeyfield House Hostel',
  '501 Care Services',
  'Application Document',
  'Abberfield Aged Care Facility',
  '70 Lowe Street',
  'Abbey House Aged Care',
  'Quality and Safety',
  'A H Orr Lodge',
  'Complaints',
  'RSL Menora Gardens Aged Care Facility',
  'Greek',
  'Russian',
  'Polish',
  'Kapara Nursing Home',
  'Macedonian',
  'RSL War Veterans Home Mandurah',
  'Consumer Experience',
  'Hungarian',
  'Perry Park Nursing Home',
  'Arabic',
  'Highercombe',
  'Dutch',
  'Milpara Aged Care Facility',
  'The Abbey Nursing Home',
  'German',
  'Perry Park Hostel',
  'Hindi',
  'Colton Court Nursing',
  'Korean',
  'Serbian',
  'Mental Healt

In [14]:
#Logistic Regression
print("# Logistic Regression\n")
get_classification_report(logistic_regression(trainingData,testData),"features","label")

# Logistic Regression

MulticlassEvaluator score:  0.9999999999999996
              precision    recall  f1-score   support

         0.0       0.44      1.00      0.61        24
         1.0       0.95      1.00      0.97        18
         2.0       1.00      0.57      0.73         7
         3.0       1.00      0.60      0.75         5
         4.0       1.00      1.00      1.00         4
         5.0       1.00      1.00      1.00         1
         6.0       1.00      1.00      1.00         5
         8.0       0.00      0.00      0.00         5
         9.0       1.00      1.00      1.00         4
        10.0       0.00      0.00      0.00         1
        13.0       1.00      1.00      1.00         3
        14.0       1.00      1.00      1.00         2
        15.0       1.00      1.00      1.00         1
        17.0       1.00      1.00      1.00         2
        18.0       0.00      0.00      0.00         1
        19.0       1.00      1.00      1.00         2
        20.

  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
#Random Forest
print("# Random Forest")
get_classification_report(random_forest_classifier(trainingData,testData,"features","label",bins),"features","label")    


# Random Forest
MulticlassEvaluator score:  0.9999999999999996
              precision    recall  f1-score   support

         0.0       0.31      1.00      0.47        24
         1.0       0.93      0.72      0.81        18
         2.0       1.00      0.57      0.73         7
         3.0       1.00      0.60      0.75         5
         4.0       1.00      1.00      1.00         4
         5.0       1.00      1.00      1.00         1
         6.0       1.00      1.00      1.00         5
         8.0       0.00      0.00      0.00         5
         9.0       0.00      0.00      0.00         4
        10.0       0.00      0.00      0.00         1
        13.0       0.00      0.00      0.00         3
        14.0       0.00      0.00      0.00         2
        15.0       0.00      0.00      0.00         1
        17.0       0.00      0.00      0.00         2
        18.0       0.00      0.00      0.00         1
        19.0       0.00      0.00      0.00         2
        20.0      

In [16]:
#Naive Bayes
print("# Naive Bayes")
get_classification_report(naive_bayes(trainingData,testData),"features","label")


# Naive Bayes
MulticlassEvaluator score:  0.9999999999999996
              precision    recall  f1-score   support

         0.0       0.62      0.42      0.50        24
         1.0       0.54      0.72      0.62        18
         2.0       1.00      0.57      0.73         7
         3.0       0.19      1.00      0.31         5
         4.0       1.00      1.00      1.00         4
         5.0       0.17      1.00      0.29         1
         6.0       1.00      1.00      1.00         5
         7.0       0.00      0.00      0.00         0
         8.0       0.00      0.00      0.00         5
         9.0       0.67      0.50      0.57         4
        10.0       0.00      0.00      0.00         1
        11.0       0.00      0.00      0.00         0
        13.0       0.00      0.00      0.00         3
        14.0       1.00      0.50      0.67         2
        15.0       0.00      0.00      0.00         1
        17.0       0.00      0.00      0.00         2
        18.0       1

  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
#OneVsRest
print("# Onevsrest")
get_classification_report(oneRest(trainingData,testData),"features","label")

# Onevsrest
MulticlassEvaluator score:  0.9999999999999996
              precision    recall  f1-score   support

         0.0       0.82      0.75      0.78        24
         1.0       1.00      1.00      1.00        18
         2.0       0.58      1.00      0.74         7
         3.0       1.00      0.60      0.75         5
         4.0       1.00      1.00      1.00         4
         5.0       1.00      1.00      1.00         1
         6.0       1.00      1.00      1.00         5
         8.0       0.00      0.00      0.00         5
         9.0       1.00      1.00      1.00         4
        10.0       0.50      1.00      0.67         1
        13.0       1.00      1.00      1.00         3
        14.0       1.00      1.00      1.00         2
        15.0       1.00      1.00      1.00         1
        17.0       0.67      1.00      0.80         2
        18.0       0.33      1.00      0.50         1
        19.0       1.00      1.00      1.00         2
        20.0       1.0

Pipeline 2: Using Tf-idf

In [18]:
inputCol = "path"
outputCol = "Category2(Optional)"
pipeline = get_pipeline1(1,outputCol,inputCol,"filetype","filesize")
processed_df = pipeline.fit(df).transform(df)
trainingData,testData = train_test_split(processed_df)
bins = len(df.select(outputCol).distinct().collect())

In [19]:
#Logistic Regression
print("# Logistic Regression\n")
get_classification_report(logistic_regression(trainingData,testData),"features","label")

# Logistic Regression

MulticlassEvaluator score:  0.9999999999999996
              precision    recall  f1-score   support

         0.0       0.41      1.00      0.59        24
         1.0       0.95      1.00      0.97        18
         2.0       1.00      0.57      0.73         7
         3.0       1.00      0.60      0.75         5
         4.0       1.00      1.00      1.00         4
         5.0       1.00      1.00      1.00         1
         6.0       1.00      1.00      1.00         5
         8.0       0.00      0.00      0.00         5
         9.0       1.00      1.00      1.00         4
        10.0       0.00      0.00      0.00         1
        13.0       0.00      0.00      0.00         3
        14.0       1.00      1.00      1.00         2
        15.0       1.00      1.00      1.00         1
        17.0       1.00      1.00      1.00         2
        18.0       0.00      0.00      0.00         1
        19.0       1.00      1.00      1.00         2
        20.

In [20]:
#Random Forest
print("# Random Forest")
get_classification_report(random_forest_classifier(trainingData,testData,"features","label",bins),"features","label")    

# Random Forest
MulticlassEvaluator score:  0.9999999999999996
              precision    recall  f1-score   support

         0.0       0.25      1.00      0.40        24
         1.0       1.00      0.72      0.84        18
         2.0       0.00      0.00      0.00         7
         3.0       0.00      0.00      0.00         5
         4.0       0.00      0.00      0.00         4
         5.0       0.00      0.00      0.00         1
         6.0       0.00      0.00      0.00         5
         8.0       0.00      0.00      0.00         5
         9.0       0.00      0.00      0.00         4
        10.0       0.00      0.00      0.00         1
        13.0       0.00      0.00      0.00         3
        14.0       0.00      0.00      0.00         2
        15.0       0.00      0.00      0.00         1
        17.0       0.00      0.00      0.00         2
        18.0       0.00      0.00      0.00         1
        19.0       0.00      0.00      0.00         2
        20.0      

In [21]:
#Naive Bayes
print("# Naive Bayes")
get_classification_report(naive_bayes(trainingData,testData),"features","label")

# Naive Bayes
MulticlassEvaluator score:  0.9999999999999996
              precision    recall  f1-score   support

         0.0       0.23      0.79      0.35        24
         1.0       0.83      0.28      0.42        18
         2.0       0.00      0.00      0.00         7
         3.0       0.00      0.00      0.00         5
         4.0       0.00      0.00      0.00         4
         5.0       0.00      0.00      0.00         1
         6.0       0.00      0.00      0.00         5
         8.0       0.00      0.00      0.00         5
         9.0       0.00      0.00      0.00         4
        10.0       0.00      0.00      0.00         1
        13.0       0.00      0.00      0.00         3
        14.0       0.00      0.00      0.00         2
        15.0       0.00      0.00      0.00         1
        17.0       0.00      0.00      0.00         2
        18.0       0.00      0.00      0.00         1
        19.0       0.00      0.00      0.00         2
        20.0       0

In [22]:
#OneVsRest
print("# Onevsrest")
get_classification_report(oneRest(trainingData,testData),"features","label")

# Onevsrest
MulticlassEvaluator score:  0.9999999999999996
              precision    recall  f1-score   support

         0.0       0.82      0.75      0.78        24
         1.0       1.00      1.00      1.00        18
         2.0       0.64      1.00      0.78         7
         3.0       1.00      0.60      0.75         5
         4.0       1.00      1.00      1.00         4
         5.0       1.00      1.00      1.00         1
         6.0       1.00      1.00      1.00         5
         8.0       0.00      0.00      0.00         5
         9.0       1.00      1.00      1.00         4
        10.0       0.50      1.00      0.67         1
        13.0       1.00      1.00      1.00         3
        14.0       1.00      1.00      1.00         2
        15.0       1.00      1.00      1.00         1
        17.0       0.67      1.00      0.80         2
        18.0       0.33      1.00      0.50         1
        19.0       1.00      1.00      1.00         2
        20.0       1.0