In [53]:
import sparknlp
spark = sparknlp.start() # for GPU training >> sparknlp.start(gpu = True) # for Spark 2.3 =>> sparknlp.start(spark23 = True)
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
import pandas as pd
from allComponents import Components
from pyspark.sql import SQLContext
from pyspark import SparkContext
from functools import reduce
from pyspark.sql.functions import udf
from pyspark.ml.classification import LogisticRegression,LinearSVC
from pyspark.ml.classification import RandomForestClassifier, OneVsRest
from pyspark.ml.classification import NaiveBayes,MultilayerPerceptronClassifier
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import GBTClassifier
from pyspark.sql.functions import *
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
sc =SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

In [54]:
import pandas as pd
dataframe = pd.read_csv('../Document Categorisation.csv',engine="python")
dataframe['path'] = [i.split('\\') for i in dataframe['Location']]
dataframe.to_csv('../PathCategorisation.csv')

In [55]:
dataframe

Unnamed: 0,Document Name,Location,Category 1 (Mandatory),Category 2 (Optional),Category 3 (Optional),path
0,Complaints policy.docx,C:\Users\classifier-admin\Desktop\Data\ACQSC (...,Policy,,,"[C:, Users, classifier-admin, Desktop, Data, A..."
1,abbeyhouseagedcare2610-6.pdf,C:\Users\classifier-admin\Desktop\Data\AACQA (...,Audit,Abbey House Aged Care,,"[C:, Users, classifier-admin, Desktop, Data, A..."
2,aacqa_annual_report_2017-18.pdf,C:\Users\classifier-admin\Desktop\Data\AACQA (...,Annual Report,,,"[C:, Users, classifier-admin, Desktop, Data, A..."
3,aacqa_annual_report_accessibility_17_november_...,C:\Users\classifier-admin\Desktop\Data\AACQA (...,Annual Report,,,"[C:, Users, classifier-admin, Desktop, Data, A..."
4,aacqa_table_a_executive_remuneration.docx,C:\Users\classifier-admin\Desktop\Data\AACQA (...,Annual Report,Executive Remuneration,,"[C:, Users, classifier-admin, Desktop, Data, A..."
...,...,...,...,...,...,...
562,NFP-Principles-and-Guidance-131015.pdf,C:\Users\classifier-admin\Desktop\Data\Handbooks,Handbook,,,"[C:, Users, classifier-admin, Desktop, Data, H..."
563,Best in Care Australia - Actions.pdf,C:\Users\classifier-admin\Desktop\Data,Other,,,"[C:, Users, classifier-admin, Desktop, Data]"
564,Bupa Seaforth - Actions.pdf,C:\Users\classifier-admin\Desktop\Data,Other,,,"[C:, Users, classifier-admin, Desktop, Data]"
565,PCA018_Guiding-Principles-for-PC-Aged-Care_W03...,C:\Users\classifier-admin\Desktop\Data,Other,,,"[C:, Users, classifier-admin, Desktop, Data]"


In [56]:
def create_dataset():
    data = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('../dataset.csv')
    data1 = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('../PathCategorisation.csv')
    df = data.join(data1, (data['filename'] == data1['Document Name']))
    oldColumns = ['filename','filepath','filetype','filesize','filetext','translatedtext','Document Name','Location','Category 1 (Mandatory)','Category 2 (Optional)','Category 3 (Optional)']
    newColumns = ['filename','filepath','filetype','filesize','filetext','translatedtext','DocumentName','Location','Category1(Mandatory)','Category2(Optional)','Category3(Optional)']
    df = reduce(lambda data, idx: data.withColumnRenamed(oldColumns[idx], newColumns[idx]), range(len(oldColumns)), df)
    drop_list = ['filename', 'filepath','filetext','Category3(Optional)']
    result = df.select([column for column in df.columns if column not in drop_list])
    #result.show(5)
    return result

df = create_dataset()

In [57]:
df.show(5)

+--------+--------+--------------------+---+--------------------+--------------------+--------------------+-------------------+--------------------+
|filetype|filesize|      translatedtext|_c0|        DocumentName|            Location|Category1(Mandatory)|Category2(Optional)|                path|
+--------+--------+--------------------+---+--------------------+--------------------+--------------------+-------------------+--------------------+
|     pdf|  110537|Sector performanc...|566|Sector performanc...|C:\Users\classifi...|               Other|               null|['C:', 'Users', '...|
|     pdf|   61331|Bupa Seaforth   T...|564|Bupa Seaforth - A...|C:\Users\classifi...|               Other|               null|['C:', 'Users', '...|
|     pdf|  123920|Best in Care Aust...|563|Best in Care Aust...|C:\Users\classifi...|               Other|               null|['C:', 'Users', '...|
|     pdf|  626923|Principles for Pa...|565|PCA018_Guiding-Pr...|C:\Users\classifi...|               Other

In [58]:
df = df.na.fill("NotSpecified")#Fill empty
df.show(1)

+--------+--------+--------------------+---+--------------------+--------------------+--------------------+-------------------+--------------------+
|filetype|filesize|      translatedtext|_c0|        DocumentName|            Location|Category1(Mandatory)|Category2(Optional)|                path|
+--------+--------+--------------------+---+--------------------+--------------------+--------------------+-------------------+--------------------+
|     pdf|  110537|Sector performanc...|566|Sector performanc...|C:\Users\classifi...|               Other|       NotSpecified|['C:', 'Users', '...|
+--------+--------+--------------------+---+--------------------+--------------------+--------------------+-------------------+--------------------+
only showing top 1 row



In [59]:
def train_test_split(dataframe):
    (trainingData, testData) = dataframe.randomSplit([0.8, 0.2], seed = 100)
    return trainingData, testData

In [60]:
def process_train_test_data(trainingData,testData,outputCol):
    @udf("long")
    def num_nonzeros(v):
        return v.numNonzeros()
    testData = testData.where(num_nonzeros(outputCol) != 0)
    trainingData = trainingData.where(num_nonzeros(outputCol) != 0)
    return trainingData,testData

In [61]:
def get_classification_report(dataframe,inputCol,outputCol):
    print(dataframe.show(1))
    evaluator = MulticlassClassificationEvaluator(predictionCol=outputCol)
    print("MulticlassEvaluator score: ",evaluator.evaluate(dataframe))
    df = dataframe.select(inputCol,outputCol,"prediction").toPandas()
    print(classification_report(df.label, df.prediction))
    print(accuracy_score(df.label, df.prediction))

In [62]:
def logistic_regression(trainingData,testData):
    lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0)
    return lr.fit(trainingData).transform(testData)

In [63]:
import sys
def oneRest(trainingData,testData):
    lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)
    ovr = OneVsRest(classifier=lr)
    model = ovr.fit(trainingData)
    return model
    #model.save("~/featureLabel1.model")
    #return ovr.fit(trainingData).transform(testData)

In [64]:
def random_forest_classifier(trainingData,testData,inputCol,outputCol,bins):
    rf = RandomForestClassifier(labelCol=outputCol, \
                            featuresCol=inputCol, \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = bins)

    # Train model with Training Data
    return rf.fit(trainingData).transform(testData)

In [65]:
def naive_bayes(trainingData,testData):
    nb = NaiveBayes(smoothing=1)
    return nb.fit(trainingData).transform(testData)

In [66]:
def get_pipeline1(choice,outCol,*inputCol):
    c = Components()
    allStages = [c.getDocumentAssembler(inputCol[0],"document"),c.getTokenizer("document","tokens"),\
                  c.getNormalizer("tokens","normalized"),\
                c.getFinisher("normalized","finished")]
    if choice==0:#CountVectorizer
        allStages.extend([c.getCountVectorizer("finished","locFeature"),c.getStringIndexer(inputCol[1],"typeFeature"),\
                          c.getVectorAssembler(["locFeature","typeFeature",inputCol[2]],"features"),\
                          c.getStringIndexer(outCol,"label")])
        return Pipeline(stages=allStages)
    elif choice==1:#Tf-idf
        allStages.extend([c.getTf("finished","tf"),c.getIdf("tf","locFeature"),\
                          c.getStringIndexer(inputCol[1],"typeFeature"),\
                          c.getVectorAssembler(["locFeature","typeFeature",inputCol[2]],"features"),\
                          c.getStringIndexer(outCol,"label")])
        return Pipeline(stages=allStages)

# For Label 1

Pipeline 1: Using CountVectorizer

In [153]:
inputCol = "path"
outputCol = "Category1(Mandatory)"
pipeline = get_pipeline1(0,outputCol,inputCol,"filetype","filesize")
processed_df = pipeline.fit(df).transform(df)
trainingData,testData = train_test_split(processed_df)
bins = len(df.select(outputCol).distinct().collect())

In [154]:
{c.name: c.metadata["ml_attr"]["vals"] for c in processed_df.schema.fields if c.name.endswith("label")}

{'label': ['Audit',
  'Draft Guidance Material',
  'Aged Care Quality Standards',
  'Charter of Aged Care Rights',
  'Dept Health',
  'COVID-19',
  'Handbook',
  'Standards Guidance Reference Group',
  'Annual Report',
  'Regulatory Bulletin',
  'Notice of Collection',
  'Self Assessment',
  'Accreditation',
  'Other',
  'Corporate Plan',
  'Newsletter',
  'Key Changes for Providers',
  'Statements ',
  'Memorandum of Understanding',
  'Policy',
  'Sector Performance',
  'Human Resources',
  'Consumer Experience',
  'Clinical Care Standard',
  'Report',
  'Advanced Care Planning',
  'NOUS Report']}

In [69]:
#Random Forest
print("# Random Forest")
get_classification_report(random_forest_classifier(trainingData,testData,"features","label",bins),"features","label")    


# Random Forest
MulticlassEvaluator score:  1.0
              precision    recall  f1-score   support

         0.0       0.69      1.00      0.81        22
         1.0       1.00      1.00      1.00        14
         2.0       1.00      1.00      1.00         7
         3.0       1.00      1.00      1.00        11
         4.0       1.00      1.00      1.00         7
         5.0       0.33      1.00      0.50         7
         6.0       0.40      0.25      0.31         8
         7.0       1.00      1.00      1.00         1
         8.0       1.00      0.50      0.67         4
         9.0       1.00      1.00      1.00         6
        10.0       1.00      1.00      1.00         3
        11.0       0.00      0.00      0.00         5
        12.0       0.00      0.00      0.00         5
        13.0       0.00      0.00      0.00         3
        14.0       0.00      0.00      0.00         2
        16.0       0.00      0.00      0.00         2
        20.0       0.00      0.00

In [157]:
d

{2.0: 'Storyboard',
 1.0: 'Report',
 0.0: 'Abel Tasman Village',
 20.0: 'NotSpecified',
 4.0: 'Polish',
 12.0: 'Fact sheet',
 10.0: 'Turkish',
 16.0: 'Fact sheet',
 7.0: 'Communique',
 14.0: 'NotSpecified',
 3.0: 'A Little Yarn',
 6.0: 'NotSpecified',
 5.0: 'Storyboard',
 22.0: 'NotSpecified',
 8.0: 'NotSpecified',
 11.0: 'Feedback',
 13.0: 'NotSpecified',
 9.0: 'NotSpecified'}

In [70]:
#Naive Bayes
print("# Naive Bayes")
get_classification_report(naive_bayes(trainingData,testData),"features","label")


# Naive Bayes
MulticlassEvaluator score:  1.0
              precision    recall  f1-score   support

         0.0       0.95      0.95      0.95        22
         1.0       0.92      0.86      0.89        14
         2.0       0.75      0.86      0.80         7
         3.0       1.00      0.73      0.84        11
         4.0       0.44      1.00      0.61         7
         5.0       0.38      0.43      0.40         7
         6.0       0.25      0.12      0.17         8
         7.0       0.25      1.00      0.40         1
         8.0       0.43      0.75      0.55         4
         9.0       1.00      1.00      1.00         6
        10.0       0.43      1.00      0.60         3
        11.0       1.00      0.80      0.89         5
        12.0       0.00      0.00      0.00         5
        13.0       0.00      0.00      0.00         3
        14.0       1.00      0.50      0.67         2
        16.0       1.00      0.50      0.67         2
        20.0       0.00      0.00  

In [21]:
#OneVsRest
print("# Onevsrest")
get_classification_report(oneRest(trainingData,testData),"features","label")

# Onevsrest
MulticlassEvaluator score:  1.0
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        22
         1.0       1.00      1.00      1.00        14
         2.0       1.00      1.00      1.00         7
         3.0       1.00      1.00      1.00        11
         4.0       1.00      1.00      1.00         7
         5.0       0.78      1.00      0.88         7
         6.0       1.00      0.75      0.86         8
         7.0       1.00      1.00      1.00         1
         8.0       1.00      1.00      1.00         4
         9.0       1.00      1.00      1.00         6
        10.0       1.00      1.00      1.00         3
        11.0       1.00      1.00      1.00         5
        12.0       1.00      1.00      1.00         5
        13.0       1.00      1.00      1.00         3
        14.0       1.00      1.00      1.00         2
        16.0       1.00      1.00      1.00         2
        20.0       1.00      1.00    

Pipeline 2: Using Tf-idf

In [72]:
inputCol = "path"
outputCol = "Category1(Mandatory)"
pipeline = get_pipeline1(1,outputCol,inputCol,"filetype","filesize")
processed_df = pipeline.fit(df).transform(df)
trainingData,testData = train_test_split(processed_df)
bins = len(df.select(outputCol).distinct().collect())

In [73]:
#Logistic Regression
print("# Logistic Regression\n")
get_classification_report(logistic_regression(trainingData,testData),"features","label")

# Logistic Regression

MulticlassEvaluator score:  1.0
              precision    recall  f1-score   support

         0.0       0.85      1.00      0.92        22
         1.0       1.00      1.00      1.00        14
         2.0       0.88      1.00      0.93         7
         3.0       1.00      1.00      1.00        11
         4.0       1.00      1.00      1.00         7
         5.0       0.78      1.00      0.88         7
         6.0       1.00      0.62      0.77         8
         7.0       1.00      1.00      1.00         1
         8.0       1.00      1.00      1.00         4
         9.0       1.00      1.00      1.00         6
        10.0       1.00      1.00      1.00         3
        11.0       1.00      1.00      1.00         5
        12.0       1.00      1.00      1.00         5
        13.0       0.00      0.00      0.00         3
        14.0       1.00      1.00      1.00         2
        16.0       1.00      1.00      1.00         2
        20.0       1.00   

In [74]:
#Random Forest
print("# Random Forest")
get_classification_report(random_forest_classifier(trainingData,testData,"features","label",bins),"features","label")    

# Random Forest
MulticlassEvaluator score:  1.0
              precision    recall  f1-score   support

         0.0       0.25      1.00      0.40        22
         1.0       1.00      0.93      0.96        14
         2.0       1.00      1.00      1.00         7
         3.0       0.00      0.00      0.00        11
         4.0       0.00      0.00      0.00         7
         5.0       0.00      0.00      0.00         7
         6.0       0.00      0.00      0.00         8
         7.0       0.00      0.00      0.00         1
         8.0       0.00      0.00      0.00         4
         9.0       0.00      0.00      0.00         6
        10.0       0.00      0.00      0.00         3
        11.0       0.00      0.00      0.00         5
        12.0       0.00      0.00      0.00         5
        13.0       0.00      0.00      0.00         3
        14.0       0.00      0.00      0.00         2
        16.0       0.00      0.00      0.00         2
        20.0       0.00      0.00

In [75]:
#Naive Bayes
print("# Naive Bayes")
get_classification_report(naive_bayes(trainingData,testData),"features","label")

# Naive Bayes
MulticlassEvaluator score:  1.0
              precision    recall  f1-score   support

         0.0       0.80      0.36      0.50        22
         1.0       1.00      0.21      0.35        14
         2.0       0.21      0.43      0.29         7
         3.0       1.00      0.27      0.43        11
         4.0       0.00      0.00      0.00         7
         5.0       0.00      0.00      0.00         7
         6.0       0.10      1.00      0.18         8
         7.0       0.00      0.00      0.00         1
         8.0       0.00      0.00      0.00         4
         9.0       0.00      0.00      0.00         6
        10.0       0.00      0.00      0.00         3
        11.0       0.00      0.00      0.00         5
        12.0       0.00      0.00      0.00         5
        13.0       0.00      0.00      0.00         3
        14.0       0.00      0.00      0.00         2
        16.0       0.00      0.00      0.00         2
        20.0       0.00      0.00  

In [76]:
#OneVsRest
print("# Onevsrest")
get_classification_report(oneRest(trainingData,testData),"features","label")

# Onevsrest
MulticlassEvaluator score:  1.0
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        22
         1.0       1.00      1.00      1.00        14
         2.0       1.00      1.00      1.00         7
         3.0       1.00      1.00      1.00        11
         4.0       1.00      1.00      1.00         7
         5.0       0.78      1.00      0.88         7
         6.0       1.00      0.75      0.86         8
         7.0       1.00      1.00      1.00         1
         8.0       1.00      1.00      1.00         4
         9.0       1.00      1.00      1.00         6
        10.0       1.00      1.00      1.00         3
        11.0       1.00      1.00      1.00         5
        12.0       1.00      1.00      1.00         5
        13.0       1.00      1.00      1.00         3
        14.0       1.00      1.00      1.00         2
        16.0       1.00      1.00      1.00         2
        20.0       1.00      1.00    