In [1]:
import sparknlp
spark = sparknlp.start() # for GPU training >> sparknlp.start(gpu = True) # for Spark 2.3 =>> sparknlp.start(spark23 = True)
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
import pandas as pd
from allComponents import Components
from pyspark.sql import SQLContext
from pyspark import SparkContext
from functools import reduce
from pyspark.sql.functions import udf
from pyspark.ml.classification import LogisticRegression,LinearSVC
from pyspark.ml.classification import RandomForestClassifier, OneVsRest
from pyspark.ml.classification import NaiveBayes,MultilayerPerceptronClassifier
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import GBTClassifier
from pyspark.sql.functions import *
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
sc =SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

In [4]:
def create_dataset():
    data = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('../neo_data.csv')
    data1 = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('../PathCategorisation.csv')
    df = data.join(data1, (data['nodes'] == data1['Document Name']))
    oldColumns = ['Document Name','Category 1 (Mandatory)','Category 2 (Optional)','Category 3 (Optional)']
    newColumns = ['DocumentName','Category1(Mandatory)','Category2(Optional)','Category3(Optional)']
    df = reduce(lambda data, idx: data.withColumnRenamed(oldColumns[idx], newColumns[idx]), range(len(oldColumns)), df)
    drop_list = ['Category3(Optional)']
    result = df.select([column for column in df.columns if column not in drop_list])
    #result.show(5)
    return result

df = create_dataset()

In [5]:
df.show(5)

+--------------------+-----------+--------------------+--------------------+---+--------------------+--------------------+--------------------+-------------------+--------------------+
|               nodes|node_labels|          adj_labels|           adj_nodes|_c0|        DocumentName|            Location|Category1(Mandatory)|Category2(Optional)|                path|
+--------------------+-----------+--------------------+--------------------+---+--------------------+--------------------+--------------------+-------------------+--------------------+
|Best in Care Aust...|       FILE|['FOLDER', 'CATEG...|   ['Data', 'Other']|563|Best in Care Aust...|C:\Users\classifi...|               Other|               null|['C:', 'Users', '...|
|Bupa Seaforth - A...|       FILE|['FOLDER', 'CATEG...|   ['Data', 'Other']|564|Bupa Seaforth - A...|C:\Users\classifi...|               Other|               null|['C:', 'Users', '...|
|PCA018_Guiding-Pr...|       FILE|['FOLDER', 'CATEG...|   ['Data', 'Other']

In [6]:
df = df.na.fill("NotSpecified")#Fill empty
df.show(1)

+--------------------+-----------+--------------------+-----------------+---+--------------------+--------------------+--------------------+-------------------+--------------------+
|               nodes|node_labels|          adj_labels|        adj_nodes|_c0|        DocumentName|            Location|Category1(Mandatory)|Category2(Optional)|                path|
+--------------------+-----------+--------------------+-----------------+---+--------------------+--------------------+--------------------+-------------------+--------------------+
|Best in Care Aust...|       FILE|['FOLDER', 'CATEG...|['Data', 'Other']|563|Best in Care Aust...|C:\Users\classifi...|               Other|       NotSpecified|['C:', 'Users', '...|
+--------------------+-----------+--------------------+-----------------+---+--------------------+--------------------+--------------------+-------------------+--------------------+
only showing top 1 row



In [7]:
def train_test_split(dataframe):
    (trainingData, testData) = dataframe.randomSplit([0.8, 0.2], seed = 100)
    return trainingData, testData

In [8]:
def process_train_test_data(trainingData,testData,outputCol):
    @udf("long")
    def num_nonzeros(v):
        return v.numNonzeros()
    testData = testData.where(num_nonzeros(outputCol) != 0)
    trainingData = trainingData.where(num_nonzeros(outputCol) != 0)
    return trainingData,testData

In [9]:
def get_classification_report(dataframe,inputCol,outputCol):
    evaluator = MulticlassClassificationEvaluator(predictionCol=outputCol)
    print("MulticlassEvaluator score: ",evaluator.evaluate(dataframe))
    df = dataframe.select(inputCol,outputCol,"prediction").toPandas()
    print(classification_report(df.label, df.prediction))
    print(accuracy_score(df.label, df.prediction))

In [10]:
def logistic_regression(trainingData,testData):
    lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0)
    return lr.fit(trainingData).transform(testData)

In [11]:
def oneRest(trainingData,testData):
    lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)
    ovr = OneVsRest(classifier=lr)
    return ovr.fit(trainingData).transform(testData)

In [12]:
def random_forest_classifier(trainingData,testData,inputCol,outputCol,bins):
    rf = RandomForestClassifier(labelCol=outputCol, \
                            featuresCol=inputCol, \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = bins)

    # Train model with Training Data
    return rf.fit(trainingData).transform(testData)

In [13]:
def naive_bayes(trainingData,testData):
    nb = NaiveBayes(smoothing=1)
    return nb.fit(trainingData).transform(testData)

In [15]:
def get_pipeline1(choice,outCol,*inputCol):
    c = Components()
    allStages1 = [c.getDocumentAssembler(inputCol[0],"document"),c.getTokenizer("document","tokens"),\
                  c.getNormalizer("tokens","normalized"),\
                c.getFinisher("normalized","finished")]
    allStages2 = [c.getDocumentAssembler(inputCol[2],"document1"),c.getTokenizer("document1","tokens1"),\
                  c.getNormalizer("tokens1","normalized1"),\
                c.getFinisher("normalized1","finished1")]
    allStages = allStages1+allStages2
    if choice==0:#CountVectorizer
        allStages.extend([c.getCountVectorizer("finished","locFeature"),\
                          c.getCountVectorizer("finished1","adjFeature"),\
                          c.getStringIndexer(inputCol[1],"typeFeature"),\
                          c.getVectorAssembler(["locFeature","typeFeature","adjFeature"],"features"),\
                          c.getStringIndexer(outCol,"label")])
        return Pipeline(stages=allStages)
    elif choice==1:#Tf-idf
        allStages.extend([c.getTf("finished","tf"),c.getIdf("tf","locFeature"),\
                          c.getTf("finished1","tf1"),c.getIdf("tf1","adjFeature"),\
                          c.getStringIndexer(inputCol[1],"typeFeature"),\
                          c.getVectorAssembler(["locFeature","typeFeature","adjFeature"],"features"),\
                          c.getStringIndexer(outCol,"label")])
        return Pipeline(stages=allStages)

# For Label 1

Pipeline 1: Using CountVectorizer

In [16]:
inputCol = "nodes"
outputCol = "Category1(Mandatory)"
pipeline = get_pipeline1(0,outputCol,inputCol,"node_labels","adj_nodes")
processed_df = pipeline.fit(df).transform(df)
trainingData,testData = train_test_split(processed_df)
bins = len(df.select(outputCol).distinct().collect())

In [17]:
#Logistic Regression
print("# Logistic Regression\n")
get_classification_report(logistic_regression(trainingData,testData),"features","label")

# Logistic Regression

MulticlassEvaluator score:  1.0
              precision    recall  f1-score   support

         0.0       0.92      1.00      0.96        22
         1.0       1.00      1.00      1.00        18
         2.0       1.00      1.00      1.00         9
         3.0       1.00      1.00      1.00        10
         4.0       1.00      1.00      1.00         9
         5.0       1.00      1.00      1.00         7
         6.0       1.00      1.00      1.00         4
         7.0       1.00      1.00      1.00         3
         8.0       0.67      1.00      0.80         2
         9.0       1.00      1.00      1.00         4
        10.0       1.00      1.00      1.00         3
        11.0       1.00      1.00      1.00         3
        12.0       1.00      1.00      1.00         3
        15.0       1.00      1.00      1.00         2
        16.0       1.00      1.00      1.00         2
        17.0       1.00      1.00      1.00         1
        18.0       1.00   

  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
#Random Forest
print("# Random Forest")
get_classification_report(random_forest_classifier(trainingData,testData,"features","label",bins),"features","label")    


# Random Forest
MulticlassEvaluator score:  1.0
              precision    recall  f1-score   support

         0.0       0.51      1.00      0.68        22
         1.0       1.00      1.00      1.00        18
         2.0       1.00      1.00      1.00         9
         3.0       1.00      1.00      1.00        10
         4.0       1.00      1.00      1.00         9
         5.0       1.00      1.00      1.00         7
         6.0       1.00      0.25      0.40         4
         7.0       1.00      1.00      1.00         3
         8.0       1.00      0.50      0.67         2
         9.0       1.00      1.00      1.00         4
        10.0       1.00      1.00      1.00         3
        11.0       0.00      0.00      0.00         3
        12.0       0.00      0.00      0.00         3
        15.0       0.00      0.00      0.00         2
        16.0       0.00      0.00      0.00         2
        17.0       0.00      0.00      0.00         1
        18.0       0.00      0.00

In [21]:
#Naive Bayes
print("# Naive Bayes")
get_classification_report(naive_bayes(trainingData,testData),"features","label")


# Naive Bayes
MulticlassEvaluator score:  1.0
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        22
         1.0       1.00      1.00      1.00        18
         2.0       0.90      1.00      0.95         9
         3.0       1.00      1.00      1.00        10
         4.0       1.00      1.00      1.00         9
         5.0       1.00      1.00      1.00         7
         6.0       1.00      1.00      1.00         4
         7.0       1.00      1.00      1.00         3
         8.0       0.67      1.00      0.80         2
         9.0       1.00      1.00      1.00         4
        10.0       1.00      1.00      1.00         3
        11.0       1.00      1.00      1.00         3
        12.0       1.00      1.00      1.00         3
        15.0       1.00      1.00      1.00         2
        16.0       1.00      1.00      1.00         2
        17.0       1.00      1.00      1.00         1
        18.0       1.00      1.00  

In [22]:
#OneVsRest
print("# Onevsrest")
get_classification_report(oneRest(trainingData,testData),"features","label")

# Onevsrest
MulticlassEvaluator score:  1.0
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        22
         1.0       1.00      1.00      1.00        18
         2.0       1.00      1.00      1.00         9
         3.0       1.00      1.00      1.00        10
         4.0       1.00      1.00      1.00         9
         5.0       1.00      1.00      1.00         7
         6.0       1.00      1.00      1.00         4
         7.0       1.00      1.00      1.00         3
         8.0       1.00      1.00      1.00         2
         9.0       1.00      1.00      1.00         4
        10.0       1.00      1.00      1.00         3
        11.0       1.00      1.00      1.00         3
        12.0       1.00      1.00      1.00         3
        15.0       1.00      1.00      1.00         2
        16.0       1.00      1.00      1.00         2
        17.0       1.00      1.00      1.00         1
        18.0       1.00      1.00    

Pipeline 2: Using Tf-idf

In [23]:
inputCol = "nodes"
outputCol = "Category1(Mandatory)"
pipeline = get_pipeline1(1,outputCol,inputCol,"node_labels","adj_nodes")
processed_df = pipeline.fit(df).transform(df)
trainingData,testData = train_test_split(processed_df)
bins = len(df.select(outputCol).distinct().collect())

In [24]:
#Logistic Regression
print("# Logistic Regression\n")
get_classification_report(logistic_regression(trainingData,testData),"features","label")

# Logistic Regression

MulticlassEvaluator score:  1.0
              precision    recall  f1-score   support

         0.0       0.92      1.00      0.96        22
         1.0       1.00      1.00      1.00        18
         2.0       1.00      1.00      1.00         9
         3.0       1.00      1.00      1.00        10
         4.0       1.00      1.00      1.00         9
         5.0       1.00      1.00      1.00         7
         6.0       1.00      1.00      1.00         4
         7.0       1.00      1.00      1.00         3
         8.0       0.67      1.00      0.80         2
         9.0       1.00      1.00      1.00         4
        10.0       1.00      1.00      1.00         3
        11.0       1.00      1.00      1.00         3
        12.0       1.00      1.00      1.00         3
        15.0       1.00      1.00      1.00         2
        16.0       1.00      1.00      1.00         2
        17.0       1.00      1.00      1.00         1
        18.0       1.00   

In [25]:
#Random Forest
print("# Random Forest")
get_classification_report(random_forest_classifier(trainingData,testData,"features","label",bins),"features","label")    

# Random Forest
MulticlassEvaluator score:  1.0
              precision    recall  f1-score   support

         0.0       0.20      1.00      0.34        22
         1.0       0.00      0.00      0.00        18
         2.0       0.00      0.00      0.00         9
         3.0       0.00      0.00      0.00        10
         4.0       0.00      0.00      0.00         9
         5.0       0.00      0.00      0.00         7
         6.0       0.00      0.00      0.00         4
         7.0       0.00      0.00      0.00         3
         8.0       0.00      0.00      0.00         2
         9.0       0.00      0.00      0.00         4
        10.0       0.00      0.00      0.00         3
        11.0       0.00      0.00      0.00         3
        12.0       0.00      0.00      0.00         3
        15.0       0.00      0.00      0.00         2
        16.0       0.00      0.00      0.00         2
        17.0       0.00      0.00      0.00         1
        18.0       0.00      0.00

In [26]:
#Naive Bayes
print("# Naive Bayes")
get_classification_report(naive_bayes(trainingData,testData),"features","label")

# Naive Bayes
MulticlassEvaluator score:  1.0
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        22
         1.0       1.00      1.00      1.00        18
         2.0       0.90      1.00      0.95         9
         3.0       1.00      1.00      1.00        10
         4.0       1.00      1.00      1.00         9
         5.0       1.00      1.00      1.00         7
         6.0       1.00      1.00      1.00         4
         7.0       1.00      1.00      1.00         3
         8.0       0.67      1.00      0.80         2
         9.0       1.00      1.00      1.00         4
        10.0       1.00      1.00      1.00         3
        11.0       1.00      1.00      1.00         3
        12.0       1.00      1.00      1.00         3
        15.0       1.00      1.00      1.00         2
        16.0       1.00      1.00      1.00         2
        17.0       1.00      1.00      1.00         1
        18.0       1.00      1.00  

In [27]:
#OneVsRest
print("# Onevsrest")
get_classification_report(oneRest(trainingData,testData),"features","label")

# Onevsrest
MulticlassEvaluator score:  1.0
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        22
         1.0       1.00      1.00      1.00        18
         2.0       1.00      1.00      1.00         9
         3.0       1.00      1.00      1.00        10
         4.0       1.00      1.00      1.00         9
         5.0       1.00      1.00      1.00         7
         6.0       1.00      1.00      1.00         4
         7.0       1.00      1.00      1.00         3
         8.0       1.00      1.00      1.00         2
         9.0       1.00      1.00      1.00         4
        10.0       1.00      1.00      1.00         3
        11.0       1.00      1.00      1.00         3
        12.0       1.00      1.00      1.00         3
        15.0       1.00      1.00      1.00         2
        16.0       1.00      1.00      1.00         2
        17.0       1.00      1.00      1.00         1
        18.0       1.00      1.00    