In [13]:
import sparknlp
spark = sparknlp.start() # for GPU training >> sparknlp.start(gpu = True) # for Spark 2.3 =>> sparknlp.start(spark23 = True)
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
import pandas as pd
from allComponents import Components
from pyspark.sql import SQLContext
from pyspark import SparkContext
from functools import reduce
from pyspark.sql.functions import udf
from pyspark.ml.classification import LogisticRegression,LinearSVC
from pyspark.ml.classification import RandomForestClassifier, OneVsRest
from pyspark.ml.classification import NaiveBayes,MultilayerPerceptronClassifier
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import GBTClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
sc =SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

In [14]:
def create_dataset():
    data = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('../dataset.csv')
    data1 = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('../Document Categorisation.csv')
    df = data.join(data1, (data['filename'] == data1['Document Name']))
    oldColumns = ['filename','filepath','filetype','filesize','filetext','translatedtext','Document Name','Location','Category 1 (Mandatory)','Category 2 (Optional)','Category 3 (Optional)']
    newColumns = ['filename','filepath','filetype','filesize','filetext','translatedtext','DocumentName','Location','Category1(Mandatory)','Category2(Optional)','Category3(Optional)']
    df = reduce(lambda data, idx: data.withColumnRenamed(oldColumns[idx], newColumns[idx]), range(len(oldColumns)), df)
    drop_list = ['filename', 'filepath','filetext','Category3(Optional)']
    result = df.select([column for column in df.columns if column not in drop_list])
    #result.show(5)
    return result

df = create_dataset()

In [15]:
df = df.na.fill("NotSpecified")#Fill empty
df.show(1)

+--------+--------+--------------------+--------------------+--------------------+--------------------+-------------------+
|filetype|filesize|      translatedtext|        DocumentName|            Location|Category1(Mandatory)|Category2(Optional)|
+--------+--------+--------------------+--------------------+--------------------+--------------------+-------------------+
|     pdf|  110537|Sector performanc...|Sector performanc...|C:\Users\classifi...|               Other|       NotSpecified|
+--------+--------+--------------------+--------------------+--------------------+--------------------+-------------------+
only showing top 1 row



In [16]:
def get_pipeline1(choice,inputCol,outCol):
    c = Components()
    allStages = [c.getDocumentAssembler(inputCol,"document"),c.getTokenizer("document","tokens"), c.getNormalizer("tokens","normalized"),\
                        c.getStopWordCleaner("normalized","cleaned"), c.getStemmer("cleaned","stemmed"),\
                        c.getFinisher("stemmed","finished")]
    if choice==0:#CountVectorizer
        allStages.extend([c.getCountVectorizer("finished","features"),c.getStringIndexer(outCol,"label")])
        return Pipeline(stages=allStages)
    elif choice==1:#Tf-idf
        allStages.extend([c.getTf("finished","tf"),c.getIdf("tf","features"),c.getStringIndexer(outCol,"label")])
        return Pipeline(stages=allStages)

In [17]:
def get_pipeline2(choice,inputCol,outCol):
    c = Components()
    allStages = [c.getDocumentAssembler(inputCol,"document"),c.getTokenizer("document","tokens"), 
                 c.getNormalizer("tokens","normalized"),c.getStopWordCleaner("normalized","cleaned"), 
                 c.getStemmer("cleaned","stemmed")]
    if choice==0:#Glove Embeddings
        allStages.extend([c.getGloveEmbeddings(["document","stemmed"],"embeddings"),\
                          c.getEmbeddingSentence(["document", "embeddings"],"sentence_embeddings"),\
                          c.getEmbeddingFinisher("sentence_embeddings","finished_sentence_embeddings"),\
                          c.getExplodeVectors("finished_sentence_embeddings","features"),\
                          c.getStringIndexer(outCol,"label")])
        print(allStages)
        return Pipeline(stages=allStages)
    elif choice==1:#BERT Embeddings
        allStages.extend([c.getBERTEmbeddings(["document","stemmed"],"embeddings"),\
                          c.getEmbeddingSentence(["document", "embeddings"],"sentence_embeddings"),\
                          c.getEmbeddingFinisher("sentence_embeddings","finished_sentence_embeddings"),\
                          c.getExplodeVectors("finished_sentence_embeddings","features"),\
                          c.getStringIndexer(outCol,"label")])
        return Pipeline(stages=allStages)    
    elif choice==2:#ELMO Embeddings
        allStages.extend([c.getELMOEmbeddings(["document","stemmed"],"embeddings"),\
                          c.getEmbeddingSentence(["document", "embeddings"],"sentence_embeddings"),\
                          c.getEmbeddingFinisher("sentence_embeddings","finished_sentence_embeddings"),\
                          c.getExplodeVectors("finished_sentence_embeddings","features"),\
                          c.getStringIndexer(outCol,"label")])
        return Pipeline(stages=allStages)
    elif choice==3:#USE Embeddings
        allStages = [c.getDocumentAssembler(inputCol,"document"),\
                         c.getUSEEmbeddings("document","embeddings"),\
                          c.getEmbeddingFinisher("embeddings","finished_sentence_embeddings"),\
                          c.getExplodeVectors("finished_sentence_embeddings","features"),\
                          c.getStringIndexer(outCol,"label")]
        return Pipeline(stages=allStages)

In [18]:
def train_test_split(dataframe):
    (trainingData, testData) = dataframe.randomSplit([0.8, 0.2], seed = 100)
    return trainingData, testData

In [19]:
def process_train_test_data(trainingData,testData,outputCol):
    @udf("long")
    def num_nonzeros(v):
        return v.numNonzeros()
    testData = testData.where(num_nonzeros(outputCol) != 0)
    trainingData = trainingData.where(num_nonzeros(outputCol) != 0)
    return trainingData,testData

In [20]:
def get_classification_report(dataframe,inputCol,outputCol):
    evaluator = MulticlassClassificationEvaluator(predictionCol=outputCol)
    print("MulticlassEvaluator score: ",evaluator.evaluate(dataframe))
    df = dataframe.select(inputCol,outputCol,"prediction").toPandas()
    print(classification_report(df.label, df.prediction))
    print(accuracy_score(df.label, df.prediction))

In [21]:
def logistic_regression(trainingData,testData):
    lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0)
    return lr.fit(trainingData).transform(testData)

In [22]:
def decision_tree(trainingData,testData,inputCol,outputCol,bins):
    dt = DecisionTreeClassifier(featuresCol = inputCol, labelCol = outputCol, maxDepth = 3,maxBins = bins)
    return dt.fit(trainingData).transform(testData)

In [23]:
def oneRest(trainingData,testData):
    lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)
    ovr = OneVsRest(classifier=lr)
    return ovr.fit(trainingData).transform(testData)

In [24]:
def random_forest_classifier(trainingData,testData,inputCol,outputCol,bins):
    rf = RandomForestClassifier(labelCol=outputCol, \
                            featuresCol=inputCol, \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = bins)

    # Train model with Training Data
    return rf.fit(trainingData).transform(testData)

In [25]:
def naive_bayes(trainingData,testData):
    nb = NaiveBayes(smoothing=1)
    return nb.fit(trainingData).transform(testData)



# For Label 1


Pipeline 1. Using CountVectorizer

In [26]:
inputCol = "translatedtext"
outputCol = "Category1(Mandatory)"
pipeline = get_pipeline1(0,inputCol,outputCol)
processed_df = pipeline.fit(df).transform(df)
trainingData,testData = train_test_split(processed_df)
bins = len(df.select(outputCol).distinct().collect())

In [28]:
#Logistic Regression
print("# Logistic Regression\n")
get_classification_report(logistic_regression(trainingData,testData),"features","label")

# Logistic Regression

MulticlassEvaluator score:  1.0
              precision    recall  f1-score   support

         0.0       0.48      1.00      0.65        22
         1.0       1.00      0.79      0.88        14
         2.0       0.50      0.43      0.46         7
         3.0       1.00      0.64      0.78        11
         4.0       0.86      0.86      0.86         7
         5.0       0.86      0.86      0.86         7
         6.0       0.50      0.12      0.20         8
         7.0       1.00      1.00      1.00         1
         8.0       1.00      1.00      1.00         4
         9.0       1.00      1.00      1.00         6
        10.0       1.00      0.33      0.50         3
        11.0       1.00      1.00      1.00         5
        12.0       1.00      0.40      0.57         5
        13.0       0.00      0.00      0.00         3
        14.0       1.00      0.50      0.67         2
        16.0       1.00      1.00      1.00         2
        20.0       1.00   

  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
#Decision Tree
print("# Decision Tree")
get_classification_report(decision_tree(trainingData,testData,"features","label",bins),"features","label")

# Decision Tree
MulticlassEvaluator score:  1.0
              precision    recall  f1-score   support

         0.0       0.29      1.00      0.44        22
         1.0       1.00      0.57      0.73        14
         2.0       0.00      0.00      0.00         7
         3.0       0.00      0.00      0.00        11
         4.0       0.86      0.86      0.86         7
         5.0       0.78      1.00      0.88         7
         6.0       0.00      0.00      0.00         8
         7.0       1.00      1.00      1.00         1
         8.0       0.00      0.00      0.00         4
         9.0       0.00      0.00      0.00         6
        10.0       0.00      0.00      0.00         3
        11.0       0.00      0.00      0.00         5
        12.0       0.00      0.00      0.00         5
        13.0       0.00      0.00      0.00         3
        14.0       0.00      0.00      0.00         2
        16.0       0.00      0.00      0.00         2
        20.0       0.00      0.00

In [31]:
#Random Forest
print("# Random Forest")
get_classification_report(random_forest_classifier(trainingData,testData,"features","label",bins),"features","label")    

# Random Forest
MulticlassEvaluator score:  1.0
              precision    recall  f1-score   support

         0.0       0.35      1.00      0.52        22
         1.0       0.92      0.79      0.85        14
         2.0       0.10      0.14      0.12         7
         3.0       1.00      0.45      0.62        11
         4.0       1.00      1.00      1.00         7
         5.0       0.00      0.00      0.00         7
         6.0       0.00      0.00      0.00         8
         7.0       1.00      1.00      1.00         1
         8.0       0.80      1.00      0.89         4
         9.0       0.00      0.00      0.00         6
        10.0       0.00      0.00      0.00         3
        11.0       1.00      1.00      1.00         5
        12.0       0.00      0.00      0.00         5
        13.0       0.00      0.00      0.00         3
        14.0       1.00      0.50      0.67         2
        16.0       0.00      0.00      0.00         2
        20.0       0.00      0.00

In [32]:
#Naive Bayes
print("# Naive Bayes")
get_classification_report(naive_bayes(trainingData,testData),"features","label")
    

# Naive Bayes
MulticlassEvaluator score:  1.0
              precision    recall  f1-score   support

         0.0       0.83      0.86      0.84        22
         1.0       1.00      0.86      0.92        14
         2.0       0.42      0.71      0.53         7
         3.0       0.62      0.45      0.53        11
         4.0       0.64      1.00      0.78         7
         5.0       0.71      0.71      0.71         7
         6.0       0.33      0.25      0.29         8
         7.0       0.50      1.00      0.67         1
         8.0       0.57      1.00      0.73         4
         9.0       0.86      1.00      0.92         6
        10.0       0.50      0.33      0.40         3
        11.0       1.00      1.00      1.00         5
        12.0       0.67      0.40      0.50         5
        13.0       0.00      0.00      0.00         3
        14.0       1.00      0.50      0.67         2
        16.0       1.00      0.50      0.67         2
        18.0       0.00      0.00  

  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
#OneVsRest
print("# Onevsrest")
get_classification_report(oneRest(trainingData,testData),"features","label")

# Onevsrest
MulticlassEvaluator score:  1.0
              precision    recall  f1-score   support

         0.0       0.63      1.00      0.77        22
         1.0       1.00      0.71      0.83        14
         2.0       0.80      0.57      0.67         7
         3.0       0.89      0.73      0.80        11
         4.0       0.75      0.86      0.80         7
         5.0       0.64      1.00      0.78         7
         6.0       0.33      0.12      0.18         8
         7.0       1.00      1.00      1.00         1
         8.0       0.80      1.00      0.89         4
         9.0       1.00      1.00      1.00         6
        10.0       1.00      0.33      0.50         3
        11.0       1.00      1.00      1.00         5
        12.0       0.67      0.80      0.73         5
        13.0       0.00      0.00      0.00         3
        14.0       1.00      0.50      0.67         2
        16.0       1.00      1.00      1.00         2
        20.0       1.00      1.00    

Pipeline 2: Using TF-IDF 

In [34]:
inputCol = "translatedtext"
outputCol = "Category1(Mandatory)"
pipeline = get_pipeline1(0,inputCol,outputCol)
processed_df = pipeline.fit(df).transform(df)
trainingData,testData = train_test_split(processed_df)
bins = len(df.select(outputCol).distinct().collect())

In [35]:
#Logistic Regression
print("# Logistic Regression\n")
get_classification_report(logistic_regression(trainingData,testData),"features","label")

# Logistic Regression

MulticlassEvaluator score:  1.0
              precision    recall  f1-score   support

         0.0       0.48      1.00      0.65        22
         1.0       1.00      0.79      0.88        14
         2.0       0.50      0.43      0.46         7
         3.0       1.00      0.64      0.78        11
         4.0       0.86      0.86      0.86         7
         5.0       0.86      0.86      0.86         7
         6.0       0.50      0.12      0.20         8
         7.0       1.00      1.00      1.00         1
         8.0       1.00      1.00      1.00         4
         9.0       1.00      1.00      1.00         6
        10.0       1.00      0.33      0.50         3
        11.0       1.00      1.00      1.00         5
        12.0       1.00      0.40      0.57         5
        13.0       0.00      0.00      0.00         3
        14.0       1.00      0.50      0.67         2
        16.0       1.00      1.00      1.00         2
        20.0       1.00   

In [36]:
#Random Forest
print("# Random Forest")
get_classification_report(random_forest_classifier(trainingData,testData,"features","label",bins),"features","label")    

# Random Forest
MulticlassEvaluator score:  1.0
              precision    recall  f1-score   support

         0.0       0.35      1.00      0.52        22
         1.0       0.83      0.71      0.77        14
         2.0       0.17      0.14      0.15         7
         3.0       1.00      0.45      0.62        11
         4.0       1.00      0.71      0.83         7
         5.0       1.00      0.57      0.73         7
         6.0       1.00      0.12      0.22         8
         7.0       1.00      1.00      1.00         1
         8.0       0.67      1.00      0.80         4
         9.0       1.00      0.17      0.29         6
        10.0       0.00      0.00      0.00         3
        11.0       1.00      1.00      1.00         5
        12.0       0.00      0.00      0.00         5
        13.0       0.00      0.00      0.00         3
        14.0       1.00      0.50      0.67         2
        16.0       0.00      0.00      0.00         2
        20.0       0.00      0.00

In [37]:
#Naive Bayes
print("# Naive Bayes")
get_classification_report(naive_bayes(trainingData,testData),"features","label")

# Naive Bayes
MulticlassEvaluator score:  1.0
              precision    recall  f1-score   support

         0.0       0.83      0.86      0.84        22
         1.0       1.00      0.86      0.92        14
         2.0       0.42      0.71      0.53         7
         3.0       0.62      0.45      0.53        11
         4.0       0.64      1.00      0.78         7
         5.0       0.71      0.71      0.71         7
         6.0       0.33      0.25      0.29         8
         7.0       0.50      1.00      0.67         1
         8.0       0.57      1.00      0.73         4
         9.0       0.86      1.00      0.92         6
        10.0       0.50      0.33      0.40         3
        11.0       1.00      1.00      1.00         5
        12.0       0.67      0.40      0.50         5
        13.0       0.00      0.00      0.00         3
        14.0       1.00      0.50      0.67         2
        16.0       1.00      0.50      0.67         2
        18.0       0.00      0.00  

In [38]:
#OneVsRest
print("# Onevsrest")
get_classification_report(oneRest(trainingData,testData),"features","label")

# Onevsrest
MulticlassEvaluator score:  1.0
              precision    recall  f1-score   support

         0.0       0.63      1.00      0.77        22
         1.0       1.00      0.71      0.83        14
         2.0       0.80      0.57      0.67         7
         3.0       0.89      0.73      0.80        11
         4.0       0.75      0.86      0.80         7
         5.0       0.64      1.00      0.78         7
         6.0       0.33      0.12      0.18         8
         7.0       1.00      1.00      1.00         1
         8.0       0.80      1.00      0.89         4
         9.0       1.00      1.00      1.00         6
        10.0       1.00      0.33      0.50         3
        11.0       1.00      1.00      1.00         5
        12.0       0.67      0.80      0.73         5
        13.0       0.00      0.00      0.00         3
        14.0       1.00      0.50      0.67         2
        16.0       1.00      1.00      1.00         2
        20.0       1.00      1.00    

Pipeline 3: Using Glove Embeddings

In [39]:
inputCol = "translatedtext"
outputCol = "Category1(Mandatory)"
pipeline = get_pipeline2(0,inputCol,outputCol)
processed_df = pipeline.fit(df).transform(df)
trainingData,testData = train_test_split(processed_df)
trainingData,testData = process_train_test_data(trainingData,testData,"features")
bins = len(df.select(outputCol).distinct().collect())

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]
[DocumentAssembler_654a27c1896a, Tokenizer_f381b3e0f9c7, Normalizer_0b4fed457aca, StopWordsCleaner_ab41a111d871, Stemmer_f6ba936ced1e, WORD_EMBEDDINGS_MODEL_48cffc8b9a76, SentenceEmbeddings_7b2f3747b873, EmbeddingsFinisher_fb5ba8138f59, SQLTransformer_7f3e1b63cd34, StringIndexer_e5e656271516]


In [40]:
#Logistic Regression
print("# Logistic Regression\n")
get_classification_report(logistic_regression(trainingData,testData),"features","label")

# Logistic Regression

MulticlassEvaluator score:  0.9999999999999998
              precision    recall  f1-score   support

         0.0       0.31      0.88      0.45        17
         1.0       0.48      0.83      0.61        12
         2.0       0.50      0.62      0.56         8
         3.0       0.38      0.75      0.50         4
         4.0       1.00      0.62      0.77         8
         5.0       0.86      0.75      0.80         8
         6.0       0.00      0.00      0.00         6
         7.0       0.00      0.00      0.00         4
         8.0       1.00      0.33      0.50         3
         9.0       0.00      0.00      0.00         4
        10.0       0.00      0.00      0.00         4
        11.0       1.00      0.17      0.29         6
        12.0       0.00      0.00      0.00         1
        13.0       0.00      0.00      0.00         2
        14.0       0.00      0.00      0.00         2
        15.0       0.00      0.00      0.00         1
        16.

In [41]:
#Random Forest
print("# Random Forest")
get_classification_report(random_forest_classifier(trainingData,testData,"features","label",bins),"features","label")    

# Random Forest
MulticlassEvaluator score:  0.9999999999999998
              precision    recall  f1-score   support

         0.0       0.35      0.94      0.51        17
         1.0       0.43      0.83      0.57        12
         2.0       0.36      0.62      0.45         8
         3.0       0.75      0.75      0.75         4
         4.0       1.00      0.62      0.77         8
         5.0       1.00      0.62      0.77         8
         6.0       0.00      0.00      0.00         6
         7.0       1.00      0.25      0.40         4
         8.0       0.00      0.00      0.00         3
         9.0       0.00      0.00      0.00         4
        10.0       0.00      0.00      0.00         4
        11.0       1.00      0.67      0.80         6
        12.0       0.00      0.00      0.00         1
        13.0       0.00      0.00      0.00         2
        14.0       0.00      0.00      0.00         2
        15.0       0.00      0.00      0.00         1
        16.0      

In [43]:
#OneVsRest
print("# Onevsrest")
get_classification_report(oneRest(trainingData,testData),"features","label")

# Onevsrest
MulticlassEvaluator score:  0.9999999999999998
              precision    recall  f1-score   support

         0.0       0.65      0.88      0.75        17
         1.0       0.65      0.92      0.76        12
         2.0       0.60      0.38      0.46         8
         3.0       0.43      0.75      0.55         4
         4.0       0.83      0.62      0.71         8
         5.0       0.83      0.62      0.71         8
         6.0       0.00      0.00      0.00         6
         7.0       1.00      0.75      0.86         4
         8.0       0.33      0.67      0.44         3
         9.0       0.67      1.00      0.80         4
        10.0       0.33      0.50      0.40         4
        11.0       1.00      0.67      0.80         6
        12.0       1.00      1.00      1.00         1
        13.0       0.00      0.00      0.00         2
        14.0       1.00      0.50      0.67         2
        15.0       0.00      0.00      0.00         1
        16.0       1.0

Pipeline 4: Using BERT Embeddings

In [44]:
inputCol = "translatedtext"
outputCol = "Category1(Mandatory)"
pipeline = get_pipeline2(1,inputCol,outputCol)
processed_df = pipeline.fit(df).transform(df)
trainingData,testData = train_test_split(processed_df)
trainingData,testData = process_train_test_data(trainingData,testData,"features")
bins = len(df.select(outputCol).distinct().collect())

bert_base_cased download started this may take some time.
Approximate size to download 389.2 MB
[OK!]


In [45]:
#Logistic Regression
print("# Logistic Regression\n")
get_classification_report(logistic_regression(trainingData,testData),"features","label")

# Logistic Regression

MulticlassEvaluator score:  1.0
              precision    recall  f1-score   support

         0.0       0.66      0.95      0.78        22
         1.0       0.70      0.88      0.78        16
         2.0       0.35      0.88      0.50         8
         3.0       0.54      0.78      0.64         9
         4.0       0.43      0.38      0.40         8
         5.0       0.75      0.43      0.55         7
         6.0       1.00      0.17      0.29         6
         7.0       1.00      0.50      0.67         2
         8.0       0.50      0.33      0.40         3
         9.0       1.00      1.00      1.00         2
        10.0       0.50      0.33      0.40         3
        11.0       1.00      0.67      0.80         3
        12.0       0.00      0.00      0.00         3
        13.0       0.00      0.00      0.00         3
        14.0       0.00      0.00      0.00         2
        15.0       0.00      0.00      0.00         2
        17.0       0.00   

In [46]:
#Random Forest
print("# Random Forest")
get_classification_report(random_forest_classifier(trainingData,testData,"features","label",bins),"features","label")    

# Random Forest
MulticlassEvaluator score:  1.0
              precision    recall  f1-score   support

         0.0       0.33      1.00      0.50        22
         1.0       0.75      0.75      0.75        16
         2.0       0.12      0.25      0.16         8
         3.0       1.00      0.56      0.71         9
         4.0       0.50      0.25      0.33         8
         5.0       0.00      0.00      0.00         7
         6.0       0.00      0.00      0.00         6
         7.0       0.00      0.00      0.00         2
         8.0       0.00      0.00      0.00         3
         9.0       0.00      0.00      0.00         2
        10.0       0.00      0.00      0.00         3
        11.0       0.00      0.00      0.00         3
        12.0       0.00      0.00      0.00         3
        13.0       0.00      0.00      0.00         3
        14.0       0.00      0.00      0.00         2
        15.0       0.00      0.00      0.00         2
        17.0       0.00      0.00

In [48]:
#OneVsRest
print("# Onevsrest")
get_classification_report(oneRest(trainingData,testData),"features","label")

# Onevsrest
MulticlassEvaluator score:  1.0
              precision    recall  f1-score   support

         0.0       0.66      0.86      0.75        22
         1.0       0.81      0.81      0.81        16
         2.0       0.37      0.88      0.52         8
         3.0       0.67      0.89      0.76         9
         4.0       0.60      0.38      0.46         8
         5.0       0.83      0.71      0.77         7
         6.0       0.40      0.33      0.36         6
         7.0       1.00      1.00      1.00         2
         8.0       0.50      0.33      0.40         3
         9.0       1.00      1.00      1.00         2
        10.0       0.50      0.33      0.40         3
        11.0       1.00      0.67      0.80         3
        12.0       0.00      0.00      0.00         3
        13.0       0.00      0.00      0.00         3
        14.0       0.00      0.00      0.00         2
        15.0       0.00      0.00      0.00         2
        17.0       0.00      0.00    

Pipeline 5: Using ELMO Embeddings

In [50]:
inputCol = "translatedtext"
outputCol = "Category1(Mandatory)"
pipeline = get_pipeline2(2,inputCol,outputCol)
processed_df = pipeline.fit(df).transform(df)
trainingData,testData = train_test_split(processed_df)
trainingData,testData = process_train_test_data(trainingData,testData,"features")
bins = len(df.select(outputCol).distinct().collect())

elmo download started this may take some time.
Approximate size to download 334.1 MB
[OK!]


In [51]:
#Logistic Regression
print("# Logistic Regression\n")
get_classification_report(logistic_regression(trainingData,testData),"features","label")

# Logistic Regression

MulticlassEvaluator score:  1.0000000000000002
              precision    recall  f1-score   support

         0.0       0.81      0.91      0.86        23
         1.0       0.72      0.87      0.79        15
         2.0       0.29      0.82      0.43        11
         3.0       0.83      0.71      0.77         7
         4.0       1.00      0.78      0.88         9
         5.0       0.50      0.75      0.60         4
         6.0       0.00      0.00      0.00         4
         7.0       1.00      0.33      0.50         3
         8.0       0.00      0.00      0.00         2
         9.0       0.60      0.75      0.67         4
        10.0       1.00      0.25      0.40         4
        11.0       1.00      1.00      1.00         5
        12.0       0.00      0.00      0.00         3
        13.0       0.00      0.00      0.00         4
        14.0       0.00      0.00      0.00         1
        15.0       0.00      0.00      0.00         2
        16.

In [52]:
#Random Forest
print("# Random Forest")
get_classification_report(random_forest_classifier(trainingData,testData,"features","label",bins),"features","label")    

# Random Forest
MulticlassEvaluator score:  1.0000000000000002
              precision    recall  f1-score   support

         0.0       0.59      1.00      0.74        23
         1.0       0.59      0.87      0.70        15
         2.0       0.25      0.55      0.34        11
         3.0       0.44      0.57      0.50         7
         4.0       1.00      0.67      0.80         9
         5.0       1.00      0.50      0.67         4
         6.0       0.00      0.00      0.00         4
         7.0       1.00      0.33      0.50         3
         8.0       0.00      0.00      0.00         2
         9.0       0.00      0.00      0.00         4
        10.0       0.00      0.00      0.00         4
        11.0       1.00      1.00      1.00         5
        12.0       0.00      0.00      0.00         3
        13.0       0.00      0.00      0.00         4
        14.0       0.00      0.00      0.00         1
        15.0       0.00      0.00      0.00         2
        16.0      

In [53]:
#OneVsRest
print("# Onevsrest")
get_classification_report(oneRest(trainingData,testData),"features","label")

# Onevsrest
MulticlassEvaluator score:  1.0000000000000002
              precision    recall  f1-score   support

         0.0       0.95      0.91      0.93        23
         1.0       0.78      0.93      0.85        15
         2.0       0.62      0.73      0.67        11
         3.0       0.67      0.57      0.62         7
         4.0       0.89      0.89      0.89         9
         5.0       0.75      0.75      0.75         4
         6.0       0.17      0.25      0.20         4
         7.0       1.00      0.33      0.50         3
         8.0       0.50      1.00      0.67         2
         9.0       1.00      1.00      1.00         4
        10.0       0.33      0.25      0.29         4
        11.0       1.00      1.00      1.00         5
        12.0       0.67      0.67      0.67         3
        13.0       0.00      0.00      0.00         4
        14.0       0.50      1.00      0.67         1
        15.0       1.00      1.00      1.00         2
        16.0       1.0

Pipeline 5: Using USE Embeddings

In [54]:
inputCol = "translatedtext"
outputCol = "Category1(Mandatory)"
pipeline = get_pipeline2(2,inputCol,outputCol)
processed_df = pipeline.fit(df).transform(df)
trainingData,testData = train_test_split(processed_df)
trainingData,testData = process_train_test_data(trainingData,testData,"features")
bins = len(df.select(outputCol).distinct().collect())

elmo download started this may take some time.
Approximate size to download 334.1 MB
[OK!]


In [55]:
#Logistic Regression
print("# Logistic Regression\n")
get_classification_report(logistic_regression(trainingData,testData),"features","label")

# Logistic Regression

MulticlassEvaluator score:  1.0000000000000002
              precision    recall  f1-score   support

         0.0       0.81      0.91      0.86        23
         1.0       0.72      0.87      0.79        15
         2.0       0.29      0.82      0.43        11
         3.0       0.83      0.71      0.77         7
         4.0       1.00      0.78      0.88         9
         5.0       0.50      0.75      0.60         4
         6.0       0.00      0.00      0.00         4
         7.0       1.00      0.33      0.50         3
         8.0       0.00      0.00      0.00         2
         9.0       0.60      0.75      0.67         4
        10.0       1.00      0.25      0.40         4
        11.0       1.00      1.00      1.00         5
        12.0       0.00      0.00      0.00         3
        13.0       0.00      0.00      0.00         4
        14.0       0.00      0.00      0.00         1
        15.0       0.00      0.00      0.00         2
        16.

In [56]:
#Random Forest
print("# Random Forest")
get_classification_report(random_forest_classifier(trainingData,testData,"features","label",bins),"features","label")    

# Random Forest
MulticlassEvaluator score:  1.0000000000000002
              precision    recall  f1-score   support

         0.0       0.59      1.00      0.74        23
         1.0       0.59      0.87      0.70        15
         2.0       0.25      0.55      0.34        11
         3.0       0.44      0.57      0.50         7
         4.0       1.00      0.67      0.80         9
         5.0       1.00      0.50      0.67         4
         6.0       0.00      0.00      0.00         4
         7.0       1.00      0.33      0.50         3
         8.0       0.00      0.00      0.00         2
         9.0       0.00      0.00      0.00         4
        10.0       0.00      0.00      0.00         4
        11.0       1.00      1.00      1.00         5
        12.0       0.00      0.00      0.00         3
        13.0       0.00      0.00      0.00         4
        14.0       0.00      0.00      0.00         1
        15.0       0.00      0.00      0.00         2
        16.0      

In [57]:
#OneVsRest
print("# Onevsrest")
get_classification_report(oneRest(trainingData,testData),"features","label")

# Onevsrest
MulticlassEvaluator score:  1.0000000000000002
              precision    recall  f1-score   support

         0.0       0.95      0.91      0.93        23
         1.0       0.78      0.93      0.85        15
         2.0       0.62      0.73      0.67        11
         3.0       0.67      0.57      0.62         7
         4.0       0.89      0.89      0.89         9
         5.0       0.75      0.75      0.75         4
         6.0       0.17      0.25      0.20         4
         7.0       1.00      0.33      0.50         3
         8.0       0.50      1.00      0.67         2
         9.0       1.00      1.00      1.00         4
        10.0       0.33      0.25      0.29         4
        11.0       1.00      1.00      1.00         5
        12.0       0.67      0.67      0.67         3
        13.0       0.00      0.00      0.00         4
        14.0       0.50      1.00      0.67         1
        15.0       1.00      1.00      1.00         2
        16.0       1.0

In [None]:
#Logistic Regression
print("# Logistic Regression\n")
get_classification_report(logistic_regression(trainingData,testData),"features","label")
#Random Forest
print("# Random Forest")
get_classification_report(random_forest_classifier(trainingData,testData,"features","label",bins),"features","label")    

#Naive Bayes
print("# Naive Bayes")
get_classification_report(naive_bayes(trainingData,testData),"features","label")
#OneVsRest
print("# Onevsrest")
get_classification_report(oneRest(trainingData,testData),"features","label")