In [1]:
# Record which Python interpreter version executed this notebook.
import platform

print(platform.python_version())

3.7.10


In [2]:
import time
import pandas as pd
import numpy as np
import os
from IPython.display import display
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.text import NGramFeaturizer
from nimbusml.feature_extraction.text.extractor import Ngram
from nimbusml.linear_model import AveragedPerceptronBinaryClassifier
from nimbusml.decomposition import PcaTransformer
from nimbusml import Pipeline

In [3]:
# Locate the sample train/test files that ship with the package.
trainDataFile = get_dataset('gen_twittertrain').as_filepath()
testDataFile = get_dataset('gen_twittertest').as_filepath()

# Show only the file names rather than the full paths.
print(f"Train data file path: {os.path.basename(trainDataFile)}")
print(f"Test data file path: {os.path.basename(testDataFile)}")

# Both files are tab-separated.
trainData = pd.read_csv(trainDataFile, sep="\t")
testData = pd.read_csv(testDataFile, sep="\t")

# Preview up to the first 20 training rows.
trainData.head(20)

Train data file path: train-twitter.gen-sample.tsv
Test data file path: test-twitter.gen-sample.tsv


Unnamed: 0,Sentiment,Text,Label
0,Negative,Oh you are hurting me,0
1,Positive,So long,1
2,Positive,Ths sofa is comfortable,1
3,Negative,The place suck. No?,0
4,Positive,@fakeid &quot;Chillin&quot; I love it!!,1
5,Positive,@fakeid I am heading to the party yeah,1
6,Negative,@fakeid why??,0
7,Negative,@fakeid I need assitense,0
8,Negative,@fakeid You deleted my twitter,0
9,Positive,@fakeid Please enjoy your stay,1


In [4]:
# Text featurizer whose word extractor is an Ngram configured with 'TfIdf' weighting.
featurizer = NGramFeaturizer(
    word_feature_extractor=Ngram(weighting='TfIdf'),
)

In [5]:
# Feed the featurizer a one-column DataFrame holding only the "Text" column.
text_frame = trainData[["Text"]]
text_transformed = featurizer.fit_transform(text_frame)

# Report the (rows, columns) of the featurized output, then preview it.
print(text_transformed.shape)
text_transformed.head()

(71, 1007)


Unnamed: 0,Text.Char.<␂>|o|h,Text.Char.o|h|<␠>,Text.Char.h|<␠>|y,Text.Char.<␠>|y|o,Text.Char.y|o|u,Text.Char.o|u|<␠>,Text.Char.u|<␠>|a,Text.Char.<␠>|a|r,Text.Char.a|r|e,Text.Char.r|e|<␠>,...,Text.Word.upset,Text.Word.vocation,Text.Word.flight,Text.Word.late,Text.Word.again,Text.Word.commute,Text.Word.died,Text.Word.cancer,"Text.Word.oh,",Text.Word.finally
0,0.218218,0.218218,0.218218,0.218218,0.218218,0.218218,0.218218,0.218218,0.218218,0.218218,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Boolean mask of the rows whose Sentiment is "Positive"; multiplying by 1
# below turns it into the 0/1 target passed to fit().
is_positive = trainData["Sentiment"] == "Positive"

ag = AveragedPerceptronBinaryClassifier()
ag.fit(text_transformed, 1 * is_positive)

Automatically adding a MinMax normalization transform, use 'norm=Warn' or 'norm=No' to turn this behavior off.
Training calibrator.
Elapsed time: 00:00:00.5012112


AveragedPerceptronBinaryClassifier()

In [7]:
# Start timing before the pipeline is constructed; t0 is read again by the
# evaluation cell that follows.
t0 = time.time()

# The three pipeline stages, in execution order.
text_featurizer = NGramFeaturizer(word_feature_extractor=Ngram(weighting='Tf'))
pca = PcaTransformer(rank=100)
classifier = AveragedPerceptronBinaryClassifier(
    l2_regularization=0.4,
    number_of_iterations=5,
)
ppl = Pipeline([text_featurizer, pca, classifier])

# will replace with series if supported
ppl.fit(trainData["Text"], trainData["Label"])

print(f"Training time: {round(time.time() - t0, 2)}")

Automatically adding a MinMax normalization transform, use 'norm=Warn' or 'norm=No' to turn this behavior off.
Training calibrator.
Elapsed time: 00:00:00.2679332
Training time: 0.32


In [8]:
# Evaluate the fitted pipeline on the test split; with output_scores=True the
# call returns both the metrics and the per-row scores.
# replace with series
metrics, scores = ppl.test(
    testData["Text"],
    testData["Label"],
    output_scores=True,
)

print("Performance metrics: ")
display(metrics)
print("Individual scores: ")

# Attach the original text and its sentiment next to each score row.
scores = scores.assign(
    OriginText=testData["Text"],
    Sentiment=testData["Sentiment"],
)

display(scores.head(5))

# NOTE: t0 was set at the top of the training cell, so this figure also
# includes any wall-clock time that passed between running the two cells.
print(f"Total runtime: {round(time.time() - t0, 2)}")

Performance metrics: 


Unnamed: 0,AUC,Accuracy,Positive precision,Positive recall,Negative precision,Negative recall,Log-loss,Log-loss reduction,Test-set entropy (prior Log-Loss/instance),F1 Score,AUPRC
0,0.580762,0.662791,0,0,0.662791,1,1.284392,-0.392864,0.922123,0,0.480434


Individual scores: 


Unnamed: 0,PredictedLabel,Score,Probability,OriginText,Sentiment
0,0,-0.2333,0.255333,@faketwitterid I am sad,Negative
1,0,-0.270544,0.106594,@wakeup_you It is a very simple twit I created,Negative
2,0,-0.245938,0.193317,@anotherfakeid I would love to see the latest ...,Positive
3,0,-0.269664,0.108991,Oh my ladygaga! I haven't played tennis for 2 ...,Negative
4,0,-0.150562,0.781553,I am heading on a road trip and taking a few d...,Positive


Total runtime: 6.13
