In [0]:
from pyspark.sql import SparkSession
# Obtain the notebook's Spark session (reuses an existing one if present),
# registered under the application name 'CLONG'.
spark = (
    SparkSession.builder
    .appName('CLONG')
    .getOrCreate()
)

In [0]:
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, DecisionTreeClassifier, NaiveBayes, LogisticRegression
from pyspark.ml.feature import VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer, StandardScaler, Tokenizer, RegexTokenizer, StopWordsRemover, NGram, HashingTF, IDF, Tokenizer, CountVectorizer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator, RegressionEvaluator
from pyspark.sql.functions import year, month, dayofmonth, col, udf, length
from pyspark.sql.types import IntegerType
from pyspark.ml.clustering import KMeans
from pyspark.ml.recommendation import ALS

In [0]:
# Load the `smsspamcollection_1` table and give its two positional columns
# readable names: `_c0` -> `class`, `_c1` -> `text`.
#
# The query goes through the `spark` session built at the top of this notebook
# instead of the `sqlContext` global, which is never defined here and only
# exists when the hosting platform injects it.
df = (
    spark.sql("SELECT * FROM smsspamcollection_1")
    .withColumnRenamed('_c0', 'class')
    .withColumnRenamed('_c1', 'text')
)

# Quick summary statistics as a sanity check on the load.
df.describe().show()

In [0]:
# Derive a `length` column from the `text` column; it is later fed to the
# model alongside the TF-IDF vector.
dfb = df.withColumn('length', length(col('text')))

# Peek at the first row to confirm the new column is present.
dfb.head(1)

In [0]:
# Per-class mean of the numeric columns (here: `length`).
(
    dfb
    .groupBy('class')
    .mean()
    .show()
)

In [0]:
# Feature-engineering stages. Each stage reads the column produced by the
# previous one; they are wired into a Pipeline further down.

# class (string) -> label (numeric index)
classtonumeric = StringIndexer(
    inputCol='class',
    outputCol='label',
)

# text -> token_text
tkzr = Tokenizer(
    inputCol='text',
    outputCol='token_text',
)

# token_text -> stop_tokens (stop words removed)
stopr = StopWordsRemover(
    inputCol='token_text',
    outputCol='stop_tokens',
)

# stop_tokens -> cvec (term-count vector)
countVec = CountVectorizer(
    inputCol='stop_tokens',
    outputCol='cvec',
)

# cvec -> tfIdf (counts re-weighted by inverse document frequency)
idf = IDF(
    inputCol='cvec',
    outputCol='tfIdf',
)

In [0]:
# Concatenate the TF-IDF vector and the `length` column into the single
# `features` vector column that the classifiers consume.
cleaner = VectorAssembler(
    inputCols=['tfIdf', 'length'],
    outputCol='features',
)

In [0]:
# Chain the preprocessing stages in dependency order:
# label indexing -> tokenise -> drop stop words -> counts -> TF-IDF -> assemble.
dfpipe = Pipeline(
    stages=[
        classtonumeric,
        tkzr,
        stopr,
        countVec,
        idf,
        cleaner,
    ]
)

In [0]:
# Fit every estimator stage of the preprocessing pipeline on `dfb`.
# NOTE(review): this fit sees all rows, and the train/test split only happens
# later on the transformed frame, so the vocabulary and IDF weights are learned
# from rows that end up in the test set as well.
model = dfpipe.fit(dataset=dfb)

In [0]:
# Apply the fitted preprocessing pipeline, adding `label`, the intermediate
# token/vector columns and the final `features` column to `dfb`.
dfc = model.transform(dataset=dfb)

In [0]:
# Keep only the two columns the classifiers need.
fdf = dfc.select(['label', 'features'])

# Show the first rows of the modelling frame.
fdf.show()

In [0]:
# 70/30 train/test split. A fixed seed is passed so that re-running the
# notebook asks for the same sampling each time; without it Spark picks a new
# random seed on every call and the reported scores drift between runs.
train, test = fdf.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Candidate classifiers, all left at their default hyper-parameters and
# default column names (`features` / `label`).
# The tree ensembles get an explicit seed so that any internal sampling they
# perform is repeatable from run to run.
nb = NaiveBayes()
rfc = RandomForestClassifier(seed=42)
gbt = GBTClassifier(seed=42)
dtc = DecisionTreeClassifier()
lgr = LogisticRegression()

In [0]:
# Train each candidate on the training split, in the same order as they were
# declared; the fitted models are bound to the matching `*mod` names.
nbmod, rfcmod, gbtmod, dtcmod, lgrmod = [
    estimator.fit(train)
    for estimator in (nb, rfc, gbt, dtc, lgr)
]

In [0]:
# Run every fitted model over the held-out split; each result frame is the
# test data plus that model's prediction columns.
nbres, rfcres, gbtres, dtcres, lgrres = [
    fitted.transform(test)
    for fitted in (nbmod, rfcmod, gbtmod, dtcmod, lgrmod)
]

In [0]:
# Accuracy evaluator. The metric is named explicitly because
# MulticlassClassificationEvaluator defaults to F1, which would make the
# numbers printed from `accEval` something other than accuracy.
accEval = MulticlassClassificationEvaluator(metricName='accuracy')

In [0]:
# Print one score per model, in order:
# NaiveBayes, RandomForest, GBT, DecisionTree, LogisticRegression.
for predictions in (nbres, rfcres, gbtres, dtcres, lgrres):
    print(accEval.evaluate(predictions))
