In [8]:
import os

import pandas as pd
import pyspark
from pyspark.ml.classification import LogisticRegression, MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.feature import RegexTokenizer, CountVectorizer, StringIndexer
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.tuning import TrainValidationSplit
from pyspark.sql import SparkSession

In [3]:
# Obtain the Spark session for this notebook; getOrCreate() reuses an
# already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("text_classification")
    .getOrCreate()
)

In [5]:
# Location of the sentiment CSV. Defaults to the original author's local path;
# set the SENTIMENTS_CSV environment variable to point at the file when running
# the notebook on another machine, without editing this cell.
sentiments_csv = os.environ.get(
    "SENTIMENTS_CSV",
    r"D:\subject\nlp\code\natural_language_processing\data\sentiments.csv",
)

# The first row of the file is the header. No schema is supplied and
# inferSchema is not enabled, so the columns are loaded as strings.
df = spark.read.option("header", "true").csv(sentiments_csv)
df.show()

+--------------------+---------+
|                text|sentiment|
+--------------------+---------+
|Kickers on my wat...|        1|
|user: AAP MOVIE. ...|        1|
|user I'd be afrai...|        1|
|   MNTA Over 12.00  |        1|
|    OI  Over 21.37  |        1|
|   PGNX  Over 3.04  |        1|
|AAP - user if so ...|       -1|
|Monday's relative...|       -1|
|GOOG - ower trend...|        1|
|AAP will watch to...|        1|
|i'm assuming FCX ...|        1|
|It really worries...|        1|
|AAP GAMCO's arry ...|        1|
|user Maykiljil po...|        1|
|Momentum is comin...|        1|
|HA Hitting 35.65 ...|        1|
|user gameplan sho...|        1|
|with FCX gapping ...|        1|
|user great list a...|        1|
|ATHX upper trend ...|        1|
+--------------------+---------+
only showing top 20 rows



In [6]:
# Discard rows containing a null in any column (default how="any") so the
# downstream indexer/tokenizer never see missing text or sentiment values.
df = df.dropna()


In [9]:
# Feature-engineering stages:
#   - StringIndexer maps the string `sentiment` column to a numeric `label`
#     (default ordering is by descending frequency, so the most common
#     sentiment value receives label 0.0).
#   - RegexTokenizer splits `text` into lower-cased tokens in `words`.
#   - CountVectorizer turns `words` into sparse term-count vectors in `features`.
index = StringIndexer(inputCol="sentiment", outputCol="label")
tokenizer = RegexTokenizer(inputCol="text", outputCol="words")
vectorizer = CountVectorizer(inputCol="words", outputCol="features")

pipeline = Pipeline(stages=[index, tokenizer, vectorizer])

# Fit and transform on the raw input columns only. `df` is overwritten below
# with the transformed frame, so without this projection a second run of the
# cell would fail because the `label`/`words`/`features` output columns
# already exist. On the first run the projection is a no-op.
raw_df = df.select("text", "sentiment")
model = pipeline.fit(raw_df)

# Keep the name `df` for the transformed frame so later cells keep working.
df = model.transform(raw_df)
df.show()


+--------------------+---------+-----+--------------------+--------------------+
|                text|sentiment|label|               words|            features|
+--------------------+---------+-----+--------------------+--------------------+
|Kickers on my wat...|        1|  0.0|[kickers, on, my,...|(15883,[3,36,67,7...|
|user: AAP MOVIE. ...|        1|  0.0|[user:, aap, movi...|(15883,[0,6,7,20,...|
|user I'd be afrai...|        1|  0.0|[user, i'd, be, a...|(15883,[1,2,4,8,1...|
|   MNTA Over 12.00  |        1|  0.0| [mnta, over, 12.00]|(15883,[21,1944,1...|
|    OI  Over 21.37  |        1|  0.0|   [oi, over, 21.37]|(15883,[21,492,14...|
|   PGNX  Over 3.04  |        1|  0.0|  [pgnx, over, 3.04]|(15883,[21,6257,1...|
|AAP - user if so ...|       -1|  1.0|[aap, -, user, if...|(15883,[0,2,4,7,1...|
|Monday's relative...|       -1|  1.0|[monday's, relati...|(15883,[340,430,1...|
|GOOG - ower trend...|        1|  0.0|[goog, -, ower, t...|(15883,[10,31,41,...|
|AAP will watch to...|      

# Classification with logistic regression