In [0]:
!wget --no-check-certificate 'https://s3.amazonaws.com/fast-ai-nlp/amazon_review_polarity_csv.tgz' -O './amazon_review_polarity_csv.tgz'

In [0]:
!tar -xzvf './amazon_review_polarity_csv.tgz'

In [0]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd

from pyspark.ml import *
from pyspark.ml.classification import *
from pyspark.ml.feature import *
from pyspark.ml.param import *
from pyspark.ml.tuning import *
from pyspark.ml.evaluation import *


In [0]:
def transform(n):
  """Return ``n`` decreased by one.

  Used by the loading cells to shift the ``label`` column down by one.
  """
  shifted = n - 1
  return shifted

In [0]:
# Read the headerless training CSV (columns: label, headline, review), shift the
# label column down by one with `transform`, and hand only the review text and
# label to Spark.
# `transform` is applied to the whole column at once, which avoids both a full
# copy of the frame and a Python-level call per row.
raw_train_df = spark.createDataFrame(
    pd.read_csv('amazon_review_polarity_csv/train.csv', header=None,
                names=('label', 'headline', 'review'))
      .assign(label=lambda pdf: transform(pdf['label']))
      [['review', 'label']]
)

In [0]:
# Read the headerless test CSV (columns: label, headline, review), shift the
# label column down by one with `transform`, and hand only the review text and
# label to Spark.
# `transform` is applied to the whole column at once, which avoids both a full
# copy of the frame and a Python-level call per row.
raw_test_df = spark.createDataFrame(
    pd.read_csv('amazon_review_polarity_csv/test.csv', header=None,
                names=('label', 'headline', 'review'))
      .assign(label=lambda pdf: transform(pdf['label']))
      [['review', 'label']]
)

In [0]:
# Tokenize the review text with a regex tokenizer using the pattern \W
# (non-word characters); the tokens land in the "words" column.
# The pattern is a raw string so the backslash reaches the regex engine as-is
# rather than relying on Python's deprecated handling of unknown escapes.
tokenizer = RegexTokenizer(inputCol="review", outputCol="words", pattern=r"\W")
tokenized_train_df = tokenizer.transform(raw_train_df)

In [0]:
# Filter stop words out of the "words" column; the result is written to "filtered".
remover = StopWordsRemover(
    outputCol="filtered",
    inputCol="words",
)
removed_train_df = remover.transform(tokenized_train_df)

In [0]:
# Hash the filtered tokens into term-frequency vectors ("rawFeatures").
hashingTF = HashingTF(
    outputCol="rawFeatures",
    inputCol="filtered",
)
TF_train_df = hashingTF.transform(removed_train_df)

In [0]:
# Fit IDF weights on the training term frequencies, then rescale them into
# TF-IDF vectors stored in the "features" column.
idf = IDF(
    outputCol="features",
    inputCol="rawFeatures",
)
idfModel = idf.fit(TF_train_df)
idf_train_df = idfModel.transform(TF_train_df)

In [0]:
# Train a linear SVM on the TF-IDF training features, capped at 25 iterations.
lsvc = LinearSVC(
    maxIter=25,
    labelCol="label",
    featuresCol="features",
)
svmModel = lsvc.fit(idf_train_df)

In [0]:
# Run the test reviews through the same feature pipeline used for training.
tokenized_test_df = tokenizer.transform(raw_test_df)
removed_test_df = remover.transform(tokenized_test_df)
TF_test_df = hashingTF.transform(removed_test_df)
# Reuse the IDF model fitted on the training set: the SVM was trained on
# features scaled by those weights, and fitting a separate IDF on the test set
# would both scale the test features differently and leak test-set statistics
# into the evaluation.
idf_test_df = idfModel.transform(TF_test_df)

In [0]:
# Score the test-set feature vectors with the trained SVM model.
pred_df = svmModel.transform(dataset=idf_test_df)

In [0]:
# Compare the "prediction" column with the "label" column and report accuracy.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="label",
)
accuracy = evaluator.evaluate(pred_df)
print(f"Test set accuracy = {accuracy!s}")