In [0]:
from pyspark import SparkConf
from pyspark.sql.session import SparkSession
conf = SparkConf()\
        .setMaster("local")\
        .setAppName("My User application")\
        .set("spark.executor.instances","1")\
        .set("spark.executor.cores","1")

from pyspark import SparkContext
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

In [0]:
# Transformations and actions -> clean the data
# ML models --> sparkMl lib

In [0]:
from pyspark.sql import *
import pyspark.sql.functions as f
from pyspark.sql.functions import udf
from pyspark.sql.types import *

In [0]:
# how to read data from txt file
# how to give input to spark
# csv file
df = spark.read.csv("train.csv", inferSchema=True, header=True)
df.show(1)

+-----+------------+--------------------+
|   id|label_actual|                text|
+-----+------------+--------------------+
|34098|           9|Selena Gomez s Sp...|
+-----+------------+--------------------+
only showing top 1 row



In [0]:
from pyspark.sql import Row
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

In [0]:
# how do you query documents from sql: selct * from table_name
df.registerTempTable("data_table")

In [0]:
sqlContext.sql('select * from data_table').show(2)

+------+------------+--------------------+
|    id|label_actual|                text|
+------+------------+--------------------+
| 34098|           9|Selena Gomez s Sp...|
|126472|          11|Mitch McConnell S...|
+------+------------+--------------------+
only showing top 2 rows



# UDF

In [0]:
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize

In [0]:
print("Punctuations :", punctuation)
stop_words = stopwords.words('english')
print("Stopwords :", stop_words[:5])

word = "going"
ps = PorterStemmer()
print("word is = ",word," after stemming = ",ps.stem(word)) # given a word --> root word

sentence = "hello how are you doing ?"
print("sentence = ",sentence," word tokenizer = ",word_tokenize(sentence))

paragraph = "hello how are you doing ? this is amazing . this is cool. "
print("paragraph = ",paragraph," sen tokenizer = ",sent_tokenize(paragraph))

Punctuations : !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
Stopwords : ['i', 'me', 'my', 'myself', 'we']
word is =  going  after stemming =  go
sentence =  hello how are you doing ?  word tokenizer =  ['hello', 'how', 'are', 'you', 'doing', '?']
paragraph =  hello how are you doing ? this is amazing . this is cool.   sen tokenizer =  ['hello how are you doing ?', 'this is amazing .', 'this is cool.']


In [0]:
# def clean(text):
#     # clean
#     return text
# df['text'].apply(lambda x: clean(x))

In [0]:
def clean(text):
    #print(text)
    text = text.lower()
    #print(text)
    text = ''.join([char_ for char_ in text if char_ not in punctuation])
    #print(text)
    text_tokens = word_tokenize(text)
    #print(text_tokens)
    text_tokens_without_stopwords = [word for word in text_tokens if word not in stop_words]
    #print(text_tokens_without_stopwords)
    stem_text_tokens = [ps.stem(word) for word in text_tokens_without_stopwords]
    #print(stem_text_tokens)
    cleaned_text = ' '.join(stem_text_tokens)
    #print(cleaned_text)
    return cleaned_text

In [0]:
# clean("Hello! how is your day going on?")

In [0]:
# ls = ["word1", "word2", "word3"]
# '_'.join(ls)

# sentence = "i am doing good"
# ''.join([char for char in sentence if char not in punctuation])


In [0]:
udf_clean_function = udf(clean, StringType())
df = df.withColumn("cleaned_text",udf_clean_function("text"))

In [0]:
df.show(2)

+------+------------+--------------------+--------------------+
|    id|label_actual|                text|        cleaned_text|
+------+------------+--------------------+--------------------+
| 34098|           9|Selena Gomez s Sp...|selena gomez spri...|
|126472|          11|Mitch McConnell S...|mitch mcconnel sa...|
+------+------------+--------------------+--------------------+
only showing top 2 rows



In [0]:
df = df.withColumn("len", f.size(f.split(f.col('cleaned_text'),' ')))

In [0]:
df.show(2)

+------+------------+--------------------+--------------------+---+
|    id|label_actual|                text|        cleaned_text|len|
+------+------------+--------------------+--------------------+---+
| 34098|           9|Selena Gomez s Sp...|selena gomez spri...| 24|
|126472|          11|Mitch McConnell S...|mitch mcconnel sa...| 17|
+------+------------+--------------------+--------------------+---+
only showing top 2 rows



In [0]:
# len_histogram = df.select('len').rdd.flatMap(lambda x: x).histogram(10)
# 1-->90
# 2-->180

# KMeans

In [0]:
df.cache().count()

240498

In [0]:
# KMeans using sckit
# step1: CountVect, TFIDF
# step2: KMeans

In [0]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml import Pipeline

In [0]:
# from sklearn.feature_extraction.text import CountVectorizer, TFIdfVectorizer
# from nltk.corpus import stopwords
# pipeline: Text --> StopWordsRemover -->  Tokenizer --> HashingTF --> IDF --> KMeans

In [0]:
# cleaning and creating features
tokenizer = Tokenizer(inputCol="cleaned_text", outputCol="tokens")
remover = StopWordsRemover(inputCol="tokens", outputCol="stopWordsRemovedTokens")
hashingTF = HashingTF(inputCol="stopWordsRemovedTokens", outputCol="rawFeatures", numFeatures=20)
idf = IDF(inputCol="rawFeatures", outputCol="features",minDocFreq=5)

In [0]:
pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf])

In [0]:
pipeline_model = pipeline.fit(df)

In [0]:
dataset = pipeline_model.transform(df)
dataset.show(5)

+------+------------+--------------------+--------------------+---+--------------------+----------------------+--------------------+--------------------+
|    id|label_actual|                text|        cleaned_text|len|              tokens|stopWordsRemovedTokens|         rawFeatures|            features|
+------+------------+--------------------+--------------------+---+--------------------+----------------------+--------------------+--------------------+
| 34098|           9|Selena Gomez s Sp...|selena gomez spri...| 24|[selena, gomez, s...|  [selena, gomez, s...|(20,[0,1,2,5,6,7,...|(20,[0,1,2,5,6,7,...|
|126472|          11|Mitch McConnell S...|mitch mcconnel sa...| 17|[mitch, mcconnel,...|  [mitch, mcconnel,...|(20,[0,5,7,8,10,1...|(20,[0,5,7,8,10,1...|
|167309|           5|this fast chargin...|fast charg sync m...| 73|[fast, charg, syn...|  [fast, charg, syn...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|
|214589|           0|exclusive sanjay ...|exclus sanjay dut...| 24|[exclus, 

In [0]:
(traingData, testData) = dataset.randomSplit([0.6,0.4], seed=42)

In [0]:
print("Number of documents in traing dataset :",traingData.count())
print("Number of documents in testing dataset:",testData.count())

Number of documents in traing dataset : 144336
Number of documents in testing dataset: 96162


In [0]:
# training ok Kmeans
kmeans = KMeans(k=17)
kmeans_model = kmeans.fit(traingData)

In [0]:
test_predictions = kmeans_model.transform(testData)

In [0]:
test_predictions.count()

96162

In [0]:
test_predictions.show(2)

+---+------------+--------------------+--------------------+---+--------------------+----------------------+--------------------+--------------------+----------+
| id|label_actual|                text|        cleaned_text|len|              tokens|stopWordsRemovedTokens|         rawFeatures|            features|prediction|
+---+------------+--------------------+--------------------+---+--------------------+----------------------+--------------------+--------------------+----------+
|  0|           9|Blake Lively s We...|blake live wed dr...| 16|[blake, live, wed...|  [blake, live, wed...|(20,[0,1,3,6,8,12...|(20,[0,1,3,6,8,12...|         5|
|  1|           9|Prince William Re...|princ william res...| 21|[princ, william, ...|  [princ, william, ...|(20,[0,1,3,4,5,8,...|(20,[0,1,3,4,5,8,...|        11|
+---+------------+--------------------+--------------------+---+--------------------+----------------------+--------------------+--------------------+----------+
only showing top 2 rows



In [0]:
evaluator = ClusteringEvaluator()
predictions = evaluator.evaluate(test_predictions)

In [0]:
predictions

0.10362893193580504

In [0]:
class_to_id = {'Arts, Culture & Entertainment': 0,
                'Business and Finance': 1,
                            'Crime': 2,
                            'Education': 3,
                            'Environment': 4,
                            'Family & Parenting': 6,
                            'Food & drink': 7,
                            'Health & Fitness': 8,
                            'Home & Living / Lifestyle & Beauty': 9,
                            'Media': 10,
                            'Politics': 11,
                            'Religion': 12,
                            'Society': 13,
                            'Sports': 14,
                            'Travel & Leisure': 15,
                            'Science & Technology': 5,
                            'Automotive':16}

# LR

In [0]:
from pyspark.ml.classification import LogisticRegression

In [0]:
dataset.show(5)

+------+------------+--------------------+--------------------+---+--------------------+----------------------+--------------------+--------------------+
|    id|label_actual|                text|        cleaned_text|len|              tokens|stopWordsRemovedTokens|         rawFeatures|            features|
+------+------------+--------------------+--------------------+---+--------------------+----------------------+--------------------+--------------------+
| 34098|           9|Selena Gomez s Sp...|selena gomez spri...| 24|[selena, gomez, s...|  [selena, gomez, s...|(20,[0,1,2,5,6,7,...|(20,[0,1,2,5,6,7,...|
|126472|          11|Mitch McConnell S...|mitch mcconnel sa...| 17|[mitch, mcconnel,...|  [mitch, mcconnel,...|(20,[0,5,7,8,10,1...|(20,[0,5,7,8,10,1...|
|167309|           5|this fast chargin...|fast charg sync m...| 73|[fast, charg, syn...|  [fast, charg, syn...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|
|214589|           0|exclusive sanjay ...|exclus sanjay dut...| 24|[exclus, 

In [0]:
lr = LogisticRegression(featuresCol='features', labelCol='label_actual', maxIter=10)
lr_model = lr.fit(traingData)

In [0]:
lr_predictions = lr_model.transform(testData)

In [0]:
lr_predictions.show(5)

+---+------------+--------------------+--------------------+---+--------------------+----------------------+--------------------+--------------------+--------------------+--------------------+----------+
| id|label_actual|                text|        cleaned_text|len|              tokens|stopWordsRemovedTokens|         rawFeatures|            features|       rawPrediction|         probability|prediction|
+---+------------+--------------------+--------------------+---+--------------------+----------------------+--------------------+--------------------+--------------------+--------------------+----------+
|  0|           9|Blake Lively s We...|blake live wed dr...| 16|[blake, live, wed...|  [blake, live, wed...|(20,[0,1,3,6,8,12...|(20,[0,1,3,6,8,12...|[1.25944757240275...|[0.11748610002663...|      11.0|
|  1|           9|Prince William Re...|princ william res...| 21|[princ, william, ...|  [princ, william, ...|(20,[0,1,3,4,5,8,...|(20,[0,1,3,4,5,8,...|[1.75664994375787...|[0.2368726078

# Random Forest

In [0]:
from pyspark.ml.classification import RandomForestClassifier

In [0]:
rf = RandomForestClassifier(featuresCol='features', labelCol='label_actual', numTrees=100, maxDepth=4, maxBins=32)

In [0]:
rf_model = rf.fit(traingData)

In [0]:
predictions = rf_model.transform(testData)

In [0]:
predictions.show(5)

+---+------------+--------------------+--------------------+---+--------------------+----------------------+--------------------+--------------------+--------------------+--------------------+----------+
| id|label_actual|                text|        cleaned_text|len|              tokens|stopWordsRemovedTokens|         rawFeatures|            features|       rawPrediction|         probability|prediction|
+---+------------+--------------------+--------------------+---+--------------------+----------------------+--------------------+--------------------+--------------------+--------------------+----------+
|  0|           9|Blake Lively s We...|blake live wed dr...| 16|[blake, live, wed...|  [blake, live, wed...|(20,[0,1,3,6,8,12...|(20,[0,1,3,6,8,12...|[10.4255363326564...|[0.10425536332656...|      11.0|
|  1|           9|Prince William Re...|princ william res...| 21|[princ, william, ...|  [princ, william, ...|(20,[0,1,3,4,5,8,...|(20,[0,1,3,4,5,8,...|[10.9461463724336...|[0.1094614637