# NLP with SPark

In [1]:
import findspark
findspark.init('/opt/spark/')

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName('nlp_spark_moynihanl').getOrCreate()

22/04/08 11:34:21 WARN Utils: Your hostname, GPUServer resolves to a loopback address: 127.0.1.1; using 10.4.10.8 instead (on interface enp2s0)
22/04/08 11:34:21 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/04/08 11:34:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/08 11:34:22 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/04/08 11:34:22 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/04/08 11:34:22 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
22/04/08 11:34:22 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
22/04/08 11:34:22 W

## 1 . Tokenizer

In [4]:
#corpus as input and trying to take a line at a time and dividing into words

In [5]:
import pandas as pd
corpus = pd.read_pickle('corpus.pkl')

In [6]:
corpus

Unnamed: 0,transcript,full_name
ali,ladies and gentlemen please welcome to the sta...,Ali Wong
anthony,thank you thank you thank you san francisco th...,Anthony Jeselnik
bill,all right thank you thank you very much thank...,Bill Burr
bo,bo what old macdonald had a farm e i e i o and...,Bo Burnham
dave,this is dave he tells dirty jokes for a living...,Dave Chappelle
hasan,whats up davis whats up im home i had to bri...,Hasan Minhaj
jim,ladies and gentlemen please welcome to the ...,Jim Jefferies
joe,ladies and gentlemen welcome joe rogan wha...,Joe Rogan
john,all right petunia wish me luck out there you w...,John Mulaney
louis,intro fade the music out lets roll hold there ...,Louis C.K.


In [7]:
corpus.reset_index(inplace=True)
corpus

Unnamed: 0,index,transcript,full_name
0,ali,ladies and gentlemen please welcome to the sta...,Ali Wong
1,anthony,thank you thank you thank you san francisco th...,Anthony Jeselnik
2,bill,all right thank you thank you very much thank...,Bill Burr
3,bo,bo what old macdonald had a farm e i e i o and...,Bo Burnham
4,dave,this is dave he tells dirty jokes for a living...,Dave Chappelle
5,hasan,whats up davis whats up im home i had to bri...,Hasan Minhaj
6,jim,ladies and gentlemen please welcome to the ...,Jim Jefferies
7,joe,ladies and gentlemen welcome joe rogan wha...,Joe Rogan
8,john,all right petunia wish me luck out there you w...,John Mulaney
9,louis,intro fade the music out lets roll hold there ...,Louis C.K.


In [8]:
corpus = spark.createDataFrame(corpus)

In [10]:
corpus.show(truncate = 50)

+-------+--------------------------------------------------+----------------+
|  index|                                        transcript|       full_name|
+-------+--------------------------------------------------+----------------+
|    ali|ladies and gentlemen please welcome to the stag...|        Ali Wong|
|anthony|thank you thank you thank you san francisco tha...|Anthony Jeselnik|
|   bill| all right thank you thank you very much thank ...|       Bill Burr|
|     bo|bo what old macdonald had a farm e i e i o and ...|      Bo Burnham|
|   dave|this is dave he tells dirty jokes for a living ...|  Dave Chappelle|
|  hasan|  whats up davis whats up im home i had to brin...|    Hasan Minhaj|
|    jim|   ladies and gentlemen please welcome to the s...|   Jim Jefferies|
|    joe|   ladies and gentlemen welcome joe rogan  what...|       Joe Rogan|
|   john|all right petunia wish me luck out there you wi...|    John Mulaney|
|  louis|intro fade the music out lets roll hold there l...|    

In [12]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType

In [14]:
tokenizer = Tokenizer(inputCol = 'transcript', outputCol = 'words')

regexTokenizer = RegexTokenizer(inputCol = 'transcript', outputCol = 'words',
                               pattern = '\\W')

In [15]:
countTokens = F.udf(lambda words: len(words), IntegerType())

In [18]:
tokenized = tokenizer.transform(corpus)

tokenized.select(['transcript', 'words']).withColumn('tokens', countTokens(tokenized['words'])).show()

+--------------------+--------------------+------+
|          transcript|               words|tokens|
+--------------------+--------------------+------+
|ladies and gentle...|[ladies, and, gen...|  7369|
|thank you thank y...|[thank, you, than...|  6720|
| all right thank ...|[, all, right, th...| 12180|
|bo what old macdo...|[bo, what, old, m...|  6929|
|this is dave he t...|[this, is, dave, ...|  9166|
|  whats up davis ...|[, , whats, up, d...| 10413|
|   ladies and gen...|[, , , ladies, an...| 11016|
|   ladies and gen...|[, , , ladies, an...|  9923|
|all right petunia...|[all, right, petu...|  9302|
|intro fade the mu...|[intro, fade, the...|  7487|
|wow hey thank you...|[wow, hey, thank,...| 11454|
|hello hello how y...|[hello, hello, ho...| 10529|
+--------------------+--------------------+------+





In [20]:
regexTokenized = regexTokenizer.transform(corpus)

regexTokenized.select(['transcript', 'words']).withColumn('tokens', countTokens(regexTokenized['words'])).show()

+--------------------+--------------------+------+
|          transcript|               words|tokens|
+--------------------+--------------------+------+
|ladies and gentle...|[ladies, and, gen...|  7339|
|thank you thank y...|[thank, you, than...|  6675|
| all right thank ...|[all, right, than...| 12040|
|bo what old macdo...|[bo, what, old, m...|  6615|
|this is dave he t...|[this, is, dave, ...|  9010|
|  whats up davis ...|[whats, up, davis...| 10267|
|   ladies and gen...|[ladies, and, gen...| 10855|
|   ladies and gen...|[ladies, and, gen...|  9826|
|all right petunia...|[all, right, petu...|  9240|
|intro fade the mu...|[intro, fade, the...|  7455|
|wow hey thank you...|[wow, hey, thank,...| 11400|
|hello hello how y...|[hello, hello, ho...| 10539|
+--------------------+--------------------+------+



## Stop Words Removal

In [27]:
import pickle

cv_no_stop = pickle.load(open('cv_no_stop.pkl', 'rb'))

stop_words = list(cv_no_stop.get_stop_words())

In [28]:
stop_words

['couldnt',
 'others',
 'whereafter',
 'keep',
 'had',
 'formerly',
 'thick',
 'were',
 'somehow',
 'often',
 'per',
 'enough',
 'thereby',
 'see',
 'first',
 'of',
 'thus',
 'although',
 'above',
 'last',
 'do',
 'whereas',
 'hence',
 'amongst',
 'whatever',
 'well',
 'four',
 'whoever',
 'else',
 'thru',
 'nowhere',
 'same',
 'detail',
 'other',
 'its',
 'whether',
 'dont',
 'side',
 'which',
 'i',
 'latterly',
 'anyone',
 'amoungst',
 'by',
 'beyond',
 'co',
 'via',
 'behind',
 'and',
 'hereupon',
 'toward',
 'ltd',
 'became',
 'perhaps',
 'think',
 'oh',
 'more',
 'afterwards',
 'we',
 'seemed',
 'still',
 'move',
 'off',
 'go',
 'seem',
 'after',
 'until',
 'somewhere',
 'describe',
 'with',
 'eight',
 'sometimes',
 'own',
 'one',
 'time',
 'whereby',
 'throughout',
 'wherein',
 'just',
 'yours',
 'cry',
 'may',
 'at',
 'will',
 'upon',
 'fifteen',
 'towards',
 'hasnt',
 'it',
 'not',
 'nor',
 'latter',
 'she',
 'no',
 'forty',
 'less',
 'hundred',
 'a',
 'can',
 'noone',
 'again'

In [29]:
from pyspark.ml.feature import StopWordsRemover

In [30]:
remover = StopWordsRemover(inputCol = 'words',outputCol = 'no_stop_words',
                          stopWords = stop_words
                          )

In [31]:
tokenized_no_stop = remover.transform(regexTokenized)

In [32]:
tokenized_no_stop.show()

+-------+--------------------+----------------+--------------------+--------------------+
|  index|          transcript|       full_name|               words|       no_stop_words|
+-------+--------------------+----------------+--------------------+--------------------+
|    ali|ladies and gentle...|        Ali Wong|[ladies, and, gen...|[ladies, gentleme...|
|anthony|thank you thank y...|Anthony Jeselnik|[thank, you, than...|[thank, thank, th...|
|   bill| all right thank ...|       Bill Burr|[all, right, than...|[thank, thank, th...|
|     bo|bo what old macdo...|      Bo Burnham|[bo, what, old, m...|[bo, old, macdona...|
|   dave|this is dave he t...|  Dave Chappelle|[this, is, dave, ...|[dave, tells, dir...|
|  hasan|  whats up davis ...|    Hasan Minhaj|[whats, up, davis...|[whats, davis, wh...|
|    jim|   ladies and gen...|   Jim Jefferies|[ladies, and, gen...|[ladies, gentleme...|
|    joe|   ladies and gen...|       Joe Rogan|[ladies, and, gen...|[ladies, gentleme...|
|   john|a

# N-grams

In [33]:
from pyspark.ml.feature import NGram

In [40]:
ngram = NGram(n = 2, inputCol='no_stop_words',outputCol='bi-grams')

ngramDF = ngram.transform(tokenized_no_stop)
ngramDF.select('bi-grams').show(truncate=60)

+------------------------------------------------------------+
|                                                    bi-grams|
+------------------------------------------------------------+
|[ladies gentlemen, gentlemen welcome, welcome stage, stag...|
|[thank thank, thank thank, thank san, san francisco, fran...|
|[thank thank, thank thank, thank thank, thank thank, than...|
|[bo old, old macdonald, macdonald farm, farm e, e e, e o,...|
|[dave tells, tells dirty, dirty jokes, jokes living, livi...|
|[whats davis, davis whats, whats home, home bring, bring ...|
|[ladies gentlemen, gentlemen welcome, welcome stage, stag...|
|[ladies gentlemen, gentlemen welcome, welcome joe, joe ro...|
|[petunia wish, wish luck, luck die, die august, august pr...|
|[intro fade, fade music, music lets, lets roll, roll hold...|
|[wow hey, hey thank, thank thanks, thanks thank, thank gu...|
|[hello hello, hello doing, doing great, great thank, than...|
+------------------------------------------------------

## Feature Extractors
#### TFIDF 

In [42]:
from pyspark.ml.feature import HashingTF, IDF

In [46]:
hashingTF = HashingTF(inputCol='no_stop_words', outputCol='tf', numFeatures = 1000)

tfData = hashingTF.transform(ngramDF)

In [48]:
idf = IDF(inputCol = 'tf', outputCol = 'tfidf', minDocFreq = 3)

idfModel = idf.fit(tfData)

rescaledData = idfModel.transform(tfData)

In [49]:
rescaledData.select(['index','tfidf']).show(truncate=50)

+-------+--------------------------------------------------+
|  index|                                             tfidf|
+-------+--------------------------------------------------+
|    ali|(1000,[0,1,2,3,4,5,7,9,10,11,15,16,19,21,22,23,...|
|anthony|(1000,[0,2,3,7,8,10,12,15,18,20,21,22,23,25,27,...|
|   bill|(1000,[0,1,2,3,4,5,6,7,10,11,13,14,15,16,17,21,...|
|     bo|(1000,[0,1,2,6,7,9,12,13,15,16,18,19,21,22,23,2...|
|   dave|(1000,[0,1,2,4,6,7,9,10,11,12,14,16,20,21,22,23...|
|  hasan|(1000,[0,1,2,5,7,10,12,14,15,16,17,18,19,20,22,...|
|    jim|(1000,[0,1,3,7,8,9,10,12,14,15,16,17,18,19,20,2...|
|    joe|(1000,[0,1,2,7,10,13,14,16,19,21,22,23,24,25,27...|
|   john|(1000,[0,1,3,4,7,9,10,14,15,16,21,22,23,24,25,2...|
|  louis|(1000,[0,2,3,5,9,11,12,14,15,16,17,20,22,24,25,...|
|   mike|(1000,[0,1,4,6,7,8,9,10,12,13,14,16,19,20,21,22...|
|  ricky|(1000,[0,1,2,3,4,5,6,7,10,11,12,16,17,18,19,20,...|
+-------+--------------------------------------------------+



### CountVecorizer

In [50]:
from pyspark.ml.feature import CountVectorizer

In [52]:
cv = CountVectorizer(inputCol = 'no_stop_words', outputCol = 'counts', 
                    vocabSize=1000,
                    minDF = 3.0,
                    maxDF = 10,
                    minTF = 3.0)

In [53]:
cvModel = cv.fit(ngramDF)

result = cvModel.transform(ngramDF)

In [55]:
result.select(['index','counts']).show(truncate=50)

+-------+--------------------------------------------------+
|  index|                                            counts|
+-------+--------------------------------------------------+
|    ali|(1000,[0,3,4,5,6,8,9,10,11,12,13,14,17,18,19,20...|
|anthony|(1000,[0,1,2,3,5,7,8,10,11,12,13,15,16,18,21,24...|
|   bill|(1000,[0,1,2,3,4,5,6,7,10,11,15,16,18,20,21,25,...|
|     bo|(1000,[2,3,5,7,9,10,12,13,19,21,23,24,25,26,29,...|
|   dave|(1000,[0,1,2,4,6,10,12,14,16,17,21,22,23,24,25,...|
|  hasan|(1000,[1,2,3,4,8,9,10,11,12,14,15,16,18,21,23,2...|
|    jim|(1000,[0,1,2,3,5,7,9,12,13,14,16,17,18,20,21,22...|
|    joe|(1000,[0,1,2,3,4,5,6,8,9,10,11,13,14,15,16,17,2...|
|   john|(1000,[0,1,2,3,6,8,11,12,13,14,15,18,20,21,22,2...|
|  louis|(1000,[0,1,3,4,5,6,10,11,14,15,17,19,20,24,32,3...|
|   mike|(1000,[0,2,6,9,11,12,14,15,16,17,18,23,24,27,29...|
|  ricky|(1000,[0,1,2,3,5,7,9,11,13,14,15,18,20,21,23,25...|
+-------+--------------------------------------------------+

