In [1]:
# Locate the Spark installation so a session can be used from this notebook
import findspark
findspark.init()

# Create (or reuse) a SparkSession that runs locally on all available cores
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/14 18:27:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Read the training CSV: the first row is the header, column types are
# inferred, and multiLine is enabled for the parser.
# NOTE(review): the preview of row 0 shows `text` starting with a stray
# double quote - confirm the quote/escape options match how the file was written.
data = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .option('multiLine', True)
    .csv('data/train1.csv')
)

                                                                                

In [3]:
# Report the size of the dataset, then its inferred schema.
n_columns = len(data.dtypes)
print('Total Columns: %d' % n_columns)

n_rows = data.count()  # triggers a Spark job
print('Total Rows: %d' % n_rows)

data.printSchema()

Total Columns: 5


[Stage 2:>                                                          (0 + 1) / 1]

Total Rows: 20800
root
 |-- id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- author: string (nullable = true)
 |-- text: string (nullable = true)
 |-- label: integer (nullable = true)



                                                                                

In [4]:
# Preview the first 20 rows with long cell values truncated (the defaults, spelled out)
data.show(n=20, truncate=True)

+---+--------------------+--------------------+--------------------+-----+
| id|               title|              author|                text|label|
+---+--------------------+--------------------+--------------------+-----+
|  0|House Dem Aide: W...|       Darrell Lucus|"House Dem Aide: ...|    1|
|  1|FLYNN: Hillary Cl...|     Daniel J. Flynn|Ever get the feel...|    0|
|  2|Why the Truth Mig...|  Consortiumnews.com|Why the Truth Mig...|    1|
|  3|15 Civilians Kill...|     Jessica Purkiss|Videos 15 Civilia...|    1|
|  4|Iranian woman jai...|      Howard Portnoy|Print An Iranian ...|    1|
|  5|Jackie Mason: Hol...|     Daniel Nussbaum|In these trying t...|    0|
|  6|Life: Life Of Lux...|                null|Ever wonder how B...|    1|
|  7|Benoît Hamon Wins...|     Alissa J. Rubin|PARIS  —   France...|    0|
|  8|Excerpts From a D...|                null|Donald J. Trump i...|    0|
|  9|A Back-Channel Pl...|Megan Twohey and ...|A week before Mic...|    0|
| 10|Obama’s Organizin...

### Check Class Balance

In [5]:
# Count how many rows fall under each label value
labels = data.groupBy('label').count()
labels.show()

[Stage 6:>                                                          (0 + 1) / 1]

+-----+-----+
|label|count|
+-----+-----+
|    1|10413|
|    0|10387|
+-----+-----+



                                                                                

### Text Preprocessing

In [16]:
from pyspark.ml.feature import Word2Vec, Word2VecModel
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover
from pyspark.sql.functions import regexp_replace, array, col, udf, split
from pyspark.ml import Pipeline
from sparknlp.annotator import Lemmatizer
import string

In [7]:
# remove punctuation
# Characters to strip: ASCII punctuation plus the em dash seen in the article text.
punctuation = string.punctuation
punctuation += '—'

# Deletion table built once, so each call is a single pass over the string
# instead of one str.replace pass per punctuation character.
_PUNC_TABLE = str.maketrans('', '', punctuation)

def remove_punc(x):
    """Return `x` with every character listed in `punctuation` removed.

    Parameters
    ----------
    x : str or None
        Text to clean.

    Returns
    -------
    str or None
        The cleaned text. `None` is passed through unchanged so that a null
        cell does not raise inside a UDF; callers decide how to treat
        missing text afterwards.
    """
    if x is None:
        return None
    return x.translate(_PUNC_TABLE)

# Expose remove_punc to Spark as a UDF (no returnType is given to udf())
rp_udf = udf(remove_punc)

# Add the punctuation-free text as a new column next to the original
cleaned_text_col = rp_udf(col('text'))
data1 = data.withColumn("text_cleaned", cleaned_text_col)
data1.show()

[Stage 9:>                                                          (0 + 1) / 1]

+---+--------------------+--------------------+--------------------+-----+--------------------+
| id|               title|              author|                text|label|        text_cleaned|
+---+--------------------+--------------------+--------------------+-----+--------------------+
|  0|House Dem Aide: W...|       Darrell Lucus|"House Dem Aide: ...|    1|House Dem Aide We...|
|  1|FLYNN: Hillary Cl...|     Daniel J. Flynn|Ever get the feel...|    0|Ever get the feel...|
|  2|Why the Truth Mig...|  Consortiumnews.com|Why the Truth Mig...|    1|Why the Truth Mig...|
|  3|15 Civilians Kill...|     Jessica Purkiss|Videos 15 Civilia...|    1|Videos 15 Civilia...|
|  4|Iranian woman jai...|      Howard Portnoy|Print An Iranian ...|    1|Print An Iranian ...|
|  5|Jackie Mason: Hol...|     Daniel Nussbaum|In these trying t...|    0|In these trying t...|
|  6|Life: Life Of Lux...|                null|Ever wonder how B...|    1|Ever wonder how B...|
|  7|Benoît Hamon Wins...|     Alissa J.

                                                                                

In [8]:
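# tokenizer (standalone version, superseded by the pipeline built below; kept for reference)
# tokenizer = RegexTokenizer(inputCol='text_cleaned', outputCol="tokens")
# data1 = tokenizer.transform(data1)  # was transform(data), which has no text_cleaned column
# data1.show()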

In [9]:
# remove stopwords (standalone version, superseded by the pipeline built below; kept for reference)
# remover = StopWordsRemover(inputCol="tokens", outputCol="text_rm_stop")
# data1 = remover.transform(data1)
# data1.show()

### Build Pipeline

In [10]:
# Stage 1: split the cleaned text into tokens
tokenizer = RegexTokenizer(inputCol="text_cleaned", outputCol="tokens")

# Stage 2: drop stop words from the token list
stopwords = StopWordsRemover(inputCol="tokens", outputCol="text_rm_stop")

# Chain both stages so they run as a single transformation
pipeline_stages = [tokenizer, stopwords]
nlpPipeline = Pipeline(stages=pipeline_stages)

In [26]:
# Fit the pipeline on the cleaned data, then apply it to the same frame
nlp_model = nlpPipeline.fit(data1)
final_df = nlp_model.transform(data1)

### Lemmatizer

In [27]:
import nltk
from nltk.stem import WordNetLemmatizer

# Download the 'wordnet' and 'omw-1.4' NLTK packages (reported as already up-to-date when present)
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

def lems(x):
    """Lemmatize each word of the iterable `x` and return the results as a list."""
    return [lemmatizer.lemmatize(token) for token in x]

# No returnType is passed to udf(), so Spark applies its default (string):
# the "lem" column holds the list rendered as text, not an array column.
lem_udf = udf(lems)

final_df = final_df.withColumn("lem", lem_udf(col('text_rm_stop')))
final_df.show()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/carolynliu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/carolynliu/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[Stage 13:>                                                         (0 + 1) / 1]

+---+--------------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+
| id|               title|              author|                text|label|        text_cleaned|              tokens|        text_rm_stop|                 lem|
+---+--------------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+
|  0|House Dem Aide: W...|       Darrell Lucus|"House Dem Aide: ...|    1|House Dem Aide We...|[house, dem, aide...|[house, dem, aide...|[house, dem, aide...|
|  1|FLYNN: Hillary Cl...|     Daniel J. Flynn|Ever get the feel...|    0|Ever get the feel...|[ever, get, the, ...|[ever, get, feeli...|[ever, get, feeli...|
|  2|Why the Truth Mig...|  Consortiumnews.com|Why the Truth Mig...|    1|Why the Truth Mig...|[why, the, truth,...|[truth, might, ge...|[truth, might, ge...|
|  3|15 Civilians Kill...|     Jessica Purkiss

                                                                                

In [32]:
# "lem" comes out of lem_udf as the *string* rendering of a list, e.g.
# "[house, dem, aide]" (the UDF was created without a returnType). Splitting
# that on "," alone leaves tokens such as "[house", " dem" and " aide]", so
# strip the enclosing brackets first and split on a comma plus any whitespace.
# NOTE: an empty list ("[]") becomes a one-element array holding "".
# NOTE: this cell overwrites final_df, so it is not safe to run twice in a row.
lem_text = regexp_replace(col("lem"), r"^\[|\]$", "")
final_df = final_df.select(
    split(lem_text, r",\s*").alias("lem"),
    col("id"),
    col("label"),
)

In [35]:
# Print the schema of final_df to check the column types after the split
final_df.printSchema()

root
 |-- lem: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- id: integer (nullable = true)
 |-- label: integer (nullable = true)

