In [1]:
from utils_nlp import Tools
tools = Tools('mhk9c')

installing package 1
installing package 2
installing package 3
installing package 4
installing package 5
installing package 6
installing package 7
Done Installing packages


In [2]:
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
import sparknlp
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.clustering import LDA

from pyspark.sql import functions as F

In [3]:
import sparknlp
spark = sparknlp.start()

### Data Load and Prep

In [5]:
df = tools.load_data(spark, "russian-troll-tweets-enriched")
df = df.withColumn("publish_date_timestamp",F.to_timestamp(F.col("publish_date"),"M/d/yyyy H:mm"))
df = df.withColumn("publish_date_date",F.to_date(F.col("publish_date_timestamp")))
df = df.withColumn("publish_hour", F.hour(F.col("publish_date_timestamp")))

df = df.filter((df["publish_date_date"] >= F.lit("2014-10-14")) & (df["publish_date_date"] <= F.lit("2017-12-14"))) 
df_troll = df.filter(df['label']==1)
df_nontroll = df.filter(df['label']==0)

df_wikileaks = df.filter((df["publish_date_date"] >= F.lit("2016-10-05")) & (df["publish_date_date"] <= F.lit("2016-10-07"))) 
df_wikileaks_troll = df.filter(df['label']==1)

df_utr = df.filter((df["publish_date_date"] >= F.lit("2017-08-01")) & (df["publish_date_date"] <= F.lit("2017-08-31"))) 
df_utr_troll = df.filter(df['label']==1)


# df.printSchema()

Done loading from /project/ds5559/team1_sp22/data//russian-troll-tweets-enriched.


In [6]:
# Spark NLP requires the input dataframe or column to be converted to document. 
document_assembler = DocumentAssembler() \
    .setInputCol("content") \
    .setOutputCol("document") \
    .setCleanupMode("shrink")

In [7]:
# Split sentence to tokens(array)
tokenizer = Tokenizer() \
  .setInputCols(["document"]) \
  .setOutputCol("token")

In [8]:
# clean unwanted characters and garbage
normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")

In [9]:
# remove stopwords
stopwords_cleaner = StopWordsCleaner()\
      .setInputCols("normalized")\
      .setOutputCol("cleanTokens")\
      .setCaseSensitive(False)

In [10]:
# stem the words to bring them to the root form.
stemmer = Stemmer() \
    .setInputCols(["cleanTokens"]) \
    .setOutputCol("stem")

In [11]:
finisher = Finisher() \
    .setInputCols(["stem"]) \
    .setOutputCols(["tokens"]) \
    .setOutputAsArray(True) \
    .setCleanAnnotations(False)

In [12]:
# We build a ml pipeline so that each phase can be executed in sequence. This pipeline can also be used to test the model. 
nlp_pipeline = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            stemmer, 
            finisher])

In [15]:
def run_pipeline(_df):
    # train the pipeline
    nlp_model = nlp_pipeline.fit(_df)
    
    # apply the pipeline to transform dataframe.
    processed_df  = nlp_model.transform(_df)
    
    # tokens_df = processed_df.select('publish_date','tokens').limit(10000)
    tokens_df = processed_df.select('publish_date','tokens')
    
    cv = CountVectorizer(inputCol="tokens", outputCol="features", vocabSize=500, minDF=3.0)
    
    # train the model
    cv_model = cv.fit(tokens_df)
    
    # transform the data. Output column name will be features.
    vectorized_tokens = cv_model.transform(tokens_df)
    
    num_topics = 3
    lda = LDA(k=num_topics, maxIter=10)
    model = lda.fit(vectorized_tokens)
    ll = model.logLikelihood(vectorized_tokens)
    lp = model.logPerplexity(vectorized_tokens)
    print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
    print("The upper bound on perplexity: " + str(lp))
    
    # extract vocabulary from CountVectorizer
    vocab = cv_model.vocabulary
    topics = model.describeTopics()   
    topics_rdd = topics.rdd
    topics_words = topics_rdd\
           .map(lambda row: row['termIndices'])\
           .map(lambda idx_list: [vocab[idx] for idx in idx_list])\
           .collect()
    for idx, topic in enumerate(topics_words):
        print("topic: {}".format(idx))
        print("*"*25)
        for word in topic:
           print(word)
        print("*"*25)

In [16]:
run_pipeline(df_wikileaks_troll)

The lower bound on the log likelihood of the entire corpus: -26368732.106566556
The upper bound on perplexity: 6.066693609549476
topic: 0
*************************
peopl
black
u
im
dont
like
get
know
white
make
*************************
topic: 1
*************************
trump
break
hillari
new
look
clinton
u
video
presid
cnn
*************************
topic: 2
*************************
rt
new
amp
obama
get
kill
american
realdonaldtrump
man
trump
*************************
