In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,lit
from pyspark.sql import functions as F
from pyspark.mllib.stat import Statistics

from utils import Tools
# Project helper object; this notebook uses it for the Spark session and for
# loading/saving DataFrames.
# NOTE(review): 'mhk9c' looks like a user/account id -- confirm, and consider
# lifting it into a config constant so other users can run the notebook.
tools = Tools('mhk9c')
spark = tools.spark

# SparkContext taken from the session above.
sc = spark.sparkContext

import os
import glob

# Create features
import emoji
import re
import datetime


import demoji 
# Runs once at import time, before any emoji conversion is attempted.
# NOTE(review): presumably this fetches demoji's emoji code table over the
# network; newer demoji releases reportedly bundle the codes and deprecate
# this call -- confirm against the installed version.
demoji.download_codes()
import tldextract



In [3]:
# df = tools.create_df_from_csv("russian-troll-tweets-master")
# tools.save_df(df, 'russian-troll-tweets')

In [4]:
# df = tools.load_data("russian-troll-tweets")
# df_english = df.filter(df['language']=='English')
# tools.save_df(df_english, "russian-troll-tweets-english-only")

In [5]:
# Load the previously saved English-only tweet dataset as the working DataFrame.
df = tools.load_data("russian-troll-tweets-english-only")

Done loading from /project/ds5559/team1_sp22/data//russian-troll-tweets-english-only.


In [6]:
# Show column names and types of the loaded dataset.
df.printSchema()

root
 |-- external_author_id: string (nullable = true)
 |-- author: string (nullable = true)
 |-- content: string (nullable = true)
 |-- region: string (nullable = true)
 |-- language: string (nullable = true)
 |-- publish_date: string (nullable = true)
 |-- harvested_date: string (nullable = true)
 |-- following: integer (nullable = true)
 |-- followers: integer (nullable = true)
 |-- updates: integer (nullable = true)
 |-- post_type: string (nullable = true)
 |-- account_type: string (nullable = true)
 |-- retweet: integer (nullable = true)
 |-- account_category: string (nullable = true)
 |-- new_june_2018: integer (nullable = true)
 |-- alt_external_id: string (nullable = true)
 |-- tweet_id: string (nullable = true)
 |-- article_url: string (nullable = true)
 |-- tco1_step1: string (nullable = true)
 |-- tco2_step1: string (nullable = true)
 |-- tco3_step1: string (nullable = true)



### Data munging: feature-extraction helpers and UDFs

In [7]:
import pyspark.sql.functions as func
from pyspark.sql.types import StringType, ArrayType, IntegerType
import re
# Matches "@" followed by one or more ASCII letters or digits.
# NOTE(review): "_" is not in the character class, so a handle containing an
# underscore is cut off at the underscore -- confirm whether that is intended.
rx_b = re.compile(r"@[a-zA-Z0-9]+")
# Matches http/ftp/https URLs: scheme, "://", a dotted host, then a tail of
# path/query characters. Every group is non-capturing, so findall() yields the
# whole matched URL strings.
rx_url = re.compile(r"(?:http|ftp|https):\/\/(?:[\w_-]+(?:(?:\.[\w_-]+)+))(?:[\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])")


# *************************************************************
def convert_emojii(string):
    '''
    Replace emoji in ``string`` with their text descriptions using
    ``demoji.replace_with_desc`` (called with ":" as its second argument).

    Returns the converted string, or the sentinel
    "COULD NOT CONVERT EMOJII" when demoji raises an ordinary exception
    (presumably e.g. for a None / non-string input).
    '''
    try:
        return demoji.replace_with_desc(string, ":")
    except Exception:
        # Only ordinary errors are turned into the sentinel; KeyboardInterrupt
        # and SystemExit are allowed to propagate instead of being swallowed.
        return "COULD NOT CONVERT EMOJII"
# Spark UDF wrapper around convert_emojii; declared to return a string column.
convert_emojii_UDF = func.udf(
    lambda text: convert_emojii(text),
    StringType(),
)
# test = convert_emojii("🐝🐝🐝")   
# print(test)

# *************************************************************
def extract_domain_information(url):
    '''
    Extract domain information from ``url`` with tldextract.

    Returns the registered domain when tldextract reports one. Otherwise the
    non-empty pieces among subdomain, domain and suffix are joined with "."
    (so no leading/trailing/doubled dots are produced when a piece is empty).
    Returns "NA" for a falsy url, when no piece is available, or when
    extraction raises.
    '''
    if not url:
        return "NA"
    try:
        ext = tldextract.extract(url)
        if ext.registered_domain:
            return ext.registered_domain
        # Fallback: keep only the parts that are actually present.
        parts = [part for part in (ext.subdomain, ext.domain, ext.suffix) if part]
        return ".".join(parts) if parts else "NA"
    except Exception:
        return "NA"
# Spark UDF wrapper around extract_domain_information; declared to return a string column.
extract_domain_information_UDF = func.udf(
    lambda url: extract_domain_information(url),
    StringType(),
)
# test = extract_domain_information("http://yhoo.it/1QjSSWw")

# *************************************************************
def extract_handles(content):
    '''
    Return every handle found in ``content`` as a list of strings.

    Handles are whatever the module-level ``rx_b`` pattern matches. Returns an
    empty list when ``content`` is None, when nothing matches, or when the
    search raises an ordinary exception (e.g. a non-string input).
    '''
    if content is None:
        return []
    try:
        return rx_b.findall(content)
    except Exception:
        # Best effort: unusable input means "no handles". KeyboardInterrupt and
        # SystemExit are not caught and propagate.
        return []
# Spark UDF wrapper around extract_handles; declared to return an array of strings.
extract_handles_UDF = func.udf(
    lambda text: extract_handles(text),
    ArrayType(StringType(), True),
)
# test = extract_handles("Hi @MichelleObama , remember when you praised Harvey Weinstein as 'a wonderful human being, a good friend and a powerhouse.")
# print(test)

# *************************************************************
def count_emoji(string):
    '''
    Number of emojis in ``string`` as reported by ``emoji.emoji_count``;
    0 for a falsy input (None or empty string).
    '''
    if not string:
        return 0
    return emoji.emoji_count(string)
# Spark UDF wrapper around count_emoji; declared to return an integer column.
count_emoji_udf = func.udf(
    lambda text: count_emoji(text),
    IntegerType(),
)

# *************************************************************
def extract_emoji(string):
    '''
    Return the distinct emojis of ``string`` converted to their text form.

    Each emoji returned by ``emoji.distinct_emoji_lis`` is passed to
    ``emoji.demojize`` on its own and the results are concatenated, so
    ``demojize`` always receives a string rather than a list. Returns the
    string 'None' for a falsy input and '' when the text has no emoji.
    '''
    if not string:
        return 'None'
    distinct = emoji.distinct_emoji_lis(string)
    # Convert one emoji at a time so adjacent emojis cannot merge into a
    # different sequence when placed next to each other.
    return ''.join(emoji.demojize(symbol) for symbol in distinct)
# Spark UDF wrapper around extract_emoji; declared to return a string column.
extract_emoji_udf = func.udf(
    lambda text: extract_emoji(text),
    StringType(),
)

# *************************************************************
def extract_urls(string):
    '''
    Find every URL in ``string`` using the module-level ``rx_url`` pattern.

    Returns the list of matches (possibly empty), or the string 'None' when
    ``string`` is falsy.
    '''
    if not string:
        return 'None'
    return rx_url.findall(string)
# Spark UDF wrapper around extract_urls.
# NOTE(review): extract_urls returns a Python list (or the string 'None')
# while the declared type is StringType -- confirm the stored representation
# is what downstream code expects.
extract_urls_udf = func.udf(
    lambda text: extract_urls(text),
    StringType(),
)

# *************************************************************
def url_count(string):
    '''
    Number of URLs that extract_urls finds in ``string``; 0 for a falsy input.
    '''
    return len(extract_urls(string)) if string else 0
# Spark UDF wrapper around url_count; declared to return an integer column.
url_count_udf = func.udf(
    lambda text: url_count(text),
    IntegerType(),
)

# *************************************************************
def extract_url_parts(string):
    '''
    Split a URL into parts with a single regular expression
    (https://stackoverflow.com/questions/27745/getting-parts-of-a-url-regex).

    Returns the result of ``re.findall``: a list with one 8-tuple per match
    (index 1 is the scheme, index 2 the host, index 3 the directory path,
    index 5 the final path segment), an empty list when the URL does not fit
    the pattern, or the string 'None' for a falsy input.
    '''
    if not string:
        return 'None'
    # Raw string: the pattern relies on backslash escapes such as \/ \s \w that
    # are invalid escape sequences in a normal string literal.
    pattern = r'^((http[s]?|ftp):\/)?\/?([^:\/\s]+)((\/\w+)*\/)([\w\-\.]+[^#?\s]+)(.*)?(#[\w\-]+)?$'
    return re.findall(pattern, string)

def extract_urls_redirect_base(string_1, string_2, string_3):
    '''
    Build the list of hosts for the three redirect URL columns.

    Each truthy URL is parsed with extract_url_parts and the host (index 2 of
    the first match) is placed at the URL's position; falsy URLs leave ''.

    Returns a 3-element list of hosts, or the string 'None' when ``string_1``
    is falsy or when any supplied URL cannot be parsed.
    '''
    try:
        # Without a first redirect URL the row is reported as 'None',
        # regardless of the other two columns.
        if not string_1:
            return 'None'
        return [
            extract_url_parts(url)[0][2] if url else ''
            for url in (string_1, string_2, string_3)
        ]
    except Exception:
        # e.g. IndexError when a URL does not match the pattern. Ordinary
        # errors map to 'None'; KeyboardInterrupt/SystemExit propagate.
        return 'None'
# Spark UDF wrapper around extract_urls_redirect_base (three input columns).
# NOTE(review): the wrapped function returns a Python list (or the string
# 'None') while the declared type is StringType -- confirm the stored
# representation is what downstream code expects.
extract_urls_redirect_base_udf = func.udf(
    lambda url_1, url_2, url_3: extract_urls_redirect_base(url_1, url_2, url_3),
    StringType(),
)

# *************************************************************
def word_count(string):
    '''
    Count the whitespace-separated words in ``string``.

    Splits on any run of whitespace (spaces, tabs, newlines), so repeated or
    leading/trailing spaces do not produce empty "words". Returns 0 for a
    falsy input (None or empty string).
    '''
    if not string:
        return 0
    return len(string.split())
# Spark UDF wrapper around word_count; declared to return an integer column.
word_count_udf = func.udf(
    lambda text: word_count(text),
    IntegerType(),
)

# *************************************************************
def character_count(string):
    '''
    Length of ``string`` in characters; 0 for a falsy input (None or '').
    '''
    return len(string) if string else 0
# Spark UDF wrapper around character_count; declared to return an integer column.
character_count_udf = func.udf(
    lambda text: character_count(text),
    IntegerType(),
)

# *************************************************************
def extract_date_info(string, info_type):
    '''
    Extract one component of a date string formatted as '%m/%d/%Y %H:%M'.

    Parameters:
        string: the date text, e.g. '10/1/2017 19:58'.
        info_type: one of 'minute', 'hour', 'day', 'month', 'year'.

    Returns:
        The requested component as an int, or None when ``string`` is falsy
        (None or empty).

    Raises:
        ValueError: if ``info_type`` is not supported, or if ``string`` does
        not match the expected format (raised by ``strptime``).
    '''
    supported = ('minute', 'hour', 'day', 'month', 'year')
    if info_type not in supported:
        # Fail with a clear message instead of reaching an unbound local.
        raise ValueError(
            f"unsupported info_type {info_type!r}; expected one of {supported}"
        )
    if not string:
        return None
    date = datetime.datetime.strptime(string, '%m/%d/%Y %H:%M')
    # The supported names are exactly the datetime attribute names.
    return getattr(date, info_type)
# Spark UDF wrapper around extract_date_info (date string, component name);
# declared to return an integer column.
extract_date_info_udf = func.udf(
    lambda date_text, component: extract_date_info(date_text, component),
    IntegerType(),
)

# *************************************************************

def assignLabel(account_category):
    '''
    Label a tweet by its account category: 1 (troll) for "RightTroll",
    "LeftTroll" or "Fearmonger", 0 (not-troll) for anything else.
    '''
    troll_categories = ("RightTroll", "LeftTroll", "Fearmonger")
    return 1 if account_category in troll_categories else 0
# test = assignLabel("Commercial")
# print(test)
# Spark UDF wrapper around assignLabel; declared to return an integer column.
assignLabel_udf = func.udf(
    assignLabel,
    IntegerType(),
)



In [8]:
# Derive every feature column in one chained expression, then drop rows
# without content.
# There is one row without content..
df_enriched = (
    df.withColumn("curated_content", convert_emojii_UDF(col("content")))
    .withColumn("tco1_step1_domain", extract_domain_information_UDF(col("tco1_step1")))
    .withColumn("tco2_step1_domain", extract_domain_information_UDF(col("tco2_step1")))
    .withColumn("tco3_step1_domain", extract_domain_information_UDF(col("tco3_step1")))
    .withColumn("handles", extract_handles_UDF(col("content")))
    .withColumn('emoji_count', count_emoji_udf(col('content')))
    .withColumn('emoji_text', extract_emoji_udf(col('content')))
    .withColumn('word_count', word_count_udf(col('content')))
    .withColumn('char_count', character_count_udf(col('content')))
    .withColumn('urls', extract_urls_udf(col('content')))
    .withColumn('url_count', url_count_udf(col('content')))
    .withColumn(
        'url_hosts',
        extract_urls_redirect_base_udf(col('tco1_step1'), col('tco2_step1'), col('tco3_step1')),
    )
    .withColumn('label', assignLabel_udf(df['account_category']))
    .dropna(subset="content")
)

# Persist the enriched dataset so later cells can reload it.
print("Saving File...")
tools.save_df(df_enriched, "russian-troll-tweets-enriched")

Saving File...
/project/ds5559/team1_sp22/data/russian-troll-tweets-enriched
Saved as: /project/ds5559/team1_sp22/data/russian-troll-tweets-enriched


In [9]:
# Reload the enriched dataset saved by the previous cell.
df_test = tools.load_data("russian-troll-tweets-enriched")

Done loading from /project/ds5559/team1_sp22/data//russian-troll-tweets-enriched.


In [None]:
# df_test.select(['content','handles']).show(5, False)
# Spot check: show content and extracted urls for up to 5 tweets whose content
# contains "LindseyGrahamSC" (second show() argument False = no truncation).
df_test.filter("content like '%LindseyGrahamSC%'" ).select(['content','urls']).show(5, False)
# df.filter("content like '%LindseyGrahamSC%'" ).select(['content']).show(5, False)

In [None]:
# Register the enriched data as the SQL temp view "tweets".
# NOTE(review): the next cell repeats this exact call, so this cell looks redundant.
df_test.createOrReplaceTempView("tweets")

In [None]:
# Register the view, then count tweets per first-redirect domain, most frequent first.
df_test.createOrReplaceTempView("tweets")
sqlDF = spark.sql("SELECT tco1_step1_domain, count(tco1_step1_domain) FROM tweets GROUP BY tco1_step1_domain ORDER BY COUNT(tco1_step1_domain) DESC")
# show() with no arguments prints only the first rows of the result.
sqlDF.show()

In [10]:
# load pyspark modules
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import *  
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.classification import LogisticRegression


In [11]:
# Text features: tokenize the tweet, drop stop words, hash to term frequencies.
tok_content = Tokenizer(inputCol="content", outputCol="words")
remover_content = StopWordsRemover(inputCol="words", outputCol="words_filtered")
htf_content = HashingTF(
    inputCol="words_filtered",
    outputCol="content_htf",
    numFeatures=200,
)

# Assemble the feature vector consumed by the classifier.
va = VectorAssembler(inputCols=["content_htf"], outputCol="features")
lr = LogisticRegression(
    labelCol='label',
    featuresCol='features',
    maxIter=10,
    regParam=0.01,
)

# Fit the pipeline
pipeline = Pipeline(stages=[tok_content, remover_content, htf_content, va, lr])
# model = pipeline.fit(training)
# NOTE(review): fitted on the whole enriched dataset; no train/test split is
# visible in this notebook -- confirm before reporting any metrics.
model = pipeline.fit(df_test)

In [None]:
# NOTE(review): `split1` is not defined anywhere in the visible notebook, so
# this cell raises NameError on a fresh kernel -- presumably it came from a
# removed or out-of-view cell; confirm and restore its definition.
last_bad = split1

In [None]:
# Inspect up to 10 rows of the suspect split, untruncated.
last_bad.select(['content', 'external_author_id', 'publish_date']).show(10, False)

In [None]:
# Debug check: list any rows in the enriched data whose content is still NULL.
df_test.createOrReplaceTempView("tweets")
sqlDF = spark.sql("SELECT content, external_author_id from tweets where content IS NULL")
sqlDF.show()

In [None]:
# Debug check: after dropping rows with null content, the filter below should
# print no rows.
test = last_bad.dropna(subset="content")
print('*')
test.filter("content is NULL").show(5)

https://csyhuang.github.io/2020/08/01/custom-transformer/