In [1]:
#libraries / dependencies
import numpy as np
import math
import random

from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext

from pyspark.sql.functions import col, udf, to_timestamp, lit, to_timestamp, when, rand
from pyspark.sql.types import IntegerType, LongType, DoubleType, StringType, ArrayType
from pyspark.ml.feature import Normalizer, StandardScaler, MinMaxScaler, VectorAssembler

In [2]:
import findspark

findspark.init()
findspark.find()

'C:\\Users\\USER\\AppData\\Local\\Programs\\Python\\Python310\\Lib\\site-packages\\pyspark'

In [3]:

#Dataset location

#Local
bot_tweets_dataset_path = 'F://TwitterBotDataset//tweet_dataset_full//bot_tweets//'
genuine_tweets_dataset_path = 'F://TwitterBotDataset//tweet_dataset_full//genuine_tweets//'
#turn a line of text into d dimentional vector 
GLOVE_DIR = ""

#all columns
BOT_COLUMNS = ['id','text','source','user_id','truncated','in_reply_to_status_id', 
               'in_reply_to_user_id','in_reply_to_screen_name', 'retweeted_status_id',
               'geo','place','contributors','retweet_count', 'reply_count','favorite_count',
               'favorited', 'retweeted','possibly_sensitive','num_hashtags','num_urls',
               'num_mentions','created_at','timestamp','crawled_at', 'updated']

GENUINE_COLUMNS = ['id','text','source','user_id','truncated','in_reply_to_status_id', 
                   'in_reply_to_user_id','in_reply_to_screen_name', 'retweeted_status_id',
                   'geo','place','contributors','retweet_count', 'reply_count','favorite_count',
                   'favorited', 'retweeted','possibly_sensitive','num_hashtags','num_urls',
                   'num_mentions','REMOVE_IT', 'created_at','timestamp','crawled_at', 'updated',]

#feature used for bot detection
COLUMN_NAMES = ['text', 'retweet_count', 'reply_count', 'favorite_count',
                'num_hashtags', 'num_urls', 'num_mentions']

In [4]:
#local multi thread spark thread
conf = SparkConf()
conf.setMaster("local[10]").setAppName("split_dataset_test_session")
sc = SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()
spark

In [5]:
# read dataset from csv
def read_dataset():
    bot_tweets = spark.read.csv(bot_tweets_dataset_path, header = True, inferSchema = True).limit(100)
    genuine_tweets = spark.read.csv(genuine_tweets_dataset_path, header = True, inferSchema = True).limit(100)
    
#     print(len(bot_tweets.collect()), len(genuine_tweets.collect()))
    return bot_tweets, genuine_tweets

df_bot, df_gen = read_dataset()

In [6]:
#show dataframe schema
df_bot.printSchema(), df_gen.printSchema()

root
 |-- id: long (nullable = true)
 |-- text: string (nullable = true)
 |-- source: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- truncated: string (nullable = true)
 |-- in_reply_to_status_id: string (nullable = true)
 |-- in_reply_to_user_id: string (nullable = true)
 |-- in_reply_to_screen_name: string (nullable = true)
 |-- retweeted_status_id: string (nullable = true)
 |-- geo: string (nullable = true)
 |-- place: string (nullable = true)
 |-- contributors: string (nullable = true)
 |-- retweet_count: string (nullable = true)
 |-- reply_count: string (nullable = true)
 |-- favorite_count: long (nullable = true)
 |-- favorited: integer (nullable = true)
 |-- retweeted: string (nullable = true)
 |-- possibly_sensitive: string (nullable = true)
 |-- num_hashtags: string (nullable = true)
 |-- num_urls: string (nullable = true)
 |-- num_mentions: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- cr

(None, None)

In [7]:
df_bot.head(2)

[Row(id=532627591686275072, text='I Pooh - In silenzio 1968 http://t.co/ahvQxUqTws', source='"<a href=""http://www.facebook.com/twitter"" rel=""nofollow"">Facebook</a>"', user_id='24858289', truncated=None, in_reply_to_status_id='0', in_reply_to_user_id='0', in_reply_to_screen_name=None, retweeted_status_id='0', geo=None, place=None, contributors=None, retweet_count='0', reply_count='0', favorite_count=0, favorited=None, retweeted=None, possibly_sensitive=None, num_hashtags='0', num_urls='1', num_mentions='0', created_at='Wed Nov 12 20:14:48 +0000 2014', timestamp='2014-11-12 21:14:48', crawled_at='2014-11-12 21:44:09', updated='2014-11-12 21:44:09'),
 Row(id=532624255058706432, text='http://t.co/HyI5EQKz6Q', source='"<a href=""http://www.facebook.com/twitter"" rel=""nofollow"">Facebook</a>"', user_id='24858289', truncated=None, in_reply_to_status_id='0', in_reply_to_user_id='0', in_reply_to_screen_name=None, retweeted_status_id='0', geo=None, place=None, contributors=None, retweet_cou

In [8]:
df_gen.head(2)

[Row(593932392663912449='593895316719423488', RT @morningJewshow: Speaking about Jews and comedy tonight at Temple Emanu-El in San Francisco. In other words, my High Holidays.="This age/face recognition thing..no reason platforms can't have changing avatars of our actual faces to increase affect/better communication", <a href="http://tapbots.com/tweetbot" rel="nofollow">Tweetbot for iΟS</a>='<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>', 678033='678033', _c4=None, 05='0', 06='0', _c7=None, 593932168524533760='0', \N9='\\N', \N10='\\N', _c11=None, _c12=None, 113='0', 014='0', 015='0', _c16=None, _c17=None, \N18='\\N', 019='0', 020='0', 121='0', Fri May 01 00:18:11 +0000 2015='Thu Apr 30 21:50:52 +0000 2015', 2015-05-01 02:18:11='2015-04-30 23:50:52', 2015-05-01 12:57:1924='2015-05-01 12:57:19', 2015-05-01 12:57:1925='2015-05-01 12:57:19'),
 Row(593932392663912449='593880638069018624', RT @morningJewshow: Speaking about Jews and comedy tonight at Temple Emanu-El in 

In [129]:
num_of_thread = 5
split_weights = [1.0] * num_of_thread

In [130]:
split_weights

[1.0, 1.0, 1.0, 1.0, 1.0]

In [131]:
df_bot_splits = df_bot.randomSplit(split_weights, 27)

In [132]:
counts = [df.count() for df in df_bot_splits]

In [133]:
counts

[13, 26, 22, 18, 21]

In [128]:
sum(counts)

100

In [82]:
data_list = [(df_bot_splits[i], i) for i in range(num_of_thread)]

In [86]:
len(data_list)
data_list[1][0].count()

18

In [106]:
def worker_job(args):
    #do data processing
    #do model training
    print("{}: Total examples: {}".format(args[1], args[0].count()))
    df = args[0]
    for row in df.collect():
        print("{}: {}".format(args[1], row[0]))
    return args[0].count()
    

In [107]:
#multitasking using ThreadPool
from multiprocessing.pool import ThreadPool
pool = ThreadPool(num_of_thread)
res = pool.map(worker_job, data_list)

1: Total examples: 180: Total examples: 26
3: Total examples: 19
2: Total examples: 15

3: 522845834543501313
3: 523202287414833152
3: 523930571656151040
3: 524624746663903232
3: 525000669225582592
3: 525007738036690945
3: 525027778530672640
3: 525339479234859008
3: 525340085638942720
3: 525342127560359936
3: 525353592631599104
3: 525953529824165888
3: 527073951315689473
3: 527183738292416513
3: 528260714885312512
3: 528839797335687171
3: 530618651008069632
3: 531892384221908992
3: 532627591686275072
0: 522455698064818176
0: 523523182859653120
0: 523549266804232193
0: 523558799995838465
0: 523889574205194241
0: 524286576369475584
0: 525154798350663680
0: 525342561071013888
0: 525874356702826496
0: 525986615265468416
0: 527184078232367104
0: 527184698569932802
0: 528611416874708994
0: 528860735347384320
0: 529151831256338432
0: 529358076189896706
0: 529970276189372416
0: 530422704844079104
0: 530455234251091969
0: 530456918553526274
0: 530692157708197888
0: 530818316462718980
0: 5309663

In [108]:
res

[26, 18, 15, 19, 22]

In [109]:
from threading import Thread
for idx in range(num_of_thread):
    thread = Thread(target=worker_job, kwargs={'args' : data_list[idx]})
    thread.start()
    thread.join()

0: Total examples: 26
0: 522455698064818176
0: 523523182859653120
0: 523549266804232193
0: 523558799995838465
0: 523889574205194241
0: 524286576369475584
0: 525154798350663680
0: 525342561071013888
0: 525874356702826496
0: 525986615265468416
0: 527184078232367104
0: 527184698569932802
0: 528611416874708994
0: 528860735347384320
0: 529151831256338432
0: 529358076189896706
0: 529970276189372416
0: 530422704844079104
0: 530455234251091969
0: 530456918553526274
0: 530692157708197888
0: 530818316462718980
0: 530966309065195520
0: 531204588414590976
0: 532297646669852672
0: 532624255058706432
1: Total examples: 18
1: 523530536783183875
1: 523530706149187585
1: 524424861427728385
1: 524631813957115904
1: 525737191281262592
1: 526106221494140928
1: 526296216414720000
1: 526303221007265792
1: 527508978490945536
1: 527948171138580480
1: 528446839230849024
1: 528842393525960704
1: 528864699891216386
1: 529354671081914368
1: 529358614876946432
1: 531201844119625728
1: 531205051705212928
1: 5315646

In [111]:
from pyspark import InheritableThread

for idx in range(num_of_thread):
    thread = InheritableThread(target = worker_job, kwargs={'args':data_list[idx]})
    thread.start()
    thread.join()
    

0: Total examples: 26
0: 522455698064818176
0: 523523182859653120
0: 523549266804232193
0: 523558799995838465
0: 523889574205194241
0: 524286576369475584
0: 525154798350663680
0: 525342561071013888
0: 525874356702826496
0: 525986615265468416
0: 527184078232367104
0: 527184698569932802
0: 528611416874708994
0: 528860735347384320
0: 529151831256338432
0: 529358076189896706
0: 529970276189372416
0: 530422704844079104
0: 530455234251091969
0: 530456918553526274
0: 530692157708197888
0: 530818316462718980
0: 530966309065195520
0: 531204588414590976
0: 532297646669852672
0: 532624255058706432
1: Total examples: 18
1: 523530536783183875
1: 523530706149187585
1: 524424861427728385
1: 524631813957115904
1: 525737191281262592
1: 526106221494140928
1: 526296216414720000
1: 526303221007265792
1: 527508978490945536
1: 527948171138580480
1: 528446839230849024
1: 528842393525960704
1: 528864699891216386
1: 529354671081914368
1: 529358614876946432
1: 531201844119625728
1: 531205051705212928
1: 5315646

In [None]:
spark.stop()