In [1]:
#libraries / dependencies
import numpy as np
import math
import random

from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext

from pyspark.sql.functions import col, udf, to_timestamp, lit, to_timestamp, when, rand
from pyspark.sql.types import IntegerType, LongType, DoubleType, StringType, ArrayType
from pyspark.ml.feature import Normalizer, StandardScaler, MinMaxScaler, VectorAssembler

In [2]:
import findspark

findspark.init()
findspark.find()

'C:\\Users\\USER\\AppData\\Local\\Programs\\Python\\Python310\\Lib\\site-packages\\pyspark'

In [3]:

#Dataset location

#Local
bot_tweets_dataset_path = 'F://TwitterBotDataset//tweet_dataset_full//bot_tweets//'
genuine_tweets_dataset_path = 'F://TwitterBotDataset//tweet_dataset_full//genuine_tweets//'
#turn a line of text into d dimentional vector 
GLOVE_DIR = ""

#all columns
BOT_COLUMNS = ['id','text','source','user_id','truncated','in_reply_to_status_id', 
               'in_reply_to_user_id','in_reply_to_screen_name', 'retweeted_status_id',
               'geo','place','contributors','retweet_count', 'reply_count','favorite_count',
               'favorited', 'retweeted','possibly_sensitive','num_hashtags','num_urls',
               'num_mentions','created_at','timestamp','crawled_at', 'updated']

GENUINE_COLUMNS = ['id','text','source','user_id','truncated','in_reply_to_status_id', 
                   'in_reply_to_user_id','in_reply_to_screen_name', 'retweeted_status_id',
                   'geo','place','contributors','retweet_count', 'reply_count','favorite_count',
                   'favorited', 'retweeted','possibly_sensitive','num_hashtags','num_urls',
                   'num_mentions','REMOVE_IT', 'created_at','timestamp','crawled_at', 'updated',]

#feature used for bot detection
COLUMN_NAMES = ['text', 'retweet_count', 'reply_count', 'favorite_count',
                'num_hashtags', 'num_urls', 'num_mentions']

In [4]:
#local multi thread spark thread
conf = SparkConf()
conf.setMaster("local[10]").setAppName("split_dataset_test_session")
sc = SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()
spark

In [6]:
# read dataset from csv
def read_dataset():
    bot_tweets = spark.read.csv(bot_tweets_dataset_path, header = True, inferSchema = True).limit(100)
    genuine_tweets = spark.read.csv(genuine_tweets_dataset_path, header = True, inferSchema = True).limit(100)
    
#     print(len(bot_tweets.collect()), len(genuine_tweets.collect()))
    return bot_tweets, genuine_tweets

df_bot, df_gen = read_dataset()

In [12]:
#show dataframe schema
df_bot.printSchema(), df_gen.printSchema()

root
 |-- id: long (nullable = true)
 |-- text: string (nullable = true)
 |-- source: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- truncated: string (nullable = true)
 |-- in_reply_to_status_id: string (nullable = true)
 |-- in_reply_to_user_id: string (nullable = true)
 |-- in_reply_to_screen_name: string (nullable = true)
 |-- retweeted_status_id: string (nullable = true)
 |-- geo: string (nullable = true)
 |-- place: string (nullable = true)
 |-- contributors: string (nullable = true)
 |-- retweet_count: string (nullable = true)
 |-- reply_count: string (nullable = true)
 |-- favorite_count: long (nullable = true)
 |-- favorited: integer (nullable = true)
 |-- retweeted: string (nullable = true)
 |-- possibly_sensitive: string (nullable = true)
 |-- num_hashtags: string (nullable = true)
 |-- num_urls: string (nullable = true)
 |-- num_mentions: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- cr

(None, None)

In [14]:
df_bot.head(2)

[Row(id=532627591686275072, text='I Pooh - In silenzio 1968 http://t.co/ahvQxUqTws', source='"<a href=""http://www.facebook.com/twitter"" rel=""nofollow"">Facebook</a>"', user_id='24858289', truncated=None, in_reply_to_status_id='0', in_reply_to_user_id='0', in_reply_to_screen_name=None, retweeted_status_id='0', geo=None, place=None, contributors=None, retweet_count='0', reply_count='0', favorite_count=0, favorited=None, retweeted=None, possibly_sensitive=None, num_hashtags='0', num_urls='1', num_mentions='0', created_at='Wed Nov 12 20:14:48 +0000 2014', timestamp='2014-11-12 21:14:48', crawled_at='2014-11-12 21:44:09', updated='2014-11-12 21:44:09'),
 Row(id=532624255058706432, text='http://t.co/HyI5EQKz6Q', source='"<a href=""http://www.facebook.com/twitter"" rel=""nofollow"">Facebook</a>"', user_id='24858289', truncated=None, in_reply_to_status_id='0', in_reply_to_user_id='0', in_reply_to_screen_name=None, retweeted_status_id='0', geo=None, place=None, contributors=None, retweet_cou

In [15]:
df_gen.head(2)

[Row(593932392663912449='593895316719423488', RT @morningJewshow: Speaking about Jews and comedy tonight at Temple Emanu-El in San Francisco. In other words, my High Holidays.="This age/face recognition thing..no reason platforms can't have changing avatars of our actual faces to increase affect/better communication", <a href="http://tapbots.com/tweetbot" rel="nofollow">Tweetbot for iΟS</a>='<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>', 678033='678033', _c4=None, 05='0', 06='0', _c7=None, 593932168524533760='0', \N9='\\N', \N10='\\N', _c11=None, _c12=None, 113='0', 014='0', 015='0', _c16=None, _c17=None, \N18='\\N', 019='0', 020='0', 121='0', Fri May 01 00:18:11 +0000 2015='Thu Apr 30 21:50:52 +0000 2015', 2015-05-01 02:18:11='2015-04-30 23:50:52', 2015-05-01 12:57:1924='2015-05-01 12:57:19', 2015-05-01 12:57:1925='2015-05-01 12:57:19'),
 Row(593932392663912449='593880638069018624', RT @morningJewshow: Speaking about Jews and comedy tonight at Temple Emanu-El in 

In [33]:
split_weights = [0.2] * 5

In [34]:
split_weights

[0.2, 0.2, 0.2, 0.2, 0.2]

In [35]:
df_bot_splits = df_bot.randomSplit(split_weights)

In [36]:
counts = [df.count() for df in df_bot_splits]

In [37]:
counts

[21, 28, 18, 13, 20]

In [28]:
sum(counts)

100

In [29]:
df_bot_splits[0].show(5)

+------------------+--------------------+--------------------+--------+---------+---------------------+-------------------+-----------------------+-------------------+----+-----+------------+-------------+-----------+--------------+---------+---------+------------------+------------+--------+------------+--------------------+-------------------+-------------------+-------------------+
|                id|                text|              source| user_id|truncated|in_reply_to_status_id|in_reply_to_user_id|in_reply_to_screen_name|retweeted_status_id| geo|place|contributors|retweet_count|reply_count|favorite_count|favorited|retweeted|possibly_sensitive|num_hashtags|num_urls|num_mentions|          created_at|          timestamp|         crawled_at|            updated|
+------------------+--------------------+--------------------+--------+---------+---------------------+-------------------+-----------------------+-------------------+----+-----+------------+-------------+-----------+-------

In [38]:
# df_subs = np.array_split(df_bot, 3)