In [13]:
#libraries / dependencies
import glob
from preprocessor import api as tweet_preprocessor

import numpy as np
import math
import random

from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext

from pyspark.sql.functions import col, udf, to_timestamp, lit, to_timestamp, when, rand
from pyspark.sql.types import IntegerType, LongType, DoubleType, StringType, ArrayType
from pyspark.ml.feature import Normalizer, StandardScaler, MinMaxScaler, VectorAssembler

import tensorflow as tf
from tensorflow.keras import Input
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dense, Input, concatenate, Dropout, Activation, BatchNormalization
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import SGD, Adam

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.losses import MeanSquaredError, CategoricalCrossentropy

import findspark

findspark.init()
findspark.find()


'C:\\Users\\USER\\AppData\\Local\\Programs\\Python\\Python310\\Lib\\site-packages\\pyspark'

In [14]:
#Dataset location

#Local
bot_tweets_dataset_path = 'F://TwitterBotDataset//tweet_dataset_full//bot_tweets//'
genuine_tweets_dataset_path = 'F://TwitterBotDataset//tweet_dataset_full//genuine_tweets//'

#S3
# bot_tweets_dataset_path = 's3://tweet-dataset-small/bot_tweets' #'F://TwitterBotDataset//tweet_dataset_small//bot_tweets//'
# genuine_tweets_dataset_path = 's3://tweet-dataset-small/genuine_tweets' #'F://TwitterBotDataset//tweet_dataset_small//genuine_tweets//'

#turn a line of text into d dimentional vector 
GLOVE_DIR = ""

#all columns
BOT_COLUMNS = ['id','text','source','user_id','truncated','in_reply_to_status_id', 
               'in_reply_to_user_id','in_reply_to_screen_name', 'retweeted_status_id',
               'geo','place','contributors','retweet_count', 'reply_count','favorite_count',
               'favorited', 'retweeted','possibly_sensitive','num_hashtags','num_urls',
               'num_mentions','created_at','timestamp','crawled_at', 'updated']

GENUINE_COLUMNS = ['id','text','source','user_id','truncated','in_reply_to_status_id', 
                   'in_reply_to_user_id','in_reply_to_screen_name', 'retweeted_status_id',
                   'geo','place','contributors','retweet_count', 'reply_count','favorite_count',
                   'favorited', 'retweeted','possibly_sensitive','num_hashtags','num_urls',
                   'num_mentions','REMOVE_IT', 'created_at','timestamp','crawled_at', 'updated',]

#feature used for bot detection
COLUMN_NAMES = ['text', 'retweet_count', 'reply_count', 'favorite_count',
                'num_hashtags', 'num_urls', 'num_mentions']


In [15]:
# #configure spark
# conf = SparkConf()
# conf.setMaster("local[8]").setAppName("ml_account_ base_session")
# conf.set("spark.executor.instances", 4)
# conf.set("spark.executor.cores", 4)
# conf.set("spark.driver.memory", 4)
# sc = SparkContext(conf=conf)

# # init spark, configure spark
# spark = SparkSession.builder.getOrCreate()
# spark

spark = SparkSession.builder.appName('preprocessing_dataset_session').getOrCreate()
spark

In [16]:

# read dataset from csv
bot_tweets = spark.read.csv(bot_tweets_dataset_path, header = True, inferSchema = True)
genuine_tweets = spark.read.csv(genuine_tweets_dataset_path, header = True, inferSchema = True)


In [17]:
bot_tweets.head()

Row(id=532627591686275072, text='I Pooh - In silenzio 1968 http://t.co/ahvQxUqTws', source='"<a href=""http://www.facebook.com/twitter"" rel=""nofollow"">Facebook</a>"', user_id='24858289', truncated=None, in_reply_to_status_id='0', in_reply_to_user_id='0', in_reply_to_screen_name=None, retweeted_status_id='0', geo=None, place=None, contributors=None, retweet_count='0', reply_count='0', favorite_count=0, favorited=None, retweeted=None, possibly_sensitive=None, num_hashtags='0', num_urls='1', num_mentions='0', created_at='Wed Nov 12 20:14:48 +0000 2014', timestamp='2014-11-12 21:14:48', crawled_at='2014-11-12 21:44:09', updated='2014-11-12 21:44:09')

In [18]:
genuine_tweets.head()

Row(593932392663912449='593895316719423488', RT @morningJewshow: Speaking about Jews and comedy tonight at Temple Emanu-El in San Francisco. In other words, my High Holidays.="This age/face recognition thing..no reason platforms can't have changing avatars of our actual faces to increase affect/better communication", <a href="http://tapbots.com/tweetbot" rel="nofollow">Tweetbot for iΟS</a>='<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>', 678033='678033', _c4=None, 05='0', 06='0', _c7=None, 593932168524533760='0', \N9='\\N', \N10='\\N', _c11=None, _c12=None, 113='0', 014='0', 015='0', _c16=None, _c17=None, \N18='\\N', 019='0', 020='0', 121='0', Fri May 01 00:18:11 +0000 2015='Thu Apr 30 21:50:52 +0000 2015', 2015-05-01 02:18:11='2015-04-30 23:50:52', 2015-05-01 12:57:1924='2015-05-01 12:57:19', 2015-05-01 12:57:1925='2015-05-01 12:57:19')

In [19]:
bot_tweets.columns

['id',
 'text',
 'source',
 'user_id',
 'truncated',
 'in_reply_to_status_id',
 'in_reply_to_user_id',
 'in_reply_to_screen_name',
 'retweeted_status_id',
 'geo',
 'place',
 'contributors',
 'retweet_count',
 'reply_count',
 'favorite_count',
 'favorited',
 'retweeted',
 'possibly_sensitive',
 'num_hashtags',
 'num_urls',
 'num_mentions',
 'created_at',
 'timestamp',
 'crawled_at',
 'updated']

In [21]:
genuine_tweets.columns

['593932392663912449',
 'RT @morningJewshow: Speaking about Jews and comedy tonight at Temple Emanu-El in San Francisco. In other words, my High Holidays.',
 '<a href="http://tapbots.com/tweetbot" rel="nofollow">Tweetbot for iΟS</a>',
 '678033',
 '_c4',
 '05',
 '06',
 '_c7',
 '593932168524533760',
 '\\N9',
 '\\N10',
 '_c11',
 '_c12',
 '113',
 '014',
 '015',
 '_c16',
 '_c17',
 '\\N18',
 '019',
 '020',
 '121',
 'Fri May 01 00:18:11 +0000 2015',
 '2015-05-01 02:18:11',
 '2015-05-01 12:57:1924',
 '2015-05-01 12:57:1925']

In [22]:
len(bot_tweets.columns), len(genuine_tweets.columns)

(25, 26)

In [27]:
genuine_tweets.show(5)

+------------------+---------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------+------+----+----+----+----+------------------+----+----+----+----+----+----+----+----+----+----+----+----+--------------------+------------------------------+-------------------+---------------------+---------------------+
|593932392663912449|RT @morningJewshow: Speaking about Jews and comedy tonight at Temple Emanu-El in San Francisco. In other words, my High Holidays.|<a href="http://tapbots.com/tweetbot" rel="nofollow">Tweetbot for iΟS</a>|678033| _c4|  05|  06| _c7|593932168524533760| \N9|\N10|_c11|_c12| 113| 014| 015|_c16|_c17|\N18| 019| 020|                 121|Fri May 01 00:18:11 +0000 2015|2015-05-01 02:18:11|2015-05-01 12:57:1924|2015-05-01 12:57:1925|
+------------------+--------------------------------------------------------------------------------------

In [28]:
bot_tweets.show(10)

+------------------+--------------------+--------------------+--------+---------+---------------------+-------------------+-----------------------+-------------------+----+-----+------------+-------------+-----------+--------------+---------+---------+------------------+------------+--------+------------+--------------------+-------------------+-------------------+-------------------+
|                id|                text|              source| user_id|truncated|in_reply_to_status_id|in_reply_to_user_id|in_reply_to_screen_name|retweeted_status_id| geo|place|contributors|retweet_count|reply_count|favorite_count|favorited|retweeted|possibly_sensitive|num_hashtags|num_urls|num_mentions|          created_at|          timestamp|         crawled_at|            updated|
+------------------+--------------------+--------------------+--------+---------+---------------------+-------------------+-----------------------+-------------------+----+-----+------------+-------------+-----------+-------

In [29]:
bot_tweets.count()

3457133

In [30]:
genuine_tweets.count()

9401766