In [45]:
#libraries / dependencies
import glob
from preprocessor import api as tweet_preprocessor

import numpy as np
import math
import random

from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext

from pyspark.sql.functions import col, udf, to_timestamp, lit, to_timestamp, when, rand
from pyspark.sql.types import IntegerType, LongType, DoubleType, StringType, ArrayType
from pyspark.ml.feature import Normalizer, StandardScaler, MinMaxScaler, VectorAssembler

from pyspark import StorageLevel
from pyspark.accumulators import AccumulatorParam

import tensorflow as tf
from tensorflow.keras import Input
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dense, Input, concatenate, Dropout, Activation, BatchNormalization
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import SGD, Adam

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.losses import MeanSquaredError, CategoricalCrossentropy


#thread depencency
from pyspark import InheritableThread

import findspark

findspark.init()
findspark.find()



# initialize the number of epochs to train for, batch size, and
# initial learning rate
EPOCHS = 25
BS = 64
INIT_LR = 1e-3

mse = MeanSquaredError()
cce = CategoricalCrossentropy()
opt = Adam(learning_rate=INIT_LR)



#Dataset location

#Local
bot_tweets_dataset_path = 'F://TwitterBotDataset//tweet_dataset_full//bot_tweets//'
genuine_tweets_dataset_path = 'F://TwitterBotDataset//tweet_dataset_full//genuine_tweets//'

#S3
# bot_tweets_dataset_path = 's3://tweet-dataset/bot_tweets' #'F://TwitterBotDataset//tweet_dataset_small//bot_tweets//'
# genuine_tweets_dataset_path = 's3://tweet-dataset/genuine_tweets' #'F://TwitterBotDataset//tweet_dataset_small//genuine_tweets//'

#turn a line of text into d dimentional vector 
GLOVE_DIR = ""
grad_fname = 'grads.npy'

#all columns
BOT_COLUMNS = ['id','text','source','user_id','truncated','in_reply_to_status_id', 
               'in_reply_to_user_id','in_reply_to_screen_name', 'retweeted_status_id',
               'geo','place','contributors','retweet_count', 'reply_count','favorite_count',
               'favorited', 'retweeted','possibly_sensitive','num_hashtags','num_urls',
               'num_mentions','created_at','timestamp','crawled_at', 'updated']

GENUINE_COLUMNS = ['id','text','source','user_id','truncated','in_reply_to_status_id', 
                   'in_reply_to_user_id','in_reply_to_screen_name', 'retweeted_status_id',
                   'geo','place','contributors','retweet_count', 'reply_count','favorite_count',
                   'favorited', 'retweeted','possibly_sensitive','num_hashtags','num_urls',
                   'num_mentions','REMOVE_IT', 'created_at','timestamp','crawled_at', 'updated',]

#feature used for bot detection
COLUMN_NAMES = ['text', 'retweet_count', 'reply_count', 'favorite_count',
                'num_hashtags', 'num_urls', 'num_mentions']


# #configure spark
# conf = SparkConf()
# conf.setMaster("local[8]").setAppName("ml_account_ base_session")
# conf.set("spark.executor.instances", 4)
# conf.set("spark.executor.cores", 4)
# conf.set("spark.driver.memory", 4)
# sc = SparkContext(conf=conf)

# # for spark-submit
# spark = SparkSession.builder.appName('ml_account_ base_session').getOrCreate()
# spark

# for local build
# spark = SparkSession.builder.appName('ml_account_ base_session').getOrCreate()


#for local multi thread
conf = SparkConf()
conf.setMaster("local[10]").setAppName("distributed_training_session")
sc = SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

# read dataset from csv
def read_dataset():
    bot_tweets = spark.read.csv(bot_tweets_dataset_path, header = True, inferSchema = True).limit(1000)
    genuine_tweets = spark.read.csv(genuine_tweets_dataset_path, header = True, inferSchema = True).limit(1000)
    
    #df = inputDF.persist(pyspark.StorageLevel.MEMORY_AND_DISK)
#     bot_tweets = bot_tweets.persist(StorageLevel.MEMORY_ONLY)
#     genuine_tweets = genuine_tweets.persist(StorageLevel.MEMORY_ONLY)
    
#     print(len(bot_tweets.collect()), len(genuine_tweets.collect()))
    return bot_tweets, genuine_tweets

def set_column_name(df, column_names):
    df = df.toDF(*column_names)
#     print(len(df.collect()))
    return df

def remove_column_miss_match(df):
    ## dataset have diffrent number of columns
    ## column name of dataframe
    column_name = [cname for cname, tp in df.dtypes]
#     len(df.collect()), len(df.dtypes)
    #column_name

    #Number of column is diffrent for bot and genuine tweets data

    #genuine_tweets_df = genuine_tweets_df.toDF(*column_name)
    df = set_column_name(df, GENUINE_COLUMNS)
#     print(len(df.collect()))
    
    df = df.drop('REMOVE_IT') # remove 5th column from end
    #update column name according to 
    df = set_column_name(df, BOT_COLUMNS)
#     print(len(df.collect()))
    return df


def remove_type_miss_match(df):
    # Same column has diffrent data type. So make data type same for every column
    genuine_tweets_df = df.withColumn("id",col("id").cast(IntegerType())) \
                                    .withColumn("favorite_count",col("favorite_count").cast(LongType())) \
                                    .withColumn("favorited",col("favorited").cast(IntegerType()))
    return df


def resize_combine_data(bot_tweets_df, genuine_tweets_df):
    ## only keep the required column from the dataframe
    bot_tweets_df = bot_tweets_df.select(*COLUMN_NAMES)
    genuine_tweets_df = genuine_tweets_df.select(*COLUMN_NAMES)
    
#     print(len(bot_tweets_df.collect()), len(genuine_tweets_df.collect()))

    ## add BotOrNot column
    bot_tweets_df = bot_tweets_df.withColumn('BotOrNot', lit(1))
    genuine_tweets_df = genuine_tweets_df.withColumn('BotOrNot', lit(0))

    #combine clean and bot accounts data togather
    tweets_df = bot_tweets_df.union(genuine_tweets_df)

    # shuffle dataset
    tweets_df = tweets_df.orderBy(rand())

#     print(len(tweets_df.collect()))
    
    return tweets_df

text_process_udf = udf(lambda x : tweet_preprocessor.tokenize(x), StringType())
def preprocess_data(df):
    df = df.withColumn('text', text_process_udf(df.text))
    df = df.withColumn("retweet_count",col("retweet_count").cast(DoubleType()))
    df = df.withColumn("reply_count",col("reply_count").cast(DoubleType()))
    df = df.withColumn("favorite_count",col("favorite_count").cast(DoubleType()))
    df = df.withColumn("num_hashtags",col("num_hashtags").cast(DoubleType()))
    df = df.withColumn("num_urls",col("num_urls").cast(DoubleType()))
    df = df.withColumn("num_mentions",col("num_mentions").cast(DoubleType()))
    
    return df


def makeGloveWordEmbedder(glove_path):
    embedding_dict = {}
    with open(glove_path, 'r', encoding="utf-8") as glove_file:
        for line in glove_file:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], "float32")
            embedding_dict[word] = vector
            
    return embedding_dict  


# # Test GLoVE result
# glove_word2vec_embedder["google"]

 ##create word embedding GLoVE model dictionary. Use pre trained model
text_feature_dimention = 25
# glove_word2vec_embedder = makeGloveWordEmbedder(GLOVE_DIR + "glove.twitter.27B.25d.txt")


#Give a word and get that word representing feature vector of dimention 25 from embedding dictionary
def word2vec(word_dict=None, word=None, dim=25):
    default_vector = np.zeros(dim)
    
    if word_dict is None or word is None:
        return default_vector
    
    word_vector = word_dict.get(word)
    
    if word_vector is None:
        return default_vector
    return word_vector

# # test a word representing feature vector
# word_vector = word2vec(glove_word2vec_embedder, "tweet", text_feature_dimention)
# print(type(word_vector), word_vector)


#----------------LSTM Model----------------------

#create a 1 layer LSTM model
#input dimention: 3d (1,7,25) input_sample = [[[1,...,25]],..,[1,...,25]]
#output dimention: 1d (1,32) output_sample [[1,2,3,,,,32]]
def lstm_model(output_dim):
    model = LSTM(output_dim, return_sequences=False, return_state=False)
    return model

def reset_lstm_model(model):
    model.resate_states() # stateful=True is required for reset states

# create LSTM model of output dimention 32. Model output feature vector will be of 32 dimention vector
lstm = lstm_model(32) 

# convver a sentence to a feature vector using LSTM(RNN) model
def sent2vec(sent):
    words = sent.split(' ')
    word_vectors = np.array([])
    count = 0;
    for word in words:
        word_vector = word2vec(broadcast_glove_dict.value, word)
#         print("word dim: {}".format(len(word_vector)))
        if word_vectors.size == 0:
            word_vectors = np.array([word_vector])
        else:
            word_vectors = np.vstack([word_vectors, word_vector])
        count = count + 1
    
#     print("Input feature vector shape before reshape(2D): {}".format(word_vectors.shape))
        
    input_feature_vectors = np.reshape(word_vectors, (1, count, text_feature_dimention))
#     print("Input feature vector shape after reshape(3d): {}".format(input_feature_vectors.shape))
#     print("LSTM requirs 3d shape inputs [batch, timesteps, feature]")
    output_vector = lstm(input_feature_vectors)
#     lstm.reset_states() # stateful = True is required for reset

#     print("result vector shape: {}".format(output_vector.shape))
#     print("Last input was: {}".format(input_feature_vectors[0][-1]))
#     print("output result: {}".format(output_vector))
    
    # (tensore --> numpy 0bject --> numpy.array --> array/list/ArrayType)
    return output_vector.numpy()[0].tolist() 
    
## For Testing sentence to vector convertion
# sent = "Twitter is a large social media network"
# res_vector = sent2vec(sent)
# type(res_vector), res_vector


# text string --> vector 32 dimention
sent_to_vector_udf = udf(lambda x : sent2vec(x), ArrayType(DoubleType()))
def processTextColumn(df, column_name, new_column_name):
    df = df.withColumn(new_column_name, sent_to_vector_udf(col(column_name)))
    return df

def sentEmbeddingGLoVE_LSTM(df):
    
    text_updated_column = 'text_features'
    updated_df = processTextColumn(df, "text", text_updated_column)

#     print(len(updated_df.collect()), type(updated_df), updated_df.printSchema()) 
    
    return updated_df



def assembleColumns(tweets_df):
    columns = ['retweet_count', 'reply_count', 'favorite_count',
               'num_hashtags' ,'num_urls', 'num_mentions', 'BotOrNot']

    tweets_df = tweets_df.select(*columns, 
                          tweets_df.text_features[0], tweets_df.text_features[1], tweets_df.text_features[2],tweets_df.text_features[3], tweets_df.text_features[4],
                          tweets_df.text_features[5], tweets_df.text_features[6], tweets_df.text_features[7],tweets_df.text_features[8], tweets_df.text_features[9], 
                          tweets_df.text_features[10], tweets_df.text_features[11], tweets_df.text_features[12],tweets_df.text_features[13], tweets_df.text_features[14],
                          tweets_df.text_features[15], tweets_df.text_features[16], tweets_df.text_features[17],tweets_df.text_features[18], tweets_df.text_features[19],
                          tweets_df.text_features[20], tweets_df.text_features[21], tweets_df.text_features[22],tweets_df.text_features[23], tweets_df.text_features[24],
                          tweets_df.text_features[25], tweets_df.text_features[26], tweets_df.text_features[27],tweets_df.text_features[28], tweets_df.text_features[29],
                          tweets_df.text_features[30], tweets_df.text_features[31])


#     print(tweets_df.columns, len(tweets_df.collect()), tweets_df.printSchema())

    #remove 

    feature_columns = ['retweet_count','reply_count','favorite_count','num_hashtags','num_urls','num_mentions',
                       'text_features[0]','text_features[1]', 'text_features[2]','text_features[3]','text_features[4]',
                       'text_features[5]','text_features[6]','text_features[7]', 'text_features[8]','text_features[9]',
                       'text_features[10]','text_features[11]','text_features[12]','text_features[13]','text_features[14]',
                       'text_features[15]','text_features[16]','text_features[17]','text_features[18]','text_features[19]',
                       'text_features[20]','text_features[21]','text_features[22]', 'text_features[23]', 'text_features[24]',
                       'text_features[25]','text_features[26]','text_features[27]', 'text_features[28]', 'text_features[29]',
                       'text_features[30]','text_features[31]']


    tweets_df = tweets_df.na.fill(value=0.0 ,subset= feature_columns)
    feature_assembler = VectorAssembler(inputCols = feature_columns, outputCol = 'independent_features')

    tweets_updated_df = feature_assembler.transform(tweets_df)

    #check
#     num = len(tweets_updated_df.collect())
#     print(num, type(tweets_updated_df), tweets_updated_df.printSchema())

    #remove unnecessary columns
    tweets_updated_df = tweets_updated_df.drop(*feature_columns)
    
    return tweets_updated_df


def to_nparray_list(df, column_name):
    rows = df.select(column_name).collect()
    lists = [x[column_name] for x in rows]
    nparr = np.array(lists)
    
    return nparr

def to_nparray_dataset(df, feature_column, target_column):
#     list(df.select('col_name').toPandas()['col_name']) 
#     feature = list(df.select(feature_column).toPandas()[feature_column])
#     target = list(df.select(target_column).toPandas()[target_column])
    feature = [row[0] for row in list(df.select(feature_column).toLocalIterator())]
    target = [row[0] for row in list(df.select(target_column).toLocalIterator())]
        
    return np.array(feature), np.array(target)    

def partition_dataset(df):
    train_df, test_df = df.randomSplit([0.80, 0.20])
#     print(len(train_df.collect()), len(test_df.collect()))

    # features --> 'BotOrNot'
#     X_train = train_df.drop('BotOrNot')
#     y_train = train_df.select('BotOrNot')
#     X_test = test_df.drop('BotOrNot')
#     y_test = test_df.select('BotOrNot')
    

    #checkpoint
#     print(len(X_train.collect()), len(y_train.collect()))
#     print(len(X_test.collect()), len(y_test.collect()))

#     X_train = to_nparray_list(X_train, 'independent_features')
#     y_train = to_nparray_list(y_train, 'BotOrNot')
#     X_test = to_nparray_list(X_test, 'independent_features')
#     y_test = to_nparray_list(y_test, 'BotOrNot')
    train_df = train_df.cache()
    test_df = test_df.cache()

    train_X,  train_Y = to_nparray_dataset(train_df, 'independent_features', 'BotOrNot')
    test_X, test_Y = to_nparray_dataset(test_df, 'independent_features', 'BotOrNot')

    
    return train_X, train_Y, test_X, test_Y # return type: numpy.array


def getTrainTestData(df, seed = 21):
    train_X, test_X = df.randomSplit([0.7, 0.3], seed)
    return train_X, test_X
    

'''
def distributedTrainingGradients(df, feature_column, target_column, n_splits):
    print(df.count())
    each_len = df.count() // n_splits
    gradients = []
    ##split dataset into 'n_splits' part
    copy_df = df
    for i in range(n_splits):
        print(i)
        temp_df = copy_df.limit(each_len)
        copy_df = copy_df.subtract(temp_df)
        
        X = temp_df.select(feature_column)
        Y = temp_df.select(target_column)
        X_np = to_nparray_list(X, feature_column)
        Y_np = to_nparray_list(Y, target_column)
        
        grad = step(X_np, Y_np)
        gradients.append(grad)
        print(temp_df.count())
        
    return gradients
'''

def generateGradient(X, Y, bw0, grads):
    gd = step(X, Y, bw0, grads)
    return gd

def step(X, Y, bw0, grads):
    print("Input count: {}, {}".format(len(X), len(Y)))
    #keep track of gradients
    
    
    
    curr_model = get2DenseLayeredModel(38)
    #apply previous training gradient
    if bw0 is not None:
        curr_model.set_weights(bw0)
    
    if grads is not None:
        opt.apply_gradients(zip(grads, curr_model.trainable_variables))
    
    # gradienttape autometically watch trainable_variable
    # curr_model.trainable_variables
    # no need for tape.watch(curr_model.trainable_variables)
    
    with tf.GradientTape() as tape:    
        #make a prediction using model
        predict = curr_model(X)
        #calculate loss
        loss = mse(Y, predict)
    #calculate the gradient
    gd = tape.gradient(loss, curr_model.trainable_variables)
    
    # return the gradient to train final model
    return gd



'''
def stepEPOCH(X, y):
    with tf.GradientTape() as tape:
        curr_model = getDLModel()
        for i in range(EPOCHS):
            #make a prediction using model
            predict = curr_model(X)
            #calculate loss
            loss = cce(y, predict)
            print("{}: {}".format(i, loss))
            opt.apply_gradients(zip(grad, curr_model.trainable_variables))
            
    gradient = tape.gradient(loss, curr_model.trainable_variables)
    
    return gradient

'''
    

## create model
def get2DenseLayeredModel(input_dim):
    model = Sequential()
    model.add(Dense(500, input_dim=input_dim))
    model.add(Activation('relu'))
    model.add(Dropout(0.2))
    model.add(Dense(200))
    model.add(Activation('relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
    
    return model

def model_evaluation(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train,
              batch_size=64,
              epochs=10,
              validation_data=(X_test, y_test))
    score, acc = model.evaluate(X_test, y_test, verbose=0)
    print('Test accuracy:', acc)

    
def removeExtraColumn(df, column_names):
    if len(df.columns) == 26:
        df = remove_column_miss_match(df)
    else:
        df = set_column_name(df, column_names)
    
    return df
    
def worker_task_eval(bot_tweets_df, genuine_tweets_df):
   #solve column number issue
    bot_tweets_df = removeExtraColumn(bot_tweets_df, BOT_COLUMNS)
    genuine_tweets_df = removeExtraColumn(genuine_tweets_df, BOT_COLUMNS)
    
    
    bot_tweets_df = remove_type_miss_match(bot_tweets_df)
    genuine_tweets_df = remove_type_miss_match(genuine_tweets_df)
    
#     print(len(bot_tweets_df.collect()), len(genuine_tweets_df.collect()))
    
    ##preprocess data
    tweets_df = resize_combine_data(bot_tweets_df, genuine_tweets_df)
    tweets_df = preprocess_data(tweets_df)
    
#     print(len(tweets_df.collect()))
#     print(tweets_df.columns)
    
    ##text embedding using GLoVE & LSTM
    ## Word Embedding
    tweets_df = sentEmbeddingGLoVE_LSTM(tweets_df)
    
    ## Assable multiple colu,ms to create feature vector
    tweets_updated_df = assembleColumns(tweets_df)
#     print(len(tweets_updated_df.collect()), tweets_updated_df.columns)

#     tweets_updated_df = tweets_updated_df.cache()
    
    X_test, Y_test = to_nparray_dataset(tweets_updated_df, 'independent_features', 'BotOrNot')
    
    model = get2DenseLayeredModel(38)
    
    bw0 = broadcast_w0.value
    grads = np.load(grad_fname)
    grads = tf.convert_to_tensor(grads, dtype='float32')
    
    opt.apply_gradients(zip(grads, model.trainable_variables))
        
        
    # in order to calculate accuracy using Keras' functions we first need
    # to compile the model
    model.compile(optimizer= opt, loss=cce, metrics=["acc"])
    
    
    # now that the model is compiled we can compute the accuracy
    (loss, acc) = model.evaluate(X_test, Y_test)
    print("[INFO] test accuracy: {}".format(acc))
    print("[INFO] test loss: {}".format(loss))
    
    

def worker_task(bot_tweets_df, genuine_tweets_df):
#     #cache df
    bot_tweets_df = bot_tweets_df.cache()
    genuine_tweets_df = genuine_tweets_df.cache()
    
    print("#bot_tweets: {} #gen_tweets: {}".format(bot_tweets_df.count(), genuine_tweets_df.count()))
    
    ##clean data / remove unwanted column
#     if len(bot_tweets_df.columns) == 26:
#         bot_tweets_df = remove_column_miss_match(bot_tweets_df)
#     else:
#         bot_tweets_df = set_column_name(bot_tweets_df, BOT_COLUMNS)
        
#     if len(genuine_tweets_df.columns) == 26:
#         genuine_tweets_df = remove_column_miss_match(genuine_tweets_df)
#     else:
#         genuine_tweets_df = set_column_name(genuine_tweets_df, BOT_COLUMNS)

    #solve column number issue
    bot_tweets_df = removeExtraColumn(bot_tweets_df, BOT_COLUMNS)
    genuine_tweets_df = removeExtraColumn(genuine_tweets_df, BOT_COLUMNS)
    
    
    bot_tweets_df = remove_type_miss_match(bot_tweets_df)
    genuine_tweets_df = remove_type_miss_match(genuine_tweets_df)
    
#     print(len(bot_tweets_df.collect()), len(genuine_tweets_df.collect()))
    
    ##preprocess data
    tweets_df = resize_combine_data(bot_tweets_df, genuine_tweets_df)
    tweets_df = preprocess_data(tweets_df)
    
#     print(len(tweets_df.collect()))
#     print(tweets_df.columns)
    
    ##text embedding using GLoVE & LSTM
    ## Word Embedding
    tweets_df = sentEmbeddingGLoVE_LSTM(tweets_df)
    
    ## Assable multiple colu,ms to create feature vector
    tweets_updated_df = assembleColumns(tweets_df)
#     print(len(tweets_updated_df.collect()), tweets_updated_df.columns)

#     tweets_updated_df = tweets_updated_df.cache()
    
    X, Y = to_nparray_dataset(tweets_updated_df, 'independent_features', 'BotOrNot')
    
    grads = None
    # need more testing
    if os.path.isfile(grad_fname):
        grads = np.load(grad_fname) # load gds from s3/hdfs/file
        grads = tf.convert_to_tensor(grads, dtype='float32')
    print("GD tensor: {}".format(grads))
    
    bw0 = broadcast_w0.value
    curr_gd = generateGradient(X, Y, bw0, grads)
    print("curr_gd: {}".format(curr_gd))
    
    if grads is not None:
        grads = grads + curr_gd
    else:
        grads = curr_gd
    
    
    # ------- Need to save gradient sum in hdfs/s3 files----------
    
    np.save(grad_fname, tf.make_ndarray(grads))
#     print("Curr gd: {}".format(curr_gd))
    print(":: OK")


# distributed training / adjustment of weights
def getAdjustedWeights(weights = None, gradient = None):
    if (weights is None):
        model = get2DenseLayeredModel(38)
        return model.get_weights()
    elif (gradient is None):
        return weights
    else:
        model = get2DenseLayeredModel(38)
        model.set_weights(weights)
        opt.apply_gradients(zip(gradient, model.trainable_variables))
        
        return model.get_weights()
    

glove_dict = makeGloveWordEmbedder(GLOVE_DIR + "glove.twitter.27B.25d.txt")
broadcast_glove_dict = sc.broadcast(glove_dict)

w0 = getAdjustedWeights(None, None)
broadcast_w0 = sc.broadcast(w0)



# broadcast glove word wmbedder to all task
def broadcastData():
    print("broadcast glove")
    glove_dict = makeGloveWordEmbedder(GLOVE_DIR + "glove.twitter.27B.25d.txt")
    broadcast_glove_dict = sc.broadcast(glove_dict)


    
    
    
def ApplicationJob():
    
#     broadcastData()
#     accumulateData()
    
    bot_tweets_df, genuine_tweets_df = read_dataset()
    
    train_bot_tweet_df, test_bot_tweet_df = bot_tweets_df.randomSplit([0.8, 0.2], seed = 21)
    train_genuine_tweets_df, test_genuine_tweets_df = genuine_tweets_df.randomSplit([0.8, 0.2], seed = 21)
    
#     broadcastGloveDict()
    # split dataset for parallel data training
    num_of_thread = 10
    split_weight = 1.0 / num_of_thread
    split_weights = [split_weight] * num_of_thread
    bot_dfs = train_bot_tweet_df.randomSplit(split_weights, seed = 71)
    gen_dfs = train_genuine_tweets_df.randomSplit(split_weights, seed = 71)
    
    model_weights = getAdjustedWeights(None, None)
    ## run a task for each small model training
    for idx in range(num_of_thread):
        thread = InheritableThread(target = worker_task, kwargs={'bot_tweets_df': bot_dfs[idx],
                                                                 'genuine_tweets_df': gen_dfs[idx]})
        thread.start()
        thread.join()
        
    
    ## single worker or multiple worker
    
    # testing model
#     worker_task_eval(bot_tweets_df, genuine_tweets_df, gradient)
    thread = InheritableThread(target = worker_task_eval, kwargs={'bot_tweets_df': test_bot_tweet_df, 
                                                                  'genuine_tweets_df': test_genuine_tweets_df})
    thread.start()
    thread.join()
    
    
    

if __name__ == '__main__':
    ##load dara
    ApplicationJob()
    spark.stop()

#bot_tweets: 81 #gen_tweets: 81
GD tensor: None
Input count: 162, 162
curr_gd: [<tf.Tensor: shape=(38, 500), dtype=float32, numpy=
array([[ 4.76594187e-06,  4.98153076e-06, -1.76889932e-06, ...,
         2.56352678e-07,  0.00000000e+00, -1.14506274e-05],
       [-1.49151529e-04,  3.57440033e-04, -3.30246607e-04, ...,
        -3.60162107e-06,  7.77448295e-04,  0.00000000e+00],
       [ 2.12245982e-06,  2.26765187e-05,  0.00000000e+00, ...,
        -2.55766681e-05,  0.00000000e+00, -2.42143469e-05],
       ...,
       [-3.40508063e-06,  1.74718371e-05, -1.14426075e-05, ...,
        -3.49522452e-06,  1.01266523e-05, -1.58255370e-05],
       [-1.10022472e-06,  5.58906640e-06, -3.06445918e-06, ...,
        -1.97735517e-06,  2.99033559e-07, -3.53108726e-06],
       [ 4.97679684e-06, -5.56174055e-05,  4.07018633e-05, ...,
         5.96805421e-06, -3.67720204e-05,  3.20848412e-05]], dtype=float32)>, <tf.Tensor: shape=(500,), dtype=float32, numpy=
array([-2.71071858e-05,  3.04609217e-04, -1.313

Exception in thread Thread-57 (copy_local_properties):
Traceback (most recent call last):
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\site-packages\pyspark\util.py", line 377, in copy_local_properties
    return target(*a, **k)
  File "C:\Users\USER\AppData\Local\Temp\ipykernel_7644\3473066317.py", line 612, in worker_task
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\site-packages\tensorflow\python\framework\tensor_util.py", line 592, in MakeNdarray
    shape = [d.size for d in tensor.tensor_shape.dim]
AttributeError: 'list' object has no attribute 'tensor_shape'


#bot_tweets: 64 #gen_tweets: 64


Exception in thread Thread-58 (copy_local_properties):
Traceback (most recent call last):
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\site-packages\pyspark\util.py", line 377, in copy_local_properties
    return target(*a, **k)
  File "C:\Users\USER\AppData\Local\Temp\ipykernel_7644\3473066317.py", line 612, in worker_task
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\site-packages\tensorflow\python\framework\tensor_util.py", line 592, in MakeNdarray
    shape = [d.size for d in tensor.tensor_shape.dim]
AttributeError: 'list' object has no attribute 'tensor_shape'


GD tensor: None
Input count: 128, 128
curr_gd: [<tf.Tensor: shape=(38, 500), dtype=float32, numpy=
array([[-4.4989683e-06,  1.7703818e-05,  7.4076364e-05, ...,
        -5.0126760e-05,  0.0000000e+00, -7.5225049e-04],
       [-2.0141089e-04,  3.4414581e-04,  0.0000000e+00, ...,
         0.0000000e+00,  4.3374475e-04,  0.0000000e+00],
       [-4.4989683e-06,  1.7703818e-05,  0.0000000e+00, ...,
        -1.1793269e-05,  0.0000000e+00,  2.2047711e-06],
       ...,
       [-1.1805354e-05,  1.5994126e-05,  1.0834690e-07, ...,
         9.6129486e-07,  1.8036873e-05,  1.2445710e-06],
       [ 4.5775059e-06, -8.6654964e-06,  7.0562223e-08, ...,
        -1.9031887e-06, -9.3746830e-06,  3.5792702e-06],
       [ 1.7564291e-05, -2.3624052e-05, -1.1672505e-07, ...,
        -1.3402176e-06, -2.7191698e-05, -8.9158884e-06]], dtype=float32)>, <tf.Tensor: shape=(500,), dtype=float32, numpy=
array([-7.68137106e-05,  3.12883640e-04,  4.16016019e-06, -3.10893462e-04,
       -1.08869615e-04,  5.43814269e-04,

#bot_tweets: 78 #gen_tweets: 78


Exception in thread Thread-59 (copy_local_properties):
Traceback (most recent call last):
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\site-packages\pyspark\util.py", line 377, in copy_local_properties
    return target(*a, **k)
  File "C:\Users\USER\AppData\Local\Temp\ipykernel_7644\3473066317.py", line 612, in worker_task
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\site-packages\tensorflow\python\framework\tensor_util.py", line 592, in MakeNdarray
    shape = [d.size for d in tensor.tensor_shape.dim]
AttributeError: 'list' object has no attribute 'tensor_shape'


GD tensor: None
Input count: 156, 156
curr_gd: [<tf.Tensor: shape=(38, 500), dtype=float32, numpy=
array([[-8.56375573e-06,  1.03017746e-05, -1.79427261e-05, ...,
        -4.62150929e-05,  0.00000000e+00, -3.30627008e-05],
       [-7.95873784e-05, -4.59233299e-04, -3.07549373e-04, ...,
        -2.24156020e-06,  4.80326766e-04,  0.00000000e+00],
       [ 3.06590700e-05,  1.45430618e-04,  0.00000000e+00, ...,
        -7.14981288e-05,  0.00000000e+00, -7.04309641e-05],
       ...,
       [-1.13866545e-05, -1.32102605e-05, -1.83975826e-05, ...,
        -1.68446331e-05,  2.76272513e-05, -4.58842296e-05],
       [-1.25401675e-06, -1.50862352e-05, -1.44621272e-05, ...,
         8.46962757e-06,  1.84653964e-05,  1.60831514e-05],
       [ 4.20619926e-06,  5.14619114e-06,  1.73639273e-05, ...,
         9.30241913e-06, -1.99792321e-05,  3.33682547e-05]], dtype=float32)>, <tf.Tensor: shape=(500,), dtype=float32, numpy=
array([-6.05624482e-05,  1.47286686e-04, -6.82435857e-05, -3.66110646e-04,
    

#bot_tweets: 75 #gen_tweets: 75


Exception in thread Thread-60 (copy_local_properties):
Traceback (most recent call last):
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\site-packages\pyspark\util.py", line 377, in copy_local_properties
    return target(*a, **k)
  File "C:\Users\USER\AppData\Local\Temp\ipykernel_7644\3473066317.py", line 612, in worker_task
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\site-packages\tensorflow\python\framework\tensor_util.py", line 592, in MakeNdarray
    shape = [d.size for d in tensor.tensor_shape.dim]
AttributeError: 'list' object has no attribute 'tensor_shape'


GD tensor: None
Input count: 150, 150
curr_gd: [<tf.Tensor: shape=(38, 500), dtype=float32, numpy=
array([[-6.4059623e-06,  1.8180854e-05,  0.0000000e+00, ...,
        -3.6549500e-06,  0.0000000e+00, -2.0609992e-05],
       [-7.8590158e-05,  4.8055890e-04, -1.1801097e-06, ...,
         0.0000000e+00,  5.5730186e-04,  0.0000000e+00],
       [ 1.0296120e-05,  9.0378839e-05,  0.0000000e+00, ...,
        -1.9468001e-05,  0.0000000e+00, -4.4960536e-05],
       ...,
       [-2.6017105e-06,  7.9266565e-06,  1.1116236e-06, ...,
        -1.3559185e-06,  4.4415597e-06, -3.5074834e-06],
       [ 1.7633156e-05, -8.1174869e-05,  3.6865208e-06, ...,
         1.1400798e-05, -2.7491133e-05,  1.2202827e-05],
       [ 1.1143098e-05, -6.3750529e-05,  4.0831796e-06, ...,
         9.5216365e-06, -1.4843808e-05,  7.9293613e-06]], dtype=float32)>, <tf.Tensor: shape=(500,), dtype=float32, numpy=
array([-6.39804930e-05,  4.00831894e-04, -1.04207847e-05, -2.19675843e-04,
        4.80319613e-05,  4.41680430e-04,

#bot_tweets: 83 #gen_tweets: 83


Exception in thread Thread-61 (copy_local_properties):
Traceback (most recent call last):
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\site-packages\pyspark\util.py", line 377, in copy_local_properties
    return target(*a, **k)
  File "C:\Users\USER\AppData\Local\Temp\ipykernel_7644\3473066317.py", line 612, in worker_task
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\site-packages\tensorflow\python\framework\tensor_util.py", line 592, in MakeNdarray
    shape = [d.size for d in tensor.tensor_shape.dim]
AttributeError: 'list' object has no attribute 'tensor_shape'


GD tensor: None
Input count: 166, 166
curr_gd: [<tf.Tensor: shape=(38, 500), dtype=float32, numpy=
array([[ 7.3846309e-06,  7.0978233e-05,  0.0000000e+00, ...,
        -1.3703023e-05,  0.0000000e+00, -3.4795634e-05],
       [ 3.4577715e-05, -1.0639381e-04,  0.0000000e+00, ...,
         0.0000000e+00,  5.0990551e-05, -3.2270487e-05],
       [ 3.4148357e-05,  2.2642677e-04,  0.0000000e+00, ...,
        -6.1237733e-05,  0.0000000e+00, -1.5474285e-04],
       ...,
       [-3.4394004e-06, -7.7649192e-06,  1.0614461e-07, ...,
        -1.5117528e-06, -4.1725057e-06,  6.4064484e-06],
       [-8.6948930e-06, -2.8478426e-05,  3.4544502e-07, ...,
         7.5100061e-06, -7.5646744e-06,  1.1357422e-05],
       [ 6.0642251e-06, -2.8529480e-06, -3.4058425e-07, ...,
        -5.0439257e-06,  1.0559437e-05, -9.2688815e-06]], dtype=float32)>, <tf.Tensor: shape=(500,), dtype=float32, numpy=
array([ 4.67359787e-05,  4.01013647e-04, -2.30473916e-06, -4.65370103e-04,
        3.81514765e-05,  2.37715140e-04,

#bot_tweets: 74 #gen_tweets: 74


Exception in thread Thread-62 (copy_local_properties):
Traceback (most recent call last):
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\site-packages\pyspark\util.py", line 377, in copy_local_properties
    return target(*a, **k)
  File "C:\Users\USER\AppData\Local\Temp\ipykernel_7644\3473066317.py", line 612, in worker_task
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\site-packages\tensorflow\python\framework\tensor_util.py", line 592, in MakeNdarray
    shape = [d.size for d in tensor.tensor_shape.dim]
AttributeError: 'list' object has no attribute 'tensor_shape'


GD tensor: None
Input count: 148, 148
curr_gd: [<tf.Tensor: shape=(38, 500), dtype=float32, numpy=
array([[-1.50373035e-05,  3.74854892e-04, -3.75054651e-06, ...,
        -2.99862266e-04,  0.00000000e+00, -6.96445495e-05],
       [-2.19240377e-04,  8.95078119e-05, -1.35321297e-05, ...,
         0.00000000e+00,  5.16595785e-04, -7.67506208e-06],
       [-1.80138813e-05,  6.29229296e-04,  0.00000000e+00, ...,
        -4.50561958e-04,  0.00000000e+00, -8.42956943e-05],
       ...,
       [ 9.59533850e-07, -7.90193553e-06, -2.60897014e-06, ...,
        -7.55928113e-07,  6.07567154e-06,  1.23791388e-05],
       [-4.86405315e-06, -3.93159098e-07, -1.22418496e-05, ...,
        -1.66816317e-05,  2.41983016e-05, -1.67378676e-05],
       [-9.18864680e-06,  1.16665706e-05, -2.09906211e-05, ...,
        -3.71209280e-05,  5.08798003e-05, -4.83661461e-05]], dtype=float32)>, <tf.Tensor: shape=(500,), dtype=float32, numpy=
array([-5.12084371e-05,  1.88974809e-04, -5.77357387e-05, -1.73746521e-04,
    

#bot_tweets: 74 #gen_tweets: 74


Exception in thread Thread-63 (copy_local_properties):
Traceback (most recent call last):
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\site-packages\pyspark\util.py", line 377, in copy_local_properties
    return target(*a, **k)
  File "C:\Users\USER\AppData\Local\Temp\ipykernel_7644\3473066317.py", line 612, in worker_task
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\site-packages\tensorflow\python\framework\tensor_util.py", line 592, in MakeNdarray
    shape = [d.size for d in tensor.tensor_shape.dim]
AttributeError: 'list' object has no attribute 'tensor_shape'


GD tensor: None
Input count: 148, 148
curr_gd: [<tf.Tensor: shape=(38, 500), dtype=float32, numpy=
array([[ 1.1339723e-06,  5.7005254e-06, -1.1030616e-04, ...,
        -7.4289645e-05,  0.0000000e+00, -8.1755628e-04],
       [-1.6920756e-04, -3.5231584e-05,  0.0000000e+00, ...,
         0.0000000e+00,  1.0572632e-03,  0.0000000e+00],
       [-6.0856687e-06,  1.8554279e-05,  0.0000000e+00, ...,
        -1.2473950e-05,  0.0000000e+00, -4.4501112e-06],
       ...,
       [ 1.4521673e-05, -4.9250528e-05, -7.0096007e-08, ...,
        -1.8970948e-06, -8.3069710e-05,  3.0575160e-05],
       [-1.5398877e-05,  5.6008597e-05, -1.2299165e-07, ...,
         5.3792919e-06,  1.3376927e-04, -3.3884207e-05],
       [ 1.9590778e-05, -7.1124326e-05, -8.6067509e-08, ...,
        -7.5270750e-06, -1.6404776e-04,  3.7167556e-05]], dtype=float32)>, <tf.Tensor: shape=(500,), dtype=float32, numpy=
array([-1.42135235e-04,  5.10839338e-04, -5.54585949e-06, -3.29099130e-04,
       -8.82674285e-05,  7.41389813e-04,

#bot_tweets: 79 #gen_tweets: 79


Exception in thread Thread-64 (copy_local_properties):
Traceback (most recent call last):
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\site-packages\pyspark\util.py", line 377, in copy_local_properties
    return target(*a, **k)
  File "C:\Users\USER\AppData\Local\Temp\ipykernel_7644\3473066317.py", line 612, in worker_task
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\site-packages\tensorflow\python\framework\tensor_util.py", line 592, in MakeNdarray
    shape = [d.size for d in tensor.tensor_shape.dim]
AttributeError: 'list' object has no attribute 'tensor_shape'


GD tensor: None
Input count: 158, 158
curr_gd: [<tf.Tensor: shape=(38, 500), dtype=float32, numpy=
array([[ 7.9814936e-07,  0.0000000e+00,  4.0838181e-06, ...,
         0.0000000e+00,  0.0000000e+00, -1.4859423e-05],
       [-4.7994563e-06,  2.4357683e-04, -2.1904384e-04, ...,
         0.0000000e+00,  6.9588117e-05, -2.6568116e-05],
       [ 6.5381857e-05,  1.3701443e-04,  0.0000000e+00, ...,
        -6.0558836e-05,  0.0000000e+00, -4.6275563e-05],
       ...,
       [-5.7306042e-06, -6.2389816e-05,  3.9811501e-05, ...,
         8.8181014e-07, -4.4198321e-05,  4.7752052e-05],
       [ 7.9921447e-06, -1.2123391e-05, -8.3450923e-06, ...,
        -1.0435749e-06, -2.1645801e-05,  2.7108759e-05],
       [ 6.2205536e-06, -2.0613506e-06, -9.1090669e-06, ...,
        -3.4775469e-06, -7.7169962e-06,  6.7016736e-06]], dtype=float32)>, <tf.Tensor: shape=(500,), dtype=float32, numpy=
array([ 1.70360818e-05,  4.29664913e-04, -1.05933897e-04, -4.45153011e-04,
       -6.63721003e-05,  3.11740208e-04,

#bot_tweets: 90 #gen_tweets: 90


Exception in thread Thread-65 (copy_local_properties):
Traceback (most recent call last):
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\site-packages\pyspark\util.py", line 377, in copy_local_properties
    return target(*a, **k)
  File "C:\Users\USER\AppData\Local\Temp\ipykernel_7644\3473066317.py", line 612, in worker_task
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\site-packages\tensorflow\python\framework\tensor_util.py", line 592, in MakeNdarray
    shape = [d.size for d in tensor.tensor_shape.dim]
AttributeError: 'list' object has no attribute 'tensor_shape'


GD tensor: None
Input count: 180, 180
curr_gd: [<tf.Tensor: shape=(38, 500), dtype=float32, numpy=
array([[ 1.6391491e-06,  1.5524889e-05, -7.0532849e-07, ...,
        -7.1491240e-06,  0.0000000e+00, -1.5926262e-05],
       [-2.1918686e-04,  6.1913255e-05,  0.0000000e+00, ...,
         8.8820834e-06,  1.4062285e-04,  0.0000000e+00],
       [ 4.5646311e-06,  9.5180629e-05,  0.0000000e+00, ...,
        -6.0724640e-05,  0.0000000e+00, -6.3252461e-05],
       ...,
       [-5.2863892e-05,  1.9853540e-05,  2.6326092e-08, ...,
        -1.3662526e-05,  2.5774905e-05,  3.9433638e-07],
       [-5.2015159e-05,  2.9091527e-05, -2.4367584e-08, ...,
        -1.6451577e-05,  2.2208395e-05, -2.4899182e-06],
       [ 6.2583233e-05, -2.1908256e-05,  1.6940755e-08, ...,
         1.3321067e-05, -2.5957605e-05,  2.4042772e-06]], dtype=float32)>, <tf.Tensor: shape=(500,), dtype=float32, numpy=
array([-2.29947997e-04,  3.51928174e-04, -8.81121082e-07, -4.65684134e-04,
       -2.62334303e-04,  4.93785541e-04,

#bot_tweets: 94 #gen_tweets: 94


Exception in thread Thread-66 (copy_local_properties):
Traceback (most recent call last):
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\site-packages\pyspark\util.py", line 377, in copy_local_properties
    return target(*a, **k)
  File "C:\Users\USER\AppData\Local\Temp\ipykernel_7644\3473066317.py", line 612, in worker_task
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\site-packages\tensorflow\python\framework\tensor_util.py", line 592, in MakeNdarray
    shape = [d.size for d in tensor.tensor_shape.dim]
AttributeError: 'list' object has no attribute 'tensor_shape'


GD tensor: None
Input count: 188, 188
curr_gd: [<tf.Tensor: shape=(38, 500), dtype=float32, numpy=
array([[ 2.6446762e-05,  2.4501111e-05,  7.0384272e-06, ...,
        -5.2877922e-06,  0.0000000e+00, -4.4314267e-05],
       [-2.3374916e-04,  2.0501032e-04,  0.0000000e+00, ...,
         0.0000000e+00,  5.0430273e-04,  0.0000000e+00],
       [ 1.9680381e-05,  3.9970444e-05,  0.0000000e+00, ...,
        -1.0182670e-05,  0.0000000e+00, -2.6118400e-05],
       ...,
       [ 5.6423974e-06, -3.7782804e-05,  5.1525393e-07, ...,
        -1.5306198e-06, -4.6886253e-06,  2.5914396e-05],
       [-2.5169627e-05,  8.1817110e-05, -7.7817890e-07, ...,
        -3.6066267e-05,  1.1672846e-05, -4.8160662e-05],
       [ 1.3607786e-05, -7.1579161e-05,  3.0415859e-07, ...,
         1.0321225e-05, -8.2399711e-06,  4.5193690e-05]], dtype=float32)>, <tf.Tensor: shape=(500,), dtype=float32, numpy=
array([-1.37120980e-04,  5.77828672e-04, -1.91868867e-06, -3.59174592e-04,
       -1.61195683e-04,  6.89306296e-04,

Exception in thread Thread-67 (copy_local_properties):
Traceback (most recent call last):
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\site-packages\pyspark\util.py", line 377, in copy_local_properties
    return target(*a, **k)
  File "C:\Users\USER\AppData\Local\Temp\ipykernel_7644\3473066317.py", line 528, in worker_task_eval
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\site-packages\numpy\lib\npyio.py", line 390, in load
    fid = stack.enter_context(open(os_fspath(file), "rb"))
FileNotFoundError: [Errno 2] No such file or directory: 'grads.npy'


In [44]:
spark.stop()

In [18]:
a = np.empty([], dtype='float32')

In [21]:
a

array(6.e-45, dtype=float32)

In [31]:
import os
os.path.isfile(os.getcwd()+"\\data\\"+"grad_sum.txt")

True

In [24]:
grad_fname

'grads.npy'

In [30]:
os.getcwd()+"\\data\\"+grad_fname

'C:\\Users\\USER\\projects\\Twitter-Bot-or-Not\\data\\grads.npy'

In [None]:
if os.path.isfile(grad_fname):
        grads = np.load(grad_fname) # load gds from s3/hdfs/file
        grads = tf.convert_to_tensor(grads, dtype='float32')
else:
    print("")