In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#create Spark session
spark = SparkSession.builder.appName('Stackoverflow_Project').getOrCreate()

#change configuration settings on Spark 
conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '5g'), 
                                        ('spark.app.name', 'Spark Updated Conf'), 
                                        ('spark.executor.cores', '4'), 
                                        ('spark.cores.max', '4'), 
                                        ('spark.driver.memory','8g')])

In [3]:
#Read the cleaned and pre-processed data from the GCS bucket
df = spark.read \
    .option("quote", "\"")  \
    .option("escape", "\"") \
    .option("ignoreLeadingWhiteSpace",True) \
    .parquet("gs://msca-bdp-student-gcs/Group6/extracted_StackOverflow.parquet",inferSchema=True, header=True )

                                                                                

In [4]:
df.count()

                                                                                

2605413

In [5]:
df.show(5)

24/12/01 05:50:17 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 4:>                                                          (0 + 1) / 1]

+--------+--------------------+--------------------+-----------------------+-----------------+------------------+------------------+-------------------+-----------------------+-------------------+------------------------+------------------+-----------------+----------+--------------------+---------------+---------+--------------------+--------------------+--------------------+-------------------------+---------------------+--------------------------+--------------------+----------------+-------------------+------------+--------------------+--------------------+
| post_id|          post_title|           post_body|post_accepted_answer_id|post_answer_count|post_comment_count|post_creation_date|post_favorite_count|post_last_activity_date|post_last_edit_date|post_last_editor_user_id|post_owner_user_id|post_post_type_id|post_score|           post_tags|post_view_count|answer_id|         answer_body|answer_comment_count|answer_creation_date|answer_last_activity_date|answer_last_edit_date|answe

                                                                                

In [6]:
df.printSchema()

root
 |-- post_id: long (nullable = true)
 |-- post_title: string (nullable = true)
 |-- post_body: string (nullable = true)
 |-- post_accepted_answer_id: long (nullable = true)
 |-- post_answer_count: long (nullable = true)
 |-- post_comment_count: long (nullable = true)
 |-- post_creation_date: date (nullable = true)
 |-- post_favorite_count: long (nullable = true)
 |-- post_last_activity_date: date (nullable = true)
 |-- post_last_edit_date: date (nullable = true)
 |-- post_last_editor_user_id: long (nullable = true)
 |-- post_owner_user_id: long (nullable = true)
 |-- post_post_type_id: long (nullable = true)
 |-- post_score: long (nullable = true)
 |-- post_tags: string (nullable = true)
 |-- post_view_count: long (nullable = true)
 |-- answer_id: long (nullable = true)
 |-- answer_body: string (nullable = true)
 |-- answer_comment_count: long (nullable = true)
 |-- answer_creation_date: long (nullable = true)
 |-- answer_last_activity_date: date (nullable = true)
 |-- answer_last

In [7]:
#Count rows with missing values
df.dropna().count() #no duplicates

                                                                                

369090

In [8]:
#Data is at answers level, aggregate to get at post level
df_2 = df.select('post_body_text','post_tags') \
         .groupBy('post_body_text','post_tags').count()

df_2 = df_2.withColumnRenamed('count', 'Count of Answers')
df_2.show(5)



+--------------------+--------------------+----------------+
|      post_body_text|           post_tags|Count of Answers|
+--------------------+--------------------+----------------+
| is there an easy...|     c|struct|unions|               7|
| my oracle databa...|.net|database|ora...|               5|
| given an integer...|java|parsing|integer|               7|
| why are the stat...|                java|               6|
| the above code g...|java|multithreadi...|               6|
+--------------------+--------------------+----------------+
only showing top 5 rows



                                                                                

In [9]:
df_2 = df_2.filter(df_2["post_body_text"].isNotNull())
df_2 = df_2.dropDuplicates(["post_body_text"])
df_2.count()

                                                                                

367336

In [10]:
df_2.filter(df_2["post_tags"].isNull()).count()

                                                                                

0

In [11]:
df_2.rdd.getNumPartitions()



9

In [12]:
df_2 = df_2.repartition(40)

In [13]:
from pyspark.sql.functions import split, explode, col, lit, array_contains

#Split tags into an array
df_2 = df_2.withColumn("tags_array", split(col("post_tags"), "\|"))

#Explode tags and count frequencies
exploded_df = df_2.select('post_body_text','post_tags',explode(col("tags_array")).alias("tag"))
tag_counts = exploded_df.groupBy("tag").count().orderBy(col("count").desc())

In [14]:
exploded_df.show(5)



+--------------------+-------------------+------+
|      post_body_text|          post_tags|   tag|
+--------------------+-------------------+------+
| i m trying to st...|c++|c|system|stdout|   c++|
| i m trying to st...|c++|c|system|stdout|     c|
| i m trying to st...|c++|c|system|stdout|system|
| i m trying to st...|c++|c|system|stdout|stdout|
| i have a table w...|          sql|mysql|   sql|
+--------------------+-------------------+------+
only showing top 5 rows



                                                                                

In [15]:
#Select top 50 tags
top_50_tags = tag_counts.limit(50).select("tag").rdd.flatMap(lambda x: x).collect()
top_50_tags

                                                                                

['java',
 'c#',
 'javascript',
 'php',
 'c++',
 'jquery',
 'html',
 'python',
 'css',
 'android',
 'c',
 '.net',
 'sql',
 'mysql',
 'arrays',
 'asp.net',
 'string',
 'sql-server',
 'iphone',
 'ios',
 'regex',
 'objective-c',
 'algorithm',
 'ruby',
 'performance',
 'database',
 'linux',
 'ruby-on-rails',
 'windows',
 'list',
 'multithreading',
 'oop',
 'bash',
 'eclipse',
 'asp.net-mvc',
 'ajax',
 'perl',
 'json',
 'pointers',
 'visual-studio',
 'xml',
 'winforms',
 'linq',
 'function',
 'class',
 'tsql',
 'vb.net',
 'wpf',
 'xcode',
 'language-agnostic']

In [16]:
#Filter data for top 50 tags
exploded_df_filtered = exploded_df.filter(col('tag').isin(top_50_tags))
exploded_df_filtered.count()

                                                                                

502984

In [17]:
#Pivot data
exploded_df_filtered = exploded_df_filtered.groupBy("post_body_text","post_tags").pivot("tag").count()
exploded_df_filtered.printSchema()



root
 |-- post_body_text: string (nullable = true)
 |-- post_tags: string (nullable = true)
 |-- .net: long (nullable = true)
 |-- ajax: long (nullable = true)
 |-- algorithm: long (nullable = true)
 |-- android: long (nullable = true)
 |-- arrays: long (nullable = true)
 |-- asp.net: long (nullable = true)
 |-- asp.net-mvc: long (nullable = true)
 |-- bash: long (nullable = true)
 |-- c: long (nullable = true)
 |-- c#: long (nullable = true)
 |-- c++: long (nullable = true)
 |-- class: long (nullable = true)
 |-- css: long (nullable = true)
 |-- database: long (nullable = true)
 |-- eclipse: long (nullable = true)
 |-- function: long (nullable = true)
 |-- html: long (nullable = true)
 |-- ios: long (nullable = true)
 |-- iphone: long (nullable = true)
 |-- java: long (nullable = true)
 |-- javascript: long (nullable = true)
 |-- jquery: long (nullable = true)
 |-- json: long (nullable = true)
 |-- language-agnostic: long (nullable = true)
 |-- linq: long (nullable = true)
 |-- linux:

                                                                                

In [18]:
exploded_df_filtered.count()

                                                                                

318647

In [19]:
#The dot character triggers an error when used in column names. So rename these columns.
exploded_df_filtered = exploded_df_filtered.withColumnRenamed('.net', 'dot_net')
exploded_df_filtered = exploded_df_filtered.withColumnRenamed('asp.net-mvc', 'asp_dot_net-mvc')
exploded_df_filtered = exploded_df_filtered.withColumnRenamed('asp.net', 'asp_dot_net')
exploded_df_filtered = exploded_df_filtered.withColumnRenamed('vb.net', 'vb_dot_net')

In [20]:
#Also rename in tags list
top_50_tags.remove('.net')
top_50_tags.remove('asp.net-mvc')
top_50_tags.remove('asp.net')
top_50_tags.remove('vb.net')
top_50_tags = top_50_tags + ['asp_dot_net','asp_dot_net-mvc','vb_dot_net','dot_net']
top_50_tags

['java',
 'c#',
 'javascript',
 'php',
 'c++',
 'jquery',
 'html',
 'python',
 'css',
 'android',
 'c',
 'sql',
 'mysql',
 'arrays',
 'string',
 'sql-server',
 'iphone',
 'ios',
 'regex',
 'objective-c',
 'algorithm',
 'ruby',
 'performance',
 'database',
 'linux',
 'ruby-on-rails',
 'windows',
 'list',
 'multithreading',
 'oop',
 'bash',
 'eclipse',
 'ajax',
 'perl',
 'json',
 'pointers',
 'visual-studio',
 'xml',
 'winforms',
 'linq',
 'function',
 'class',
 'tsql',
 'wpf',
 'xcode',
 'language-agnostic',
 'asp_dot_net',
 'asp_dot_net-mvc',
 'vb_dot_net',
 'dot_net']

In [21]:
#Since there are lot of null values because of pivoting the df, fill null values with 0s
exploded_df_filtered = exploded_df_filtered.na.fill(value = 0)
exploded_df_filtered.show(5)



+--------------------+--------------------+-------+----+---------+-------+------+-----------+---------------+----+---+---+---+-----+---+--------+-------+--------+----+---+------+----+----------+------+----+-----------------+----+-----+----+--------------+-----+-----------+---+-----------+----+---+--------+------+-----+----+-------------+---+----------+------+----+----------+-------------+-------+--------+---+-----+---+
|      post_body_text|           post_tags|dot_net|ajax|algorithm|android|arrays|asp_dot_net|asp_dot_net-mvc|bash|  c| c#|c++|class|css|database|eclipse|function|html|ios|iphone|java|javascript|jquery|json|language-agnostic|linq|linux|list|multithreading|mysql|objective-c|oop|performance|perl|php|pointers|python|regex|ruby|ruby-on-rails|sql|sql-server|string|tsql|vb_dot_net|visual-studio|windows|winforms|wpf|xcode|xml|
+--------------------+--------------------+-------+----+---------+-------+------+-----------+---------------+----+---+---+---+-----+---+--------+-------+-

                                                                                

### Create pipeline for feature engineering/data transformation

In [24]:
#Spark ML imports
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#Tokenize the data into words
tokenizer = Tokenizer(inputCol="post_body_text", outputCol="Words")

#Remove stop words
remove_stopwords = StopWordsRemover(inputCol="Words", outputCol="Filtered_Words")

#HashingTF
hashing_tf = HashingTF(inputCol="Filtered_Words", outputCol="Hashing_TF_Features")

#IDF
idf = IDF(inputCol="Hashing_TF_Features", outputCol="Hashing_TFIDF_Features")

#Creating a pipeline to transform the data and prepare it for the model
pipeline = Pipeline(stages=[tokenizer, remove_stopwords, hashing_tf, idf])

In [25]:
#Fit and transform the data using the pipeline
pipeline_final = pipeline.fit(exploded_df_filtered)
model_df = pipeline_final.transform(exploded_df_filtered)
model_df.printSchema()

[Stage 166:====>                                                  (1 + 10) / 13]

root
 |-- post_body_text: string (nullable = true)
 |-- post_tags: string (nullable = true)
 |-- dot_net: long (nullable = true)
 |-- ajax: long (nullable = true)
 |-- algorithm: long (nullable = true)
 |-- android: long (nullable = true)
 |-- arrays: long (nullable = true)
 |-- asp_dot_net: long (nullable = true)
 |-- asp_dot_net-mvc: long (nullable = true)
 |-- bash: long (nullable = true)
 |-- c: long (nullable = true)
 |-- c#: long (nullable = true)
 |-- c++: long (nullable = true)
 |-- class: long (nullable = true)
 |-- css: long (nullable = true)
 |-- database: long (nullable = true)
 |-- eclipse: long (nullable = true)
 |-- function: long (nullable = true)
 |-- html: long (nullable = true)
 |-- ios: long (nullable = true)
 |-- iphone: long (nullable = true)
 |-- java: long (nullable = true)
 |-- javascript: long (nullable = true)
 |-- jquery: long (nullable = true)
 |-- json: long (nullable = true)
 |-- language-agnostic: long (nullable = true)
 |-- linq: long (nullable = true)


                                                                                

In [26]:
#Remove duplicates
model_df = model_df.dropDuplicates(["post_body_text"])
model_df = model_df.filter(model_df["post_body_text"].isNotNull())
model_df = model_df.filter(model_df["post_tags"].isNotNull())
model_df.count()

                                                                                

318649

In [27]:
# Split the data into train (70%), test (20%), and validation (10%) sets
train_df, test_df, val_df = model_df.randomSplit([0.7, 0.2, 0.1], seed=11)

In [28]:
# Print the count of each split to verify
print(f"Train set count: {train_df.count()}")
print(f"Test set count: {test_df.count()}")
print(f"Validation set count: {val_df.count()}")

24/12/01 05:52:53 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB
24/12/01 05:53:07 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.2 MiB
                                                                                

Train set count: 222733


24/12/01 05:53:20 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB
24/12/01 05:53:32 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.2 MiB
                                                                                

Test set count: 64069


24/12/01 05:53:44 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB
24/12/01 05:53:54 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.2 MiB

Validation set count: 31846


                                                                                

In [29]:
train_df.rdd.getNumPartitions()

24/12/01 05:54:07 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB

200

In [30]:
test_df.rdd.getNumPartitions()

24/12/01 05:54:32 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB

200

In [31]:
val_df.rdd.getNumPartitions()

24/12/01 05:54:52 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB

200

### Train the models for each of 50 tags

In [32]:
#Training/Testing loop for the model

def fit_classification_model(tag):
    
    #Logistic Regression model
    Log_reg_htfidf = LogisticRegression(maxIter=5, featuresCol = "Hashing_TFIDF_Features", labelCol = tag)
    
    #Fit the model on the training dataset
    Log_reg_htfidf_model = Log_reg_htfidf.fit(train_df)
    
    #Transform/make predictions using both models and rename columns for better understanding
    predictions_htfidf = Log_reg_htfidf_model.transform(test_df)
    predictions_htfidf = predictions_htfidf.withColumnRenamed("prediction", "predictions_htfidf") \
                                     .withColumnRenamed("rawPrediction", "rawPredictions_htfidf") \
                                     .withColumnRenamed("probability", "probability_htfidf")
    
    #Evaluate and calculate metrics for both CV and Hashing TFs
    eval1 = MulticlassClassificationEvaluator(labelCol = tag, predictionCol = "predictions_htfidf")
    accuracy_htfidf = eval1.evaluate(predictions_htfidf, {eval1.metricName: "accuracy"})
    f1_htfidf = eval1.evaluate(predictions_htfidf, {eval1.metricName: "f1"})
    
    #Return metrics and models
    return {
        "tag": tag,
        "Log_reg_htfidf_model": Log_reg_htfidf_model,
        "accuracy": accuracy_htfidf,
        "f1_score": f1_htfidf
    }

In [None]:
#Evaluate each tag and store the models/eval metrics
eval_results = []
for tag in top_50_tags:
    eval_result = fit_classification_model(tag)
    eval_results.append(eval_result)

24/12/01 05:55:11 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB
24/12/01 05:55:29 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB
24/12/01 05:55:41 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.2 MiB
24/12/01 05:55:46 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.2 MiB
24/12/01 05:55:47 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.2 MiB
24/12/01 05:55:53 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.2 MiB
24/12/01 05:55:54 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
24/12/01 05:55:54 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
24/12/01 05:55:54 WARN org.apache.spark.scheduler.DAGSchedule

In [None]:
#Logistic regression models and evaluation metrics for each of the 50 tags
eval_results

[{'tag': 'java',
  'Log_reg_htfidf_model': LogisticRegressionModel: uid=LogisticRegression_1480097e889f, numClasses=2, numFeatures=262144,
  'accuracy': 0.883612721290929,
  'f1_score': 0.873453685938679},
 {'tag': 'c#',
  'Log_reg_htfidf_model': LogisticRegressionModel: uid=LogisticRegression_2f4e37144f6a, numClasses=2, numFeatures=262144,
  'accuracy': 0.8682751057496475,
  'f1_score': 0.8511188294230159},
 {'tag': 'javascript',
  'Log_reg_htfidf_model': LogisticRegressionModel: uid=LogisticRegression_7350de5d931b, numClasses=2, numFeatures=262144,
  'accuracy': 0.8965846780510731,
  'f1_score': 0.8817062882941256},
 {'tag': 'php',
  'Log_reg_htfidf_model': LogisticRegressionModel: uid=LogisticRegression_583c9c269abb, numClasses=2, numFeatures=262144,
  'accuracy': 0.9229672567758107,
  'f1_score': 0.9152943655644681},
 {'tag': 'c++',
  'Log_reg_htfidf_model': LogisticRegressionModel: uid=LogisticRegression_13b32d4f96a6, numClasses=2, numFeatures=262144,
  'accuracy': 0.9204593523320

In [35]:
from pyspark.sql import Row

#Convert results to PySpark DataFrame
metrics_data = sc.parallelize([
    Row(
        tag=eval_result["tag"],
        accuracy=eval_result["accuracy"],
        f1_score=eval_result["f1_score"]
    )
    for eval_result in eval_results
])

metrics_df = spark.createDataFrame(metrics_data)
metrics_df.show(truncate=False)

[Stage 11074:>                                                      (0 + 1) / 1]

+-----------+------------------+------------------+
|tag        |accuracy          |f1_score          |
+-----------+------------------+------------------+
|java       |0.883612721290929 |0.873453685938679 |
|c#         |0.8682751057496475|0.8511188294230159|
|javascript |0.8965846780510731|0.8817062882941256|
|php        |0.9229672567758107|0.9152943655644681|
|c++        |0.9204593523320121|0.9059030163591125|
|jquery     |0.9342315525614915|0.9260444036093863|
|html       |0.9328372238759204|0.9227061164805903|
|python     |0.9472661757794141|0.9416879525096598|
|css        |0.9628544571518095|0.9593307021012154|
|android    |0.965423781920727 |0.9622272129831217|
|c          |0.9482523617791286|0.9378237543486118|
|sql        |0.9591414695284349|0.9543531518660665|
|mysql      |0.9621958670823607|0.9551657135307843|
|arrays     |0.9643897853673821|0.9580272325103403|
|string     |0.9628382083379028|0.960446622597368 |
|sql-server |0.9741657527808241|0.9686920348980156|
|iphone     

                                                                                

In [49]:
metrics_df.show(50,truncate=False)

[Stage 11287:>                                                      (0 + 1) / 1]

+-----------------+------------------+------------------+
|tag              |accuracy          |f1_score          |
+-----------------+------------------+------------------+
|java             |0.883612721290929 |0.873453685938679 |
|c#               |0.8682751057496475|0.8511188294230159|
|javascript       |0.8965846780510731|0.8817062882941256|
|php              |0.9229672567758107|0.9152943655644681|
|c++              |0.9204593523320121|0.9059030163591125|
|jquery           |0.9342315525614915|0.9260444036093863|
|html             |0.9328372238759204|0.9227061164805903|
|python           |0.9472661757794141|0.9416879525096598|
|css              |0.9628544571518095|0.9593307021012154|
|android          |0.965423781920727 |0.9622272129831217|
|c                |0.9482523617791286|0.9378237543486118|
|sql              |0.9591414695284349|0.9543531518660665|
|mysql            |0.9621958670823607|0.9551657135307843|
|arrays           |0.9643897853673821|0.9580272325103403|
|string       

                                                                                

### Save the models

In [48]:
for i in range(0,50):
    tag = eval_results[i]['tag']
    model = eval_results[i]['Log_reg_htfidf_model']

    # Path to save the model in GCS
    gcs_model_path = f'gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_{tag}'

    # Save the trained logistic regression model
    model.save(gcs_model_path)
    print(f"{tag} model saved to {gcs_model_path}")

24/12/01 07:09:39 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11088 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

java model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_java


24/12/01 07:09:41 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11092 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

c# model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_c#


24/12/01 07:09:44 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11096 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

javascript model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_javascript


24/12/01 07:09:46 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11100 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

php model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_php


24/12/01 07:09:49 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11104 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

c++ model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_c++


24/12/01 07:09:51 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11108 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

jquery model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_jquery


24/12/01 07:09:54 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11112 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

html model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_html


24/12/01 07:09:57 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11116 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

python model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_python


24/12/01 07:10:00 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11120 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

css model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_css


24/12/01 07:10:02 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11124 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

android model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_android


24/12/01 07:10:05 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11128 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

c model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_c


24/12/01 07:10:08 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11132 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

sql model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_sql


24/12/01 07:10:10 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11136 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

mysql model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_mysql


24/12/01 07:10:13 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11140 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

arrays model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_arrays


24/12/01 07:10:15 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11144 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

string model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_string


24/12/01 07:10:18 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11148 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

sql-server model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_sql-server


24/12/01 07:10:20 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11152 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

iphone model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_iphone


24/12/01 07:10:23 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11156 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

ios model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_ios


24/12/01 07:10:25 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11160 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

regex model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_regex


24/12/01 07:10:28 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11164 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

objective-c model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_objective-c


24/12/01 07:10:30 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11168 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

algorithm model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_algorithm


24/12/01 07:10:33 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11172 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

ruby model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_ruby


24/12/01 07:10:35 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11176 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

performance model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_performance


24/12/01 07:10:38 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11180 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

database model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_database


24/12/01 07:10:40 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11184 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

linux model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_linux


24/12/01 07:10:43 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11188 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

ruby-on-rails model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_ruby-on-rails


24/12/01 07:10:45 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11192 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

windows model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_windows


24/12/01 07:10:48 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11196 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

list model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_list


24/12/01 07:10:50 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11200 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

multithreading model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_multithreading


24/12/01 07:10:53 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11204 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

oop model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_oop


24/12/01 07:10:56 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11208 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

bash model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_bash


24/12/01 07:10:58 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11212 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

eclipse model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_eclipse


24/12/01 07:11:00 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11216 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

ajax model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_ajax


24/12/01 07:11:03 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11220 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

perl model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_perl


24/12/01 07:11:05 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11224 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

json model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_json


24/12/01 07:11:08 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11228 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

pointers model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_pointers


24/12/01 07:11:10 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11232 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

visual-studio model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_visual-studio


24/12/01 07:11:12 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11236 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

xml model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_xml


24/12/01 07:11:15 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11240 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

winforms model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_winforms


24/12/01 07:11:17 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11244 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

linq model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_linq


24/12/01 07:11:20 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11248 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

function model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_function


24/12/01 07:11:22 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11252 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

class model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_class


24/12/01 07:11:25 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11256 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

tsql model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_tsql


24/12/01 07:11:27 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11260 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

wpf model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_wpf


24/12/01 07:11:30 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11264 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

xcode model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_xcode


24/12/01 07:11:33 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11268 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

language-agnostic model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_language-agnostic


24/12/01 07:11:35 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11272 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

asp_dot_net model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_asp_dot_net


24/12/01 07:11:44 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11276 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

asp_dot_net-mvc model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_asp_dot_net-mvc


24/12/01 07:11:47 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11280 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

vb_dot_net model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_vb_dot_net


24/12/01 07:11:49 WARN org.apache.spark.scheduler.TaskSetManager: Stage 11284 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

dot_net model saved to gs://msca-bdp-student-gcs/Group6/Tag_classification_models/Log_Reg_dot_net


24/12/01 07:13:37 ERROR org.apache.spark.scheduler.cluster.YarnScheduler: Lost executor 13 on hub-hub-msca-bdp-dphub-student-mitalivipin-sw-dp8t.c.msca-bdp-student-ap.internal: Container marked as failed: container_1733031886451_0001_01_000014 on host: hub-hub-msca-bdp-dphub-student-mitalivipin-sw-dp8t.c.msca-bdp-student-ap.internal. Exit status: -100. Diagnostics: Container released on a *lost* node.
24/12/01 07:13:37 WARN org.apache.spark.scheduler.cluster.YarnSchedulerBackend$YarnSchedulerEndpoint: Requesting driver to remove executor 13 for reason Container marked as failed: container_1733031886451_0001_01_000014 on host: hub-hub-msca-bdp-dphub-student-mitalivipin-sw-dp8t.c.msca-bdp-student-ap.internal. Exit status: -100. Diagnostics: Container released on a *lost* node.
