<a href="https://colab.research.google.com/github/nssamson/dataChallenge/blob/master/Copie_de_dssp24_datacamp_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Colab VM; the dataset read below and the
# parquet files written later all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
!pip install pyspark
!pip install nltk

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=2e4f1c9fb4af8be62a0b121d4b4f98f69018ffe67ecd3b0f047103b71ae15297
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import StringType
from pyspark.sql.types import ArrayType
from pyspark.sql.functions import udf
from functools import partial

# Start (or reuse) the Spark session for this notebook and keep a handle on
# its SparkContext for the RDD-level steps.
session_builder = SparkSession.builder.appName('datacamp')
spark = session_builder.getOrCreate()
sc = spark.sparkContext

# Read the tab-separated dataset line by line; every record becomes the list
# of its fields. (For a local run, point textFile at 'smallset.tsv' instead.)
raw_lines = sc.textFile('/content/drive/MyDrive/DSSP/DSSP_2024-02-09_FinalDataCamp/smallset.tsv')
dataRDD = raw_lines.map(lambda line: line.strip().split('\t'))

In [None]:

# UDF turning the comma-separated 'tags' string into an array<string>.
custom_udf_schema = ArrayType(StringType())
split_string_udf = udf(lambda x: x.split(','), custom_udf_schema)

# Name the four TSV fields, add 'array_tags', and discard the raw 'tags' text.
dataDF = dataRDD.toDF(['id', 'title', 'body', 'tags'])
dataDF = dataDF.withColumn('array_tags', split_string_udf(dataDF['tags'])).drop('tags')
#dataDF.show()

# Labels that get one indicator column (and later one classifier) each;
# shipped to the workers as a broadcast variable.
possible_tags = sc.broadcast([u'javascript', u'css', u'jquery', u'html'])


def array_string_transform(row,tags):
    """Expand a row's 'array_tags' list into one 0/1 indicator field per known tag.

    row:  a Row that has an 'array_tags' field holding the row's tag strings.
    tags: a broadcast variable whose ``.value`` is the list of tag names to
          create indicator fields for.

    Returns a new Row with all original fields plus one field per known tag,
    set to 1 when the tag appears in 'array_tags' and 0 otherwise.
    """
    data = row.asDict()
    known_tags = tags.value
    # Every known label starts at 0 so all rows end up with the same fields.
    for tag in known_tags:
        data[tag] = 0
    for existing_tag in data['array_tags']:
        # Only flag labels we know about. An unknown or empty tag would add a
        # field to this row alone (or overwrite an existing one such as
        # 'title'), leaving rows of different shapes for toDF().
        if existing_tag in known_tags:
            data[existing_tag] = 1
    return Row(*data.keys())(*data.values())


# Bind the broadcast tag list, expand every row, and rebuild the DataFrame
# from the expanded rows.
mapFunction = partial(array_string_transform, tags=possible_tags)
expanded_rows = dataDF.rdd.map(mapFunction)
dataDF = expanded_rows.toDF()

#########################################################
# Features Examples
#########################################################

#Classic TF-IDF (with hashing)
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

#1. split text field into words (Tokenize)


import re
# Punctuation removal: every character that is neither a word character nor
# whitespace is replaced by a single space.
def _blank_out_punctuation(text):
    return re.sub(r'[^\w\s]', ' ', text)

udf_punctuation_remover = udf(_blank_out_punctuation, returnType=StringType())
dataDF = dataDF.withColumn('title_no_punctuation', udf_punctuation_remover(dataDF['title']))
dataDF = dataDF.withColumn('body_no_punctuation', udf_punctuation_remover(dataDF['body']))

# Split the cleaned title into tokens (column 'words_title').
tokenizer = Tokenizer(inputCol="title_no_punctuation", outputCol="words_title")
dataDF = tokenizer.transform(dataDF)



#print ("New column with tokenized text")
#print (dataDF.show())

# stemmer / stopwords
import string
from nltk.stem import PorterStemmer
# Single Porter stemmer instance used by stemmer_stopwords.
stemmer = PorterStemmer()
# Tokens dropped before stemming: common English function words, plus 'br'
# (presumably to discard leftover HTML <br> markup -- TODO confirm).
stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
              'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers',
              'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
              'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
              'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
              'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
              'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
              'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
              'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here',
              'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
              'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so',
              'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now','br']
# The characters of string.punctuation, as a set for membership tests.
punctuation = set(string.punctuation)
def stemmer_stopwords (doc):
  """Return the Porter stems of the items in *doc*, skipping stop words and punctuation.

  doc: an iterable of tokens (strings).
  Returns a list with the stem of every token that is neither in the
  module-level `stopwords` list nor in the `punctuation` set.
  """
  stems = []
  for token in doc:
    if token in stopwords or token in punctuation:
      continue
    stems.append(stemmer.stem(token))
  return stems
# Expose the stop-word filter + stemmer as a UDF over array<string> columns.
udf_stemmer_stopwords = udf(lambda x: stemmer_stopwords(x), returnType=ArrayType(StringType()))

# Title: replace the raw tokens in 'words_title' by their filtered stems.
dataDF = dataDF.withColumn('words_title', udf_stemmer_stopwords(dataDF.words_title))

# Body: tokenize the punctuation-free text first, exactly like the title.
# Passing the raw 'body' string to the UDF made it iterate over single
# characters, so 'words_body' held stemmed letters instead of words.
body_tokenizer = Tokenizer(inputCol="body_no_punctuation", outputCol="words_body_tokens")
dataDF = body_tokenizer.transform(dataDF)
dataDF = dataDF.withColumn('words_body', udf_stemmer_stopwords(dataDF.words_body_tokens))
# The intermediate token column is dropped so the resulting schema is unchanged.
dataDF = dataDF.drop('words_body_tokens')

#2. compute term frequencies: hash the title and body tokens into
#   100000-dimensional count vectors
for words_col, tf_col in (("words_title", "tf_title"), ("words_body", "tf_body")):
    hashingTF = HashingTF(inputCol=words_col, outputCol=tf_col, numFeatures=100000)
    dataDF = hashingTF.transform(dataDF)


#dataDF.show()

print ("################")

#3. IDF computation

def IDF_compute(inputCol,outputCol,dataDF):
  """Fit an IDF model on *dataDF* and return *dataDF* with the rescaled column added.

  inputCol:  name of the term-frequency vector column to read.
  outputCol: name of the column that receives the IDF-weighted vectors.
  dataDF:    DataFrame used both to fit the model and to be transformed.
  """
  fitted_idf = IDF(inputCol=inputCol, outputCol=outputCol).fit(dataDF)
  return fitted_idf.transform(dataDF)

from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.classification import LogisticRegression

from pyspark.ml.feature import ChiSqSelector

def chi_2(features_col,output_col,label_col,num_features,train_df,test_df):
    """Select the top features by chi-squared test, fitted on the training split only.

    features_col: vector column to select from.
    output_col:   column that receives the reduced vectors.
    label_col:    label column the test is computed against.
    num_features: number of top features to keep.
    train_df, test_df: the two splits; the selector is fitted on train_df and
                  applied to both.

    Returns the pair (train_df, test_df) with *output_col* added.
    """
    selector = ChiSqSelector(
        featuresCol=features_col,
        outputCol=output_col,
        labelCol=label_col,
        numTopFeatures=num_features,
    )
    fitted_selector = selector.fit(train_df)
    return fitted_selector.transform(train_df), fitted_selector.transform(test_df)

from pyspark.ml.feature import PCA

def pca(features_col,output_col,k,train_df,test_df):
    """Project a vector column onto its first *k* principal components.

    features_col: vector column to reduce.
    output_col:   column that receives the k-dimensional projection.
    k:            number of principal components to keep.
    train_df, test_df: the two splits; the PCA model is fitted on train_df
                  and applied to both.

    Returns the pair (train_df, test_df) with *output_col* added.
    """
    pca_model = PCA(k=k, inputCol=features_col, outputCol=output_col).fit(train_df)
    return pca_model.transform(train_df), pca_model.transform(test_df)





################


In [None]:

##########------------------------#########################################################@

from pyspark.ml.feature import VectorAssembler

# TF-IDF vectors for both text fields (the IDF model is fitted on the whole
# dataset, i.e. before the train/test split below).
dataDF = IDF_compute("tf_title", "tf_idf_title", dataDF)
dataDF = IDF_compute("tf_body", "tf_idf_body", dataDF)
#dataDF.show()
print ("################")

#B. Train and evaluate features with a simple logistic regression, one label at a time
#1. Simple evaluation methodology: 80/20 train/test split with a fixed seed
(train,test)=dataDF.rdd.randomSplit([0.8,0.2],seed=42)


num_features=20
train_df=train.toDF()
test_df=test.toDF()

# For every label, and for title then body:
#   chi-squared selection (top 100 features) -> PCA (50 components),
# then concatenate the two PCA vectors into "features_<label>".
for label in ("html", "javascript", "jquery", "css"):
    stem = "selected_features_" + label
    for field in ("title", "body"):
        chi2_col = stem + "_" + field
        pca_col = stem + "_PCA_" + field
        train_df, test_df = chi_2("tf_idf_" + field, chi2_col, label, 100, train_df, test_df)
        train_df, test_df = pca(chi2_col, pca_col, 50, train_df, test_df)

    # Combine the title and body projections into a single vector column
    assembler = VectorAssembler(inputCols=[stem + "_PCA_title", stem + "_PCA_body"], outputCol="features_" + label)
    train_df = assembler.transform(train_df)
    test_df = assembler.transform(test_df)


################


In [None]:
# Persist both splits, with every engineered feature column, to Drive.
for split_df, parquet_path in (
    (train_df, '/content/drive/MyDrive/DSSP/DSSP_2024-02-09_FinalDataCamp/train_df2.parquet'),
    (test_df, '/content/drive/MyDrive/DSSP/DSSP_2024-02-09_FinalDataCamp/test_df2.parquet'),
):
    split_df.write.parquet(parquet_path)

In [None]:
# Re-weight the PCA-reduced title and body vectors before concatenating them,
# so the title can count more than the body in each "features_<label>_W".
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors, VectorUDT

weight_t=2
weight_b=1


def _scale_vector_udf(weight):
    """Build a UDF that multiplies every component of an ML vector by *weight*."""
    # `vector_column * scalar` is not valid Spark SQL (it raises
    # DATATYPE_MISMATCH.BINARY_OP_DIFF_TYPES), so the scaling is applied to
    # the vector value row by row instead.
    return udf(lambda vec: Vectors.dense(vec.toArray() * weight), VectorUDT())


scale_title = _scale_vector_udf(weight_t)
scale_body = _scale_vector_udf(weight_b)

for tag in ('html', 'javascript', 'jquery', 'css'):
    title_col = "selected_features_" + tag + "_PCA_title"
    body_col = "selected_features_" + tag + "_PCA_body"
    title_w_col = title_col + "_W"
    body_w_col = body_col + "_W"

    # Each split is scaled from its OWN columns, and the result of withColumn
    # is assigned back (DataFrames are immutable).
    train_df = train_df.withColumn(title_w_col, scale_title(train_df[title_col]))
    train_df = train_df.withColumn(body_w_col, scale_body(train_df[body_col]))
    test_df = test_df.withColumn(title_w_col, scale_title(test_df[title_col]))
    test_df = test_df.withColumn(body_w_col, scale_body(test_df[body_col]))

    # Title and body are both weighted for every label, so all four
    # "features_<label>_W" columns are built the same way.
    assembler = VectorAssembler(inputCols=[title_w_col, body_w_col], outputCol="features_" + tag + "_W")
    train_df = assembler.transform(train_df)
    test_df = assembler.transform(test_df)



AnalysisException: [DATATYPE_MISMATCH.BINARY_OP_DIFF_TYPES] Cannot resolve "(selected_features_html_PCA_title * 2)" due to data type mismatch: the left and right operands of the binary operator have incompatible types ("STRUCT<type: TINYINT, size: INT, indices: ARRAY<INT>, values: ARRAY<DOUBLE>>" and "INT").;
'Project [id#193, title#194, body#195, array_tags#196, javascript#197L, css#198L, jquery#199L, html#200L, title_no_punctuation#201, body_no_punctuation#202, words_title#203, words_body#204, tf_title#205, tf_body#206, tf_idf_title#207, tf_idf_body#208, selected_features_html_title#309, selected_features_html_PCA_title#359, selected_features_html_body#460, selected_features_html_PCA_body#514, features_html#566, selected_features_javascript_title#668, selected_features_javascript_PCA_title#728, selected_features_javascript_body#844, ... 13 more fields]
+- Project [id#193, title#194, body#195, array_tags#196, javascript#197L, css#198L, jquery#199L, html#200L, title_no_punctuation#201, body_no_punctuation#202, words_title#203, words_body#204, tf_title#205, tf_body#206, tf_idf_title#207, tf_idf_body#208, selected_features_html_title#309, selected_features_html_PCA_title#359, selected_features_html_body#460, selected_features_html_PCA_body#514, features_html#566, selected_features_javascript_title#668, selected_features_javascript_PCA_title#728, selected_features_javascript_body#844, ... 12 more fields]
   +- Project [id#193, title#194, body#195, array_tags#196, javascript#197L, css#198L, jquery#199L, html#200L, title_no_punctuation#201, body_no_punctuation#202, words_title#203, words_body#204, tf_title#205, tf_body#206, tf_idf_title#207, tf_idf_body#208, selected_features_html_title#309, selected_features_html_PCA_title#359, selected_features_html_body#460, selected_features_html_PCA_body#514, features_html#566, selected_features_javascript_title#668, selected_features_javascript_PCA_title#728, selected_features_javascript_body#844, ... 11 more fields]
      +- Project [id#193, title#194, body#195, array_tags#196, javascript#197L, css#198L, jquery#199L, html#200L, title_no_punctuation#201, body_no_punctuation#202, words_title#203, words_body#204, tf_title#205, tf_body#206, tf_idf_title#207, tf_idf_body#208, selected_features_html_title#309, selected_features_html_PCA_title#359, selected_features_html_body#460, selected_features_html_PCA_body#514, features_html#566, selected_features_javascript_title#668, selected_features_javascript_PCA_title#728, selected_features_javascript_body#844, ... 10 more fields]
         +- Project [id#193, title#194, body#195, array_tags#196, javascript#197L, css#198L, jquery#199L, html#200L, title_no_punctuation#201, body_no_punctuation#202, words_title#203, words_body#204, tf_title#205, tf_body#206, tf_idf_title#207, tf_idf_body#208, selected_features_html_title#309, selected_features_html_PCA_title#359, selected_features_html_body#460, selected_features_html_PCA_body#514, features_html#566, selected_features_javascript_title#668, selected_features_javascript_PCA_title#728, selected_features_javascript_body#844, ... 9 more fields]
            +- Project [id#193, title#194, body#195, array_tags#196, javascript#197L, css#198L, jquery#199L, html#200L, title_no_punctuation#201, body_no_punctuation#202, words_title#203, words_body#204, tf_title#205, tf_body#206, tf_idf_title#207, tf_idf_body#208, selected_features_html_title#309, selected_features_html_PCA_title#359, selected_features_html_body#460, selected_features_html_PCA_body#514, features_html#566, selected_features_javascript_title#668, selected_features_javascript_PCA_title#728, selected_features_javascript_body#844, ... 8 more fields]
               +- Project [id#193, title#194, body#195, array_tags#196, javascript#197L, css#198L, jquery#199L, html#200L, title_no_punctuation#201, body_no_punctuation#202, words_title#203, words_body#204, tf_title#205, tf_body#206, tf_idf_title#207, tf_idf_body#208, selected_features_html_title#309, selected_features_html_PCA_title#359, selected_features_html_body#460, selected_features_html_PCA_body#514, features_html#566, selected_features_javascript_title#668, selected_features_javascript_PCA_title#728, selected_features_javascript_body#844, ... 7 more fields]
                  +- Project [id#193, title#194, body#195, array_tags#196, javascript#197L, css#198L, jquery#199L, html#200L, title_no_punctuation#201, body_no_punctuation#202, words_title#203, words_body#204, tf_title#205, tf_body#206, tf_idf_title#207, tf_idf_body#208, selected_features_html_title#309, selected_features_html_PCA_title#359, selected_features_html_body#460, selected_features_html_PCA_body#514, features_html#566, selected_features_javascript_title#668, selected_features_javascript_PCA_title#728, selected_features_javascript_body#844, ... 6 more fields]
                     +- Project [id#193, title#194, body#195, array_tags#196, javascript#197L, css#198L, jquery#199L, html#200L, title_no_punctuation#201, body_no_punctuation#202, words_title#203, words_body#204, tf_title#205, tf_body#206, tf_idf_title#207, tf_idf_body#208, selected_features_html_title#309, selected_features_html_PCA_title#359, selected_features_html_body#460, selected_features_html_PCA_body#514, features_html#566, selected_features_javascript_title#668, selected_features_javascript_PCA_title#728, selected_features_javascript_body#844, ... 5 more fields]
                        +- Project [id#193, title#194, body#195, array_tags#196, javascript#197L, css#198L, jquery#199L, html#200L, title_no_punctuation#201, body_no_punctuation#202, words_title#203, words_body#204, tf_title#205, tf_body#206, tf_idf_title#207, tf_idf_body#208, selected_features_html_title#309, selected_features_html_PCA_title#359, selected_features_html_body#460, selected_features_html_PCA_body#514, features_html#566, selected_features_javascript_title#668, selected_features_javascript_PCA_title#728, selected_features_javascript_body#844, ... 4 more fields]
                           +- Project [id#193, title#194, body#195, array_tags#196, javascript#197L, css#198L, jquery#199L, html#200L, title_no_punctuation#201, body_no_punctuation#202, words_title#203, words_body#204, tf_title#205, tf_body#206, tf_idf_title#207, tf_idf_body#208, selected_features_html_title#309, selected_features_html_PCA_title#359, selected_features_html_body#460, selected_features_html_PCA_body#514, features_html#566, selected_features_javascript_title#668, selected_features_javascript_PCA_title#728, selected_features_javascript_body#844, ... 3 more fields]
                              +- Project [id#193, title#194, body#195, array_tags#196, javascript#197L, css#198L, jquery#199L, html#200L, title_no_punctuation#201, body_no_punctuation#202, words_title#203, words_body#204, tf_title#205, tf_body#206, tf_idf_title#207, tf_idf_body#208, selected_features_html_title#309, selected_features_html_PCA_title#359, selected_features_html_body#460, selected_features_html_PCA_body#514, features_html#566, selected_features_javascript_title#668, selected_features_javascript_PCA_title#728, selected_features_javascript_body#844, ... 2 more fields]
                                 +- Project [id#193, title#194, body#195, array_tags#196, javascript#197L, css#198L, jquery#199L, html#200L, title_no_punctuation#201, body_no_punctuation#202, words_title#203, words_body#204, tf_title#205, tf_body#206, tf_idf_title#207, tf_idf_body#208, selected_features_html_title#309, selected_features_html_PCA_title#359, selected_features_html_body#460, selected_features_html_PCA_body#514, features_html#566, selected_features_javascript_title#668, selected_features_javascript_PCA_title#728, selected_features_javascript_body#844, UDF(selected_features_javascript_body#844) AS selected_features_javascript_PCA_body#908]
                                    +- Project [id#193, title#194, body#195, array_tags#196, javascript#197L, css#198L, jquery#199L, html#200L, title_no_punctuation#201, body_no_punctuation#202, words_title#203, words_body#204, tf_title#205, tf_body#206, tf_idf_title#207, tf_idf_body#208, selected_features_html_title#309, selected_features_html_PCA_title#359, selected_features_html_body#460, selected_features_html_PCA_body#514, features_html#566, selected_features_javascript_title#668, selected_features_javascript_PCA_title#728, UDF(tf_idf_body#208) AS selected_features_javascript_body#844]
                                       +- Project [id#193, title#194, body#195, array_tags#196, javascript#197L, css#198L, jquery#199L, html#200L, title_no_punctuation#201, body_no_punctuation#202, words_title#203, words_body#204, tf_title#205, tf_body#206, tf_idf_title#207, tf_idf_body#208, selected_features_html_title#309, selected_features_html_PCA_title#359, selected_features_html_body#460, selected_features_html_PCA_body#514, features_html#566, selected_features_javascript_title#668, UDF(selected_features_javascript_title#668) AS selected_features_javascript_PCA_title#728]
                                          +- Project [id#193, title#194, body#195, array_tags#196, javascript#197L, css#198L, jquery#199L, html#200L, title_no_punctuation#201, body_no_punctuation#202, words_title#203, words_body#204, tf_title#205, tf_body#206, tf_idf_title#207, tf_idf_body#208, selected_features_html_title#309, selected_features_html_PCA_title#359, selected_features_html_body#460, selected_features_html_PCA_body#514, features_html#566, UDF(tf_idf_title#207) AS selected_features_javascript_title#668]
                                             +- Project [id#193, title#194, body#195, array_tags#196, javascript#197L, css#198L, jquery#199L, html#200L, title_no_punctuation#201, body_no_punctuation#202, words_title#203, words_body#204, tf_title#205, tf_body#206, tf_idf_title#207, tf_idf_body#208, selected_features_html_title#309, selected_features_html_PCA_title#359, selected_features_html_body#460, selected_features_html_PCA_body#514, UDF(struct(selected_features_html_PCA_title, selected_features_html_PCA_title#359, selected_features_html_PCA_body, selected_features_html_PCA_body#514)) AS features_html#566]
                                                +- Project [id#193, title#194, body#195, array_tags#196, javascript#197L, css#198L, jquery#199L, html#200L, title_no_punctuation#201, body_no_punctuation#202, words_title#203, words_body#204, tf_title#205, tf_body#206, tf_idf_title#207, tf_idf_body#208, selected_features_html_title#309, selected_features_html_PCA_title#359, selected_features_html_body#460, UDF(selected_features_html_body#460) AS selected_features_html_PCA_body#514]
                                                   +- Project [id#193, title#194, body#195, array_tags#196, javascript#197L, css#198L, jquery#199L, html#200L, title_no_punctuation#201, body_no_punctuation#202, words_title#203, words_body#204, tf_title#205, tf_body#206, tf_idf_title#207, tf_idf_body#208, selected_features_html_title#309, selected_features_html_PCA_title#359, UDF(tf_idf_body#208) AS selected_features_html_body#460]
                                                      +- Project [id#193, title#194, body#195, array_tags#196, javascript#197L, css#198L, jquery#199L, html#200L, title_no_punctuation#201, body_no_punctuation#202, words_title#203, words_body#204, tf_title#205, tf_body#206, tf_idf_title#207, tf_idf_body#208, selected_features_html_title#309, UDF(selected_features_html_title#309) AS selected_features_html_PCA_title#359]
                                                         +- Project [id#193, title#194, body#195, array_tags#196, javascript#197L, css#198L, jquery#199L, html#200L, title_no_punctuation#201, body_no_punctuation#202, words_title#203, words_body#204, tf_title#205, tf_body#206, tf_idf_title#207, tf_idf_body#208, UDF(tf_idf_title#207) AS selected_features_html_title#309]
                                                            +- LogicalRDD [id#193, title#194, body#195, array_tags#196, javascript#197L, css#198L, jquery#199L, html#200L, title_no_punctuation#201, body_no_punctuation#202, words_title#203, words_body#204, tf_title#205, tf_body#206, tf_idf_title#207, tf_idf_body#208], false


In [None]:
# Persist both splits, now including the weighted feature columns, to Drive.
for split_df, parquet_path in (
    (train_df, '/content/drive/MyDrive/DSSP/DSSP_2024-02-09_FinalDataCamp/train_dfW2.parquet'),
    (test_df, '/content/drive/MyDrive/DSSP/DSSP_2024-02-09_FinalDataCamp/test_dfW2.parquet'),
):
    split_df.write.parquet(parquet_path)

In [None]:

# Imports first, so this cell does not depend on an earlier cell having
# imported LogisticRegression.
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

#2.initialize model parameters ...we use a simple model here
# One binary classifier per label, each reading its own weighted feature
# column and writing its own prediction / raw-prediction columns.
logistic_html = LogisticRegression(featuresCol="features_html_W", labelCol="html", predictionCol="html_pred", rawPredictionCol="html_pred_raw", maxIter=10)
logistic_javascript=LogisticRegression(featuresCol="features_javascript_W",labelCol="javascript",predictionCol='javascript_pred',rawPredictionCol="javascript_pred_raw",maxIter=10)
logistic_jquery=LogisticRegression(featuresCol="features_jquery_W",labelCol="jquery",predictionCol='jquery_pred',rawPredictionCol="jquery_pred_raw",maxIter=10)
logistic_css=LogisticRegression(featuresCol="features_css_W",labelCol="css",predictionCol='css_pred',rawPredictionCol="css_pred_raw",maxIter=10)

# on the reduced data:

#3. Fit the model
lrModel_html = logistic_html.fit(train_df)
lrModel_javascript = logistic_javascript.fit(train_df)
lrModel_jquery = logistic_jquery.fit(train_df)
lrModel_css = logistic_css.fit(train_df)

#4.Apply model to test data
# Every model writes a column named 'probability'; it is renamed after each
# transform so the next model can add its own.
result=lrModel_html.transform(test_df)
result=result.withColumnRenamed('probability','probability_html')
result=lrModel_javascript.transform(result)
result=result.withColumnRenamed('probability','probability_javascript')
result=lrModel_jquery.transform(result)
result=result.withColumnRenamed('probability','probability_jquery')
result=lrModel_css.transform(result)
result=result.withColumnRenamed('probability','probability_css')
print(result.schema.names)


#5. Evaluation of results (area under the precision-recall curve, per label)
evaluator_html = BinaryClassificationEvaluator(rawPredictionCol="html_pred_raw",labelCol='html',metricName="areaUnderPR",)
evaluator_javascript = BinaryClassificationEvaluator(rawPredictionCol="javascript_pred_raw",labelCol='javascript',metricName="areaUnderPR",)
evaluator_jquery = BinaryClassificationEvaluator(rawPredictionCol="jquery_pred_raw",labelCol='jquery',metricName="areaUnderPR",)
evaluator_css = BinaryClassificationEvaluator(rawPredictionCol="css_pred_raw",labelCol='css',metricName="areaUnderPR",)


print ("RESULT of classifier for HTML label")
print (evaluator_html.evaluate(result))

print ("RESULT of classifier for javascript label")
print (evaluator_javascript.evaluate(result))


print ("RESULT of classifier for jquery label")
print (evaluator_jquery.evaluate(result))

print ("RESULT of classifier for css label")
print (evaluator_css.evaluate(result))

# DataFrame.show() prints the table itself and returns None, so it must not be
# passed to print(): that displayed the table first and then the caption
# followed by "None".
print ("Probabilités de prédiction pour html : ")
result.select('html','probability_html','javascript','probability_javascript','jquery','probability_jquery','css','probability_css').show()

#%%

print ("################")


#%%

#DO THE SAME FOR ALL THE LABELS

#C. Multi-label evaluation
#ASSUMING all labels predicted evaluate the multi-label task

#1. example of how to transform the prediction columns into one column of labels
#return new rows with original list of labels in one column and the predicted ones in the other
def predictions(row):
    """Collapse the per-label prediction columns of *row* into one list of tags.

    row: a Row with 'id', 'array_tags' and, optionally, the columns
         'javascript_pred', 'css_pred', 'jquery_pred', 'html_pred'.

    Returns a Row with fields 'id', 'labels' (the original 'array_tags') and
    'predicted' (the tag names whose prediction column equals 1).
    """
    data = row.asDict()
    labels = data['array_tags']
    # The tag name is the part of the column name before the first underscore.
    predicted = [
        pred_col.split('_')[0]
        for pred_col in (u'javascript_pred', u'css_pred', u'jquery_pred', u'html_pred')
        if pred_col in data and data[pred_col] == 1
    ]
    record = Row('id', 'labels', 'predicted')
    return record(data['id'], labels, predicted)

# One (id, labels, predicted) Row per test example, built by predictions().
to_evaluate=result.rdd.map(predictions)


#2.We define here a metric to evaluate all predicted tags simultaneously

def F1_multilabel(x):
    """Return the example-level F1 score between predicted and true tag sets.

    x: a mapping-like record with 'predicted' and 'labels' entries, each an
       iterable of tag names (duplicates are ignored).

    The score is 2*|predicted ∩ labels| / (|predicted| + |labels|).
    When both sets are empty the prediction matches the truth exactly, so the
    score is defined as 1.0 instead of dividing by zero.
    """
    predicted = set(x['predicted'])
    correct = set(x['labels'])
    total = len(correct) + len(predicted)
    if total == 0:
        return 1.0
    return 2 * len(predicted & correct) / float(total)


# Average of the per-example F1 scores over the test set.
mean_f1 = to_evaluate.map(F1_multilabel).mean()
print (mean_f1)
print ("################")

['id', 'title', 'body', 'array_tags', 'javascript', 'css', 'jquery', 'html', 'title_no_punctuation', 'body_no_punctuation', 'words_title', 'words_body', 'tf_title', 'tf_body', 'tf_idf_title', 'tf_idf_body', 'selected_features_html_title', 'selected_features_html_PCA_title', 'selected_features_html_body', 'selected_features_html_PCA_body', 'features_html', 'selected_features_javascript_title', 'selected_features_javascript_PCA_title', 'selected_features_javascript_body', 'selected_features_javascript_PCA_body', 'features_javascript', 'selected_features_jquery_title', 'selected_features_jquery_PCA_title', 'selected_features_jquery_body', 'selected_features_jquery_PCA_body', 'features_jquery', 'selected_features_css_title', 'selected_features_css_PCA_title', 'selected_features_css_body', 'selected_features_css_PCA_body', 'features_css', 'html_pred_raw', 'probability_html', 'html_pred', 'javascript_pred_raw', 'probability_javascript', 'javascript_pred', 'jquery_pred_raw', 'probability_jque