In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark
!pip install spark-xml

In [2]:
import os

# Environment variables locating the Java 8 install and the unpacked Spark 3.1.1 distribution.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop3.2",
})

In [37]:
import findspark

findspark.init()

import re
from typing import List
from datetime import datetime
import pyspark.sql as sql
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType, DateType
from pyspark.sql.functions import udf, explode, rank
from pyspark.sql.functions import col, max, sum, desc, countDistinct
from pyspark.sql.functions import lower
from pyspark.sql.functions import col, split

# Create (or reuse) the Spark session; the spark-xml data source is requested
# through its Maven coordinates in `spark.jars.packages`.
spark = (
    SparkSession.builder
    .appName("L2_reports_with_apache_spark")
    .config("spark.jars.packages", "com.databricks:spark-xml_2.12:0.13.0")
    .getOrCreate()
)

# Eager evaluation is enabled so that output tables are formatted better.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
spark

In [4]:
# Read posts_sample.xml with the "xml" data source, using "row" as the row tag.
posts_sample = (
    spark.read
    .format("xml")
    .option("rowTag", "row")
    .load('posts_sample.xml')
)


In [5]:
# Peek at the first two parsed rows to see which attributes each post carries.
posts_sample.take(2)

[Row(_AcceptedAnswerId=7, _AnswerCount=13, _Body="<p>I want to use a track-bar to change a form's opacity.</p>\n\n<p>This is my code:</p>\n\n<pre><code>decimal trans = trackBar1.Value / 5000;\nthis.Opacity = trans;\n</code></pre>\n\n<p>When I build the application, it gives the following error:</p>\n\n<blockquote>\n  <p>Cannot implicitly convert type <code>'decimal'</code> to <code>'double'</code></p>\n</blockquote>\n\n<p>I tried using <code>trans</code> and <code>double</code> but then the control doesn't work. This code worked fine in a past VB.NET project.</p>\n", _ClosedDate=None, _CommentCount=2, _CommunityOwnedDate=datetime.datetime(2012, 10, 31, 16, 42, 47, 213000), _CreationDate=datetime.datetime(2008, 7, 31, 21, 42, 52, 667000), _FavoriteCount=48, _Id=4, _LastActivityDate=datetime.datetime(2019, 7, 19, 1, 39, 54, 173000), _LastEditDate=datetime.datetime(2019, 7, 19, 1, 39, 54, 173000), _LastEditorDisplayName='Rich B', _LastEditorUserId=3641067, _OwnerDisplayName=None, _OwnerUs

In [12]:
# Load the programming-language reference list (comma-separated, with a header row).
program_lang = (
    spark.read
    .format("csv")
    .options(header=True, sep=",")
    .load('programming-languages.csv')
)

# Show a small sample and the inferred schema as a sanity check.
program_lang.show(5)
program_lang.printSchema()


+----------+--------------------+
|      name|       wikipedia_url|
+----------+--------------------+
|   A# .NET|https://en.wikipe...|
|A# (Axiom)|https://en.wikipe...|
|A-0 System|https://en.wikipe...|
|        A+|https://en.wikipe...|
|       A++|https://en.wikipe...|
+----------+--------------------+
only showing top 5 rows

root
 |-- name: string (nullable = true)
 |-- wikipedia_url: string (nullable = true)



In [31]:
# Convert the language names into a lowercase Python list for more convenient lookups.
# Rows with a missing name are dropped first: passing None through str() would
# otherwise add the bogus entry "none" to the list.
all_languages = [
    row["name"].lower()
    for row in program_lang.select("name").dropna().collect()
]

In [50]:
# Process the posts and look up tags that appear in the language list.
# A frozenset gives constant-time membership checks on the executors instead of
# scanning the whole `all_languages` list for every tag.
knownLanguages = frozenset(all_languages)

# Columns are read by position: index 6 is `_CreationDate` in the sample row
# printed earlier; index 18 is presumably the tags column -- TODO confirm
# against posts_sample.columns.
yearLanguageTags = (
    posts_sample.rdd
    .map(lambda row: (row[6], row[18]))
    .filter(lambda row: row[0] is not None and row[1] is not None)
    # Keep the first four characters of the date (the year) and split the tag
    # string on ">"; e.g. "<a><b>" becomes ["<a", "<b", ""].
    .map(lambda row: (str(row[0])[:4], str(row[1]).split(">")))
    # Strip the leading "<"; the empty trailing piece is discarded by the
    # language filter below.
    .flatMap(lambda row: [(row[0], tag.replace("<", "")) for tag in row[1]])
    .filter(lambda row: row[1] in knownLanguages)
)

# Build the report for each year.
years = list(map(str, range(2010, 2021)))
reportYears = frozenset(years)

# Count every (year, language) pair in ONE pass over the posts instead of
# launching a separate Spark job per year. The aggregated result is small
# (at most len(years) * len(all_languages) entries), so collecting is safe.
yearLanguageCounts = (
    yearLanguageTags
    .filter(lambda row: row[0] in reportYears)
    .map(lambda row: (row, 1))
    .reduceByKey(lambda x, y: x + y)
    .collect()
)

# Group the aggregated counts by year on the driver.
countsByYear = {}
for (year, language), count in yearLanguageCounts:
    countsByYear.setdefault(year, []).append((year, language, count))

# Keep the ten most-mentioned languages per year; ties are broken by language
# name so the report is deterministic between runs.
report = []
for reportYear in years:
    yearReport = sorted(
        countsByYear.get(reportYear, []),
        key=lambda row: (-row[2], row[1]),
    )[:10]
    report.extend(yearReport)

# An explicit schema keeps createDataFrame working even when `report` is empty
# (schema inference fails on an empty list).
finalReport = spark.createDataFrame(
    report, "Year string, Language string, Mention_count long"
)
finalReport.show()

+----+-----------+-------------+
|Year|   Language|Mention_count|
+----+-----------+-------------+
|2010|       java|           52|
|2010|        php|           46|
|2010| javascript|           44|
|2010|     python|           26|
|2010|objective-c|           23|
|2010|          c|           20|
|2010|       ruby|           12|
|2010|     delphi|            8|
|2010|applescript|            3|
|2010|          r|            3|
|2011|        php|          102|
|2011|       java|           93|
|2011| javascript|           83|
|2011|     python|           37|
|2011|objective-c|           34|
|2011|          c|           24|
|2011|       ruby|           20|
|2011|       perl|            9|
|2011|     delphi|            8|
|2011|       bash|            7|
+----+-----------+-------------+
only showing top 20 rows



In [49]:
# Persist the report as Parquet. Overwrite mode keeps this cell re-runnable: the
# default save mode raises an error when "lang_report" already exists.
finalReport.write.format("parquet").mode("overwrite").save("lang_report")