In [1]:
import findspark
findspark.init()

In [2]:
# imprt spark session
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[*]').getOrCreate()

spark

22/11/04 23:00:14 WARN Utils: Your hostname, pc resolves to a loopback address: 127.0.1.1; using 192.168.170.52 instead (on interface wlp3s0)
22/11/04 23:00:14 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/04 23:00:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# read and load data to spark
df = spark.read.json("/home/sbn/Downloads/data/archive/arxiv-metadata-oai-snapshot.json")
df.printSchema()

                                                                                

root
 |-- abstract: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- authors_parsed: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- categories: string (nullable = true)
 |-- comments: string (nullable = true)
 |-- doi: string (nullable = true)
 |-- id: string (nullable = true)
 |-- journal-ref: string (nullable = true)
 |-- license: string (nullable = true)
 |-- report-no: string (nullable = true)
 |-- submitter: string (nullable = true)
 |-- title: string (nullable = true)
 |-- update_date: string (nullable = true)
 |-- versions: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- created: string (nullable = true)
 |    |    |-- version: string (nullable = true)



In [4]:
df.rdd.getNumPartitions()

25

In [5]:
# Create a new schema
from pyspark.sql.types import *

Schema = StructType([
    
    StructField('authors', StringType(), True),
    StructField('categories', StringType(), True),
    StructField('license', StringType(), True),
    StructField('comments', StringType(), True),
    StructField('abstract', StringType(), True),
    StructField('versions', ArrayType(StringType()), True),
    
])

print(Schema)

StructType([StructField('authors', StringType(), True), StructField('categories', StringType(), True), StructField('license', StringType(), True), StructField('comments', StringType(), True), StructField('abstract', StringType(), True), StructField('versions', ArrayType(StringType(), True), True)])


In [6]:
# Bind data to schema

df = spark.read.json("/home/sbn/Downloads/data/archive/arxiv-metadata-oai-snapshot.json", schema=Schema)

df.show()

+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+
|             authors|       categories|             license|            comments|            abstract|            versions|
+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+
|C. Bal\'azs, E. L...|           hep-ph|                null|37 pages, 15 figu...|  A fully differe...|[{"version":"v1",...|
|Ileana Streinu an...|    math.CO cs.CG|http://arxiv.org/...|To appear in Grap...|  We describe a n...|[{"version":"v1",...|
|         Hongjun Pan|   physics.gen-ph|                null| 23 pages, 3 figures|  The evolution o...|[{"version":"v1",...|
|        David Callan|          math.CO|                null|            11 pages|  We show that a ...|[{"version":"v1",...|
|Wael Abu-Shammala...|  math.CA math.FA|                null|                null|  In this paper w...|[{"version":"v1",...|


In [7]:
# Missing values 

df = df.dropna(subset = ["comments"])

df = df.fillna(value = "unknown", subset = ["license"])

df.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|             authors|          categories|             license|            comments|            abstract|            versions|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|C. Bal\'azs, E. L...|              hep-ph|             unknown|37 pages, 15 figu...|  A fully differe...|[{"version":"v1",...|
|Ileana Streinu an...|       math.CO cs.CG|http://arxiv.org/...|To appear in Grap...|  We describe a n...|[{"version":"v1",...|
|         Hongjun Pan|      physics.gen-ph|             unknown| 23 pages, 3 figures|  The evolution o...|[{"version":"v1",...|
|        David Callan|             math.CO|             unknown|            11 pages|  We show that a ...|[{"version":"v1",...|
|Y. H. Pong and C....|   cond-mat.mes-hall|             unknown|6 pages, 4 figure...|  We study the tw..

In [8]:
# get the author names who published a paper in a 'math' category

df.createOrReplaceTempView("Archive")

sql_query = """
                SELECT authors FROM Archive
                WHERE categories LIKE 'math%'
"""

spark.sql(sql_query).show()
print(spark.sql(sql_query).count())

+--------------------+
|             authors|
+--------------------+
|Ileana Streinu an...|
|        David Callan|
|  Sergei Ovchinnikov|
|Clifton Cunningha...|
|        Koichi Fujii|
|         Norio Konno|
|Simon J.A. Malham...|
|Robert P. C. de M...|
|  P\'eter E. Frenkel|
|          Mihai Popa|
|   Debashish Goswami|
|      Mikkel {\O}bro|
|Nabil L. Youssef,...|
|         Boris Rubin|
|         A. I. Molev|
| Branko J. Malesevic|
|   John W. Robertson|
|     Yu.N. Kosovtsov|
|        Osamu Fujino|
|Stephen C. Power ...|
+--------------------+
only showing top 20 rows





304590


                                                                                

In [9]:
# get licenses with 5 or more letters in the abstract

sql_query = """
                SELECT distinct(license) FROM Archive
                WHERE abstract REGEXP "%\(([A-Za-z][^_ /\\<>]{5,})\)%"
"""

spark.sql(sql_query).show()



+--------------------+
|             license|
+--------------------+
|http://arxiv.org/...|
|http://creativeco...|
|http://creativeco...|
|http://creativeco...|
|             unknown|
+--------------------+



                                                                                

In [10]:
# Extract the statistic of the number of pages for unknown licenses

import re

def get_no_of_pages(line):
    search = re.findall('\d+ pages', line)
    
    if search:
        return int(search[0].split(" ")[0])
    else:
        return 0
    
spark.udf.register("page_numbers", get_no_of_pages)

sql_query = """
            SELECT AVG(page_numbers(comments)) as avg,
            SUM(page_numbers(comments)) AS sum,
            STD(page_numbers(comments)) AS std
            FROM Archive
            WHERE license = "unknown"
""" 

spark.sql(sql_query).show()



+------------------+---------+------------------+
|               avg|      sum|               std|
+------------------+---------+------------------+
|13.368011068572079|5642584.0|16.777518213632323|
+------------------+---------+------------------+



                                                                                