In [None]:
from sourced.spark import API as Engine
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Create (or reuse) a Spark session named "Examples" on the "local[*]" master.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Examples")
    .getOrCreate()
)

# Wrap the session in the engine; "/repositories" is presumably the directory
# holding the repositories to analyse (confirm against the deployment).
engine = Engine(spark, "/repositories")

### Count the total number of non-fork repositories

In [None]:
# Keep only non-fork repositories, de-duplicate their ids, and display how
# many distinct ids remain in a single "count" column.
(
    engine.repositories
    .filter("is_fork = false")
    .select("id")
    .distinct()
    .agg(count("*").alias("count"))
    .select("count")
    .show()
)

### Get all the files of all head commits

In [None]:
# Walk from non-fork repositories through their head reference to the files
# of its first reference commit, run language classification, and keep only
# non-binary files for which a language was detected.
# The result is cached because the following cells all reuse head_files.
head_files = (
    engine.repositories
    .filter("is_fork = false")
    .references
    .head_ref
    .commits
    .first_reference_commit
    .files
    .classify_languages()
    .filter("is_binary = false")
    .select("file_hash", "path", "content", "lang")
    .filter("lang is not null")
    .cache()
)

### Get the schema

In [None]:
# Print the schema (column names and types) of the head_files DataFrame.
head_files.printSchema()

### Print result

In [None]:
# Display the first rows of head_files, using show()'s default settings.
head_files.show()

### Top languages by number of files

In [None]:
# Count distinct files per language and rank languages by that count,
# most frequent first.
#
# Two fixes compared with the previous version of this cell:
#   * show() returns None, so the result must be bound to the name *before*
#     calling show(); otherwise top_ten_langs is always None.
#   * the ascending orderBy("count") was immediately superseded by the
#     descending sort and has been dropped.
#
# NOTE(review): the name says "top ten" but the query keeps 20 rows. The
# original limit is preserved here; confirm which one was intended.
top_ten_langs = (
    head_files.distinct()
    .groupBy("lang")
    .agg(count("*").alias("count"))
    .orderBy(desc("count"))
    .limit(20)
)
top_ten_langs.show()

### Count all Java files

In [None]:
 # Aggregate the number of files per language, then keep and display only
 # the row for Java.
 (
     head_files
     .groupBy("lang")
     .agg(count("*").alias("count"))
     .filter("lang='Java'")
     .show()
 )

### Extract UASTs from all files classified as Python in HEAD references

In [None]:
# Restrict head_files to Python files, extract their UASTs, and display at
# most ten rows with the file hash, path, language and UAST columns.
(
    head_files
    .where('lang = "Python"')
    .extract_uasts()
    .select('file_hash', 'path', 'lang', 'uast')
    .limit(10)
    .show()
)