In [None]:
import json
import os

from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# Spark configuration shared by every query cell below.
#
# Only the Neo4j connector is shipped as a local jar. The MongoDB connector is
# resolved through `spark.jars.packages` (10.2.0, built for Scala 2.12), which
# is the version that provides the `mongodb` format and the
# `spark.mongodb.read.*` options used by the query cells.
# The previously listed `./mongo-spark-connector_2.11-2.0.0.jar` was removed:
# it is a second, older copy of the same connector built for Scala 2.11 and
# does not belong on the same classpath as the `_2.12` artifacts.
conf = SparkConf()
conf.set("spark.jars", "./neo4j-connector-apache-spark_2.12-5.0.1_for_spark_3.jar")
conf.set("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.0")
# The connection URI can be overridden with MONGODB_URI; the default keeps the
# original local-development behaviour.
conf.set(
    "spark.mongodb.read.connection.uri",
    os.environ.get("MONGODB_URI", "mongodb://localhost:27017"),
)

# Local session using every available core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('rede-devs')
    .config(conf=conf)
    .getOrCreate()
)

In [142]:
# QUERY 1
# Average profile of the developers that share more than 12 relationships
# with the developer identified by `devCode`.

devCode = '850'

# Cypher: find the searched developer, then every other Developer that points
# at a node the searched developer also points at, counting the shared
# relationships per developer and keeping those with more than 12.
# NOTE(review): `num_relacoes` is part of the RETURN grouping key, so this
# yields one row per distinct relationship count, and `.limit(1)` below keeps
# only one of them. If the intent is "all developers above the threshold",
# drop `num_relacoes` from the RETURN clause - confirm before changing.
query = """
    MATCH (dev_buscado:Developer {code: '""" + devCode + """'})
    MATCH(dev_buscado)-[]->()<-[r]-(dev_encontrados:Developer)
    WITH count(r) as num_relacoes, dev_encontrados, dev_buscado
    WHERE num_relacoes > 12
    RETURN COLLECT(dev_encontrados) as encontrados, num_relacoes, dev_buscado
 """

# Connection settings can be overridden through the environment; the defaults
# preserve the original local-development values.
query1 = (
    spark.read.format("org.neo4j.spark.DataSource")
    .option("url", os.environ.get("NEO4J_URL", "bolt://localhost:7687"))
    .option("authentication.type", "basic")
    .option("authentication.basic.username", os.environ.get("NEO4J_USER", "neo4j"))
    .option("authentication.basic.password", os.environ.get("NEO4J_PASSWORD", "password"))
    .option("query", query)
    .option("partitions", "1")
    .load()
    .limit(1)
)

# Fail with an explicit message instead of a bare IndexError when the graph
# query matches nothing.
query1_rows = query1.collect()
if not query1_rows:
    raise ValueError("QUERY 1 returned no rows for developer code " + repr(devCode))
queryResult = query1_rows[0]

# Codes of the similar developers, used to look up their profiles in MongoDB.
codes = [similar.code for similar in queryResult.encontrados]

pipeline = [
    {
        "$match": {
            "Code": {
                "$in": codes
            }
        }
    },
    {
        # A constant `_id` puts every matched document in a single group, so
        # each `$avg` is computed over all similar developers.
        "$group": {
            "_id": "null",
            "ConvertedSalary": {"$avg": "$ConvertedSalary"},
            "Exercise": {"$avg": "$Exercise"},
            "YearsCoding": {"$avg": "$YearsCoding"},
            "YearsCodingProf": {"$avg": "$YearsCodingProf"},
            "HoursComputer": {"$avg": "$HoursComputer"},
            "HoursOutside": {"$avg": "$HoursOutside"}
        }
    }
]

# The pipeline is serialised with json.dumps: `.option()` would otherwise
# stringify the Python list with str(), which is a Python repr and not JSON.
# `result` now holds the DataFrame (it used to be the None returned by show()).
result = (
    spark.read.format("mongodb")
    .option("spark.mongodb.read.database", "rede-devs")
    .option("spark.mongodb.read.collection", "developers")
    .option("spark.mongodb.read.aggregation.pipeline", json.dumps(pipeline))
    .load()
    .select('ConvertedSalary', 'Exercise', 'YearsCoding', 'YearsCodingProf', 'HoursComputer', 'HoursOutside')
)
result.show()

+---------------+--------+-----------+---------------+-----------------+------------------+
|ConvertedSalary|Exercise|YearsCoding|YearsCodingProf|    HoursComputer|      HoursOutside|
+---------------+--------+-----------+---------------+-----------------+------------------+
|        19165.9|     1.9|     9.0625|            6.8|9.857142857142858|1.4115384615384616|
+---------------+--------+-----------+---------------+-----------------+------------------+



In [143]:
# QUERY 2
# Developers that share more than 12 relationships with the developer
# identified by `devCode` (candidate list).

devCode = '44'

# Same graph pattern as QUERY 1: developers pointing at nodes the searched
# developer also points at, filtered to more than 12 shared relationships.
# NOTE(review): `num_relacoes` is part of the RETURN grouping key, so this
# yields one row per distinct relationship count, and `.limit(1)` below keeps
# only one of them - confirm this is intended.
query = """
    MATCH (dev_buscado:Developer {code: '""" + devCode + """'})
    MATCH(dev_buscado)-[]->()<-[r]-(dev_encontrados:Developer)
    WITH count(r) as num_relacoes, dev_encontrados, dev_buscado
    WHERE num_relacoes > 12
    RETURN COLLECT(dev_encontrados) as encontrados, num_relacoes, dev_buscado
 """

# Connection settings can be overridden through the environment; the defaults
# preserve the original local-development values.
query2 = (
    spark.read.format("org.neo4j.spark.DataSource")
    .option("url", os.environ.get("NEO4J_URL", "bolt://localhost:7687"))
    .option("authentication.type", "basic")
    .option("authentication.basic.username", os.environ.get("NEO4J_USER", "neo4j"))
    .option("authentication.basic.password", os.environ.get("NEO4J_PASSWORD", "password"))
    .option("query", query)
    .option("partitions", "1")
    .load()
    .limit(1)
)

def getUserProfiles(codes):
    """Fetch the MongoDB profiles of the developers whose `Code` is in `codes`.

    Reads the `developers` collection of the `rede-devs` database through the
    Spark `mongodb` data source and keeps the `Name`, `Code`, `Age`,
    `YearsCoding` and `YearsCodingProf` columns.

    Args:
        codes: iterable of developer codes to look up.

    Returns:
        A list with one Spark `Row` per matching developer; an empty list when
        `codes` is empty.
    """
    codes = list(codes)
    if not codes:
        # Nothing to look up. Returning early also avoids selecting columns
        # from a read that matched no documents.
        return []

    pipeline = [
        {
            "$match": {
                "Code": {
                    "$in": codes
                }
            }
        }
    ]

    # Serialise explicitly: `.option()` would otherwise stringify the list
    # with str(), which is a Python repr and not JSON.
    return (
        spark.read.format("mongodb")
        .option("spark.mongodb.read.database", "rede-devs")
        .option("spark.mongodb.read.collection", "developers")
        .option("spark.mongodb.read.aggregation.pipeline", json.dumps(pipeline))
        .load()
        .select('Name', 'Code', 'Age', 'YearsCoding', 'YearsCodingProf')
        .collect()
    )

# Fail with an explicit message instead of a bare IndexError when the graph
# query matches nothing.
query2_rows = query2.collect()
if not query2_rows:
    raise ValueError("QUERY 2 returned no rows")
queryResult = query2_rows[0]

# Codes of the developers found in the graph, used to fetch their profiles.
codes = [similar.code for similar in queryResult.encontrados]

candidatos = getUserProfiles(codes)
candidatos

[Row(Name='Victoria Lambert', Code='55', Age=30, YearsCoding=10, YearsCodingProf=4),
 Row(Name='Cameron Salinas', Code='81', Age=40, YearsCoding=10, YearsCodingProf=10),
 Row(Name='Adrienne Smith', Code='289', Age=40, YearsCoding=19, YearsCodingProf=16),
 Row(Name='Joseph Wolfe', Code='298', Age=40, YearsCoding=19, YearsCodingProf=16),
 Row(Name='Willie Owens', Code='343', Age=None, YearsCoding=16, YearsCodingProf=13),
 Row(Name='Brenda Eaton', Code='352', Age=30, YearsCoding=16, YearsCodingProf=10),
 Row(Name='Dan Swanson', Code='360', Age=40, YearsCoding=7, YearsCodingProf=7),
 Row(Name='Ashley Lambert', Code='551', Age=30, YearsCoding=13, YearsCodingProf=7),
 Row(Name='Angela Barnes', Code='580', Age=None, YearsCoding=4, YearsCodingProf=None),
 Row(Name='Matthew Small', Code='623', Age=30, YearsCoding=4, YearsCodingProf=1),
 Row(Name='Holly Mitchell', Code='628', Age=50, YearsCoding=22, YearsCodingProf=30),
 Row(Name='Katherine Brown', Code='701', Age=30, YearsCoding=4, YearsCodingP

In [139]:
# QUERY 3
# Study-group suggestion: a developer (teacher) who worked with a language,
# plus developers from the same country who want to learn it.

# Cypher, step by step:
#  - `d` worked with `lang`, `s` wants to learn it, and both are from `country`;
#  - `d` does not want to learn `lang` and `s` has not worked with it;
#  - keep (d, lang, country) combinations with at least 4 such students;
#  - additionally require a language `commonLang` that `d` and one of those
#    students have both worked with;
#  - return the teacher, the language, the first 4 collected students, the
#    common language and the country.
query = """
    MATCH(d:Developer)-[:WORKED_WITH]->(lang:Language)<-[:WANTS_TO_LEARN]-(s:Developer)
    MATCH(d)-[:IS_FROM]->(country:Country)<-[:IS_FROM]-(s)
    WHERE NOT (d)-[:WANTS_TO_LEARN]->(lang)
    AND NOT (s)-[:WORKED_WITH]->(lang)
    WITH d, lang, country, COLLECT(s) AS students
    WHERE SIZE(students) >= 4
    MATCH (d)-[:WORKED_WITH]->(commonLang:Language)<-[:WORKED_WITH]-(student:Developer)
    WHERE student IN students
    RETURN d as teacher, lang, students[0..4] as estudantes, commonLang, country
"""

# Connection settings can be overridden through the environment; the defaults
# preserve the original local-development values. Only the first returned row
# is kept.
query3 = (
    spark.read.format("org.neo4j.spark.DataSource")
    .option("url", os.environ.get("NEO4J_URL", "bolt://localhost:7687"))
    .option("authentication.type", "basic")
    .option("authentication.basic.username", os.environ.get("NEO4J_USER", "neo4j"))
    .option("authentication.basic.password", os.environ.get("NEO4J_PASSWORD", "password"))
    .option("query", query)
    .option("partitions", "1")
    .load()
    .limit(1)
)

# NOTE(review): this duplicates the `getUserProfiles` defined in the QUERY 2
# cell; consider keeping a single definition in a shared cell.
def getUserProfiles(codes):
    """Fetch the MongoDB profiles of the developers whose `Code` is in `codes`.

    Reads the `developers` collection of the `rede-devs` database through the
    Spark `mongodb` data source and keeps the `Name`, `Code`, `Age`,
    `YearsCoding` and `YearsCodingProf` columns.

    Args:
        codes: iterable of developer codes to look up.

    Returns:
        A list with one Spark `Row` per matching developer; an empty list when
        `codes` is empty.
    """
    codes = list(codes)
    if not codes:
        # Nothing to look up. Returning early also avoids selecting columns
        # from a read that matched no documents.
        return []

    pipeline = [
        {
            "$match": {
                "Code": {
                    "$in": codes
                }
            }
        }
    ]

    # Serialise explicitly: `.option()` would otherwise stringify the list
    # with str(), which is a Python repr and not JSON.
    return (
        spark.read.format("mongodb")
        .option("spark.mongodb.read.database", "rede-devs")
        .option("spark.mongodb.read.collection", "developers")
        .option("spark.mongodb.read.aggregation.pipeline", json.dumps(pipeline))
        .load()
        .select('Name', 'Code', 'Age', 'YearsCoding', 'YearsCodingProf')
        .collect()
    )

# Fail with an explicit message instead of a bare IndexError when no study
# group matches the graph query.
query3_rows = query3.collect()
if not query3_rows:
    raise ValueError("QUERY 3 returned no rows")
queryResult = query3_rows[0]

# Look up the MongoDB profiles of the selected students and of the teacher.
student_codes = [estudante.code for estudante in queryResult.estudantes]
estudantes = getUserProfiles(student_codes)

teacher = getUserProfiles([queryResult.teacher.code])

# `teacher` is a list of rows, so it is printed in list form.
print('Grupo de estudos de: ' + queryResult.lang.language)
print('Professor(a): ' + str(teacher))
print('Estudantes: ')
for student in estudantes:
    print(student)

Grupo de estudos de: Ruby
Professor(a): [Row(Name='Kathryn Bentley', Code='842', Age=40, YearsCoding=16, YearsCodingProf=16)]
Estudantes: 
Row(Name='Kevin Richardson', Code='725', Age=23, YearsCoding=10, YearsCodingProf=4)
Row(Name='Lindsay Evans', Code='850', Age=None, YearsCoding=19, YearsCodingProf=19)
Row(Name='Henry Francis', Code='1102', Age=30, YearsCoding=1, YearsCodingProf=1)
Row(Name='Deanna Harris', Code='1377', Age=40, YearsCoding=22, YearsCodingProf=19)
