In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, avg, lit, udf, count, sum
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import pandas
import numpy as np
import json
import requests
import os

from data_utils import get_date_range, get_data, clear_data
from datetime import datetime

In [2]:
# Obtain the Spark session for this notebook; getOrCreate() reuses an already
# running session when there is one instead of starting a second one.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

# SQLContext built on the session's SparkContext; the data-loading cell reads
# the CSV through this handle.
sqlContext = SQLContext(sparkContext=spark.sparkContext)

In [3]:
# Load the raw event rows from CSV with an explicit schema, expose them to SQL
# as the "events" view, then aggregate them per Repo.
df = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true')
    .load(
        'data_april2020.csv',
        schema=(
            'Repo STRING, Language STRING, Comment STRING, '
            'Forks_Count INTEGER, Stargazers_count INTEGER, '
            'Open_issues_count INTEGER, Date TIMESTAMP'
        ),
    )
)
df.printSchema()

# Register the un-aggregated rows as SQL table "events"; the spark.sql(...)
# cells further down query this view, not the aggregated frame.
df.createOrReplaceTempView("events")

# One row per Repo: mean forks, number of rows with a non-null Repo, and mean
# stargazers, ordered from the smallest Count upwards.
# NOTE(review): the captured output shows Repo values that look like comment
# fragments with null numeric columns -- presumably rows split by embedded
# newlines in the CSV; confirm the file's quoting before trusting these counts.
df = (
    df.groupBy(col("Repo"))
    .agg(
        avg("Forks_Count").alias("Forks_Count"),
        count("Repo").alias("Count"),
        avg("Stargazers_count").alias("Average_Stargazers_count"),
    )
    .orderBy(col("Count").asc())
)

root
 |-- Repo: string (nullable = true)
 |-- Language: string (nullable = true)
 |-- Comment: string (nullable = true)
 |-- Forks_Count: integer (nullable = true)
 |-- Stargazers_count: integer (nullable = true)
 |-- Open_issues_count: integer (nullable = true)
 |-- Date: timestamp (nullable = true)



In [4]:
# Convert the first five rows of the sorted aggregate to pandas for display
# and plotting. limit(5) is applied on the Spark side before toPandas(), so
# only five rows are collected to the driver instead of the entire grouped
# frame being materialised and then truncated with head(5).
# The variable name is historical: df is grouped by Repo, not by Language.
languages_grouped_df = df.limit(5).toPandas()
languages_grouped_df

Unnamed: 0,Repo,Forks_Count,Count,Average_Stargazers_count
0,,,0,
1,addBpmnElement = false;,,1,
2,# + extraPathInfo - The part of the URL which ...,,1,
3,- {column_offset: 3 filter_name: F3 pupil: P3 ...,,1,
4,So I think the behavior that Pavel implemented...,,1,


In [5]:
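# # Draw a plot (currently disabled: the label loop below reads a "Language" column, but df is grouped by Repo and has no such column)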
# cmap = plt.get_cmap('jet')
# colors = cmap(np.linspace(0, 1.0, 5))

# plt.rcParams["figure.figsize"] = (10, 5)
# plt.scatter(
#     x = languages_grouped_df["Forks_Count"],
#     y = languages_grouped_df["Average_Stargazers_count"],
#     s = languages_grouped_df["Count"]/8,
#     alpha = 0.5,
#     c = colors
# )

# # plt.title("Most popular languages by number of contributions")
# plt.xlabel("Forks")
# plt.ylabel("Average stargazers")
        
# x,y = languages_grouped_df["Forks_Count"], languages_grouped_df["Average_Stargazers_count"]


# #labels
# for i, name in enumerate(languages_grouped_df["Language"]):
#     plt.annotate(name, (x[i], y[i]))
#     print(name, (x[i], y[i]))
    
# plt.savefig('most_popular_languages.png')

In [6]:
# Per-repository totals of forks, stargazers and open issues for repos whose
# name contains "aws/", "google/" or "facebook/", ordered by total forks
# (ascending); rows containing any null are dropped.
# DataFrame.show() only prints and returns None, so the DataFrame is bound to
# companies_df first and displayed in a separate step -- otherwise the variable
# would hold None and be unusable afterwards.
companies_df = spark.sql(
    "SELECT Repo, sum(Forks_Count) as Forks_Count, "
    "sum(Stargazers_count) as Stargazers_count, "
    "sum(Open_issues_count) as Open_issues_count "
    "FROM events "
    "WHERE Repo LIKE '%aws/%'  OR Repo LIKE '%google/%' OR Repo LIKE '%facebook/%' "
    "GROUP BY Repo ORDER BY Forks_Count ASC"
).dropna()
companies_df.show()

+--------------------+-----------+----------------+-----------------+
|                Repo|Forks_Count|Stargazers_count|Open_issues_count|
+--------------------+-----------+----------------+-----------------+
|aws/aws-xray-sdk-...|          0|               0|                0|
|     aws/aws-sdk-php|          0|               0|                0|
|       google/qkeras|          0|               0|                0|
|      aws/aws-sdk-js|          0|               0|                0|
|    google/syzkaller|          0|               5|                0|
|aws/aws-deep-lear...|          0|               0|                0|
|          google/rrg|          0|               0|                0|
|aws/aws-encryptio...|          0|               0|                0|
|aws/aws-xray-sdk-...|          0|               0|                0|
|       facebook/yoga|          0|               0|                0|
|aws/aws-parallelc...|          0|               0|                0|
|     google/cadviso

In [7]:
# Repo, Comment and Language of every event whose Repo contains "amazon",
# "google" or "facebook" anywhere in its name.
# DataFrame.show() only prints and returns None, so the DataFrame is bound to
# pullrequest_df first and displayed in a separate step -- otherwise the
# variable would hold None and be unusable afterwards.
pullrequest_df = spark.sql(
    "SELECT Repo, Comment, Language FROM events "
    "WHERE Repo LIKE '%amazon%'  OR Repo LIKE '%google%' OR Repo LIKE '%facebook%'"
)
pullrequest_df.show()

+--------------------+-------------------------------------+----------+
|                Repo|                              Comment|  Language|
+--------------------+-------------------------------------+----------+
|https://developer...|                                 null|      null|
|When search on go...|                                 null|      null|
|google/web-storie...|                 Why is `width=0` ...|JavaScript|
|aws/amazon-chime-...|                 I chose types her...|TypeScript|
|googleads/googlea...|                 @ericleich thanks...|      Java|
|aws/amazon-chime-...|                 Good point will u...|TypeScript|
|Schmavery/faceboo...|                 The use case that...|JavaScript|
|amazonpay-labs/am...|                 ```updateButtonSt...|JavaScript|
|amazonpay-labs/am...|                 ```'#updateChecko...|JavaScript|
|amazonpay-labs/am...|こちら想定外エラーになると思うので...|JavaScript|
|google/web-storie...|                 I don't really li...|JavaScript|
|googleap