In [1]:
import numpy as np
import pandas as pd
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql import functions as f

spark = SparkSession.builder.enableHiveSupport().getOrCreate()

# Suppresses user warinig messages in Python
import warnings
warnings.simplefilter("ignore", UserWarning)

# Suppresses `WARN` messages in JVM
spark.sparkContext.setLogLevel("ERROR")

In [2]:
import importlib
ptesting_model = importlib.import_module("ptesting-model")

In [3]:
expected_input_cols = [                                                                                                                                                                                                                       
    'author',
    'case when length(sha) > 0 then sha else sha(string(random())) end sha',
    'commit_date',
    'array_distinct(failed_tests) failed_tests',
    'files' 
]

log_data_df = spark.read.format('json').load('../models/spark/logs/github-logs.json').selectExpr(expected_input_cols)

In [4]:
log_data_df.show()

+------------------+--------------------+-------------------+--------------------+--------------------+
|            author|                 sha|        commit_date|        failed_tests|               files|
+------------------+--------------------+-------------------+--------------------+--------------------+
|       ChenMichael|81d459196c22ef868...|2021/09/17 16:32:01|                  []|[{{2, 3, 1, sql/c...|
|          f-thiele|5ca6ad9afa3467b71...|2021/09/16 15:24:44|[pyspark.mllib.te...|[{{11, 17, 6, cor...|
|            viirya|8db8b50e0621b46e6...|2021/09/22 18:49:34|[org.apache.spark...|[{{19, 23, 4, sql...|
|            viirya|82ccaf18d64f46ffe...|2021/09/20 21:46:52|[org.apache.spark...|[{{1, 2, 1, sql/c...|
|            viirya|ba4172076f3f80305...|2021/07/04 06:37:05|[pyspark.pandas.t...|[{{1, 1, 0, sql/c...|
|            viirya|8678320d43e39c78c...|2021/04/15 01:29:22|                  []|[{{3, 6, 3, sql/c...|
|         karenfeng|259310f2df356f9a6...|2021/06/09 21:22:58|   

In [5]:
import json
from pathlib import Path 
excluded_tests = json.loads(Path('../models/spark/logs/excluded-tests.json').read_text())
log_data_df = ptesting_model._exclude_tests_from(log_data_df, excluded_tests)

In [6]:
from ptesting import github_utils

test_files = json.loads(Path('../models/spark/indexes/latest/test-files.json').read_text())
commits = json.loads(Path('../models/spark/logs/commits.json').read_text())
commits = list(map(lambda c: github_utils.from_github_datetime(c[0]), commits))
updated_file_stats = json.loads(Path('../models/spark/logs/updated-file-stats.json').read_text())
failed_tests = json.loads(Path('../models/spark/failed-tests.json').read_text())
contributor_stats = json.loads(Path('../models/spark/logs/contributor-stats.json').read_text())
dep_graph = json.loads(Path('../models/spark/indexes/latest/dep-graph.json').read_text())

to_train_features, _ = ptesting_model._create_train_test_pipeline(spark, test_files, commits, dep_graph, updated_file_stats, contributor_stats, failed_tests)
pdf = to_train_features.__call__(log_data_df).toPandas()

In [7]:
from ptesting import train

X = pdf[pdf.columns[pdf.columns != 'failed']]
y = pdf['failed']
X, y = train.rebalance_training_data(X, y, coeff=1.0)
X['failed'] = y
pdf = X
X = pdf[pdf.columns[pdf.columns != 'failed']]
y = pdf['failed']

2021-10-19 23:18:55.582 INFO train: Sampling training data (strategy={0: 861, 1: 861}): {0: 117277, 1: 861} => {0: 861, 1: 861}


In [8]:
X.describe()

Unnamed: 0,num_commits,updated_num_3d,updated_num_14d,updated_num_56d,updated_num_3c,updated_num_14c,updated_num_56c,num_adds,num_dels,num_chgs,file_card,failed_num_7d,failed_num_14d,failed_num_28d,failed_num_7c,failed_num_14c,failed_num_28c,total_failed_num,path_difference,distance
count,1722.0,1722.0,1722.0,1722.0,1722.0,1722.0,1722.0,1722.0,1722.0,1722.0,1722.0,1722.0,1722.0,1722.0,1722.0,1722.0,1722.0,1722.0,1722.0,1722.0
mean,103.85482,3.363531,8.544715,16.291521,0.350174,0.529036,4.009872,897.251452,267.829849,1165.081301,15.664344,0.527875,0.569106,0.631243,0.004646,0.009872,0.022648,1.296748,3.675958,56.979675
std,145.833822,4.024361,7.826305,13.218445,2.02825,2.148341,3.800275,7159.458195,1101.699746,7468.16299,60.802191,0.556595,0.598408,0.693139,0.068021,0.110021,0.167208,1.548248,3.93388,62.7009
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,5.0,0.0,0.0,0.0,48.0,7.0,55.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0
50%,83.0,1.0,6.0,19.0,0.0,0.0,3.0,80.0,9.0,91.0,4.0,0.5,1.0,1.0,0.0,0.0,0.0,1.0,3.5,2.0
75%,83.0,8.0,18.0,20.0,0.0,0.0,8.0,184.0,55.0,257.0,7.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,6.0,128.0
max,634.0,29.0,29.0,100.0,29.0,29.0,29.0,130829.0,9386.0,132030.0,793.0,3.0,4.0,7.0,1.0,2.0,3.0,12.0,128.0,128.0


In [9]:
import altair as alt

charts = []

for c in X.columns:
    charts.append(alt.Chart(X).mark_bar().encode(x=alt.X(c), y=alt.Y('count()', axis=alt.Axis(title='freq'))).properties(width=300, height=300))

alt.hconcat(*charts)

In [10]:
import altair as alt

target = 'distance'
target_min, target_max = 0, 10

alt.Chart(X[X[target].between(target_min, target_max)]).mark_bar().encode(
    x=alt.X(target, scale=alt.Scale(domain=[target_min, target_max])),
    y=alt.Y('count()', axis=alt.Axis(title='freq'))
).properties(width=400, height=400)

In [11]:
from sklearn import preprocessing

mm = preprocessing.MinMaxScaler()
norm_X = mm.fit_transform(X)

norm_pdf = pd.DataFrame(norm_X, columns=X.columns)
norm_pdf['failed'] = y

In [12]:
import altair as alt

x_axis = alt.X('path_difference', scale=alt.Scale(domain=[0.0, 1.0]))
y_axis = alt.X('total_failed_num', scale=alt.Scale(domain=[0.0, 1.0]))
color=alt.Color('failed:N', scale=alt.Scale(range=['blue', 'red']))

alt.Chart(norm_pdf).mark_point().encode(x=x_axis, y=y_axis, color=color).properties(width=600, height=400).interactive()

In [13]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
_X = X[['failed_num_14d', 'updated_num_56c', 'distance', 'num_commits']]
_X = pca.fit_transform(_X)

In [14]:
import altair as alt

_X = pd.DataFrame({'PCA-X': _X[:, 0], 'PCA-Y': _X[:, 1], 'failed': y})
alt.Chart(_X).mark_point().encode(x='PCA-X', y='PCA-Y', color='failed:N').properties(width=600, height=400).interactive()

In [15]:
sampled_pdf = pdf.sample(n=100, random_state=0)
sampled_pdf

Unnamed: 0,num_commits,updated_num_3d,updated_num_14d,updated_num_56d,updated_num_3c,updated_num_14c,updated_num_56c,num_adds,num_dels,num_chgs,...,failed_num_7d,failed_num_14d,failed_num_28d,failed_num_7c,failed_num_14c,failed_num_28c,total_failed_num,path_difference,distance,failed
465,0,0,4,18,0,0,0,39,0,39,...,0,0,0,0,0,0,1,2,2,0
124,266,0,22,27,0,1,6,1155,25,1180,...,0,1,1,0,0,0,1,6,2,0
646,190,0,0,0,0,0,0,587,435,1022,...,0,0,0,0,0,0,4,2,1,0
249,279,0,2,12,0,0,2,30,8,38,...,0,0,0,0,0,0,0,6,2,0
1594,83,8,18,20,0,0,8,48,7,55,...,1,1,1,0,0,0,1,7,128,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
794,0,1,4,4,1,1,1,167,30,197,...,0,0,0,0,0,0,1,6,2,0
1658,0,2,11,16,0,1,4,1,1,2,...,1,1,1,0,0,0,1,5,128,1
1542,83,8,18,20,0,0,8,48,7,55,...,1,1,1,0,0,0,1,8,128,1
233,0,0,1,11,0,0,0,193,43,236,...,0,0,0,0,0,0,1,1,2,0


In [16]:
# One of non-linear embedding in sklearn
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, random_state=0)
tf = tsne.fit_transform(sampled_pdf)
tsne.kl_divergence_

0.08989322185516357

In [17]:
import altair as alt

_X = pd.DataFrame({'tSNE-X': tf[:, 0], 'tSNE-Y': tf[:, 1], 'failed': sampled_pdf['failed']})
alt.Chart(_X).mark_point().encode(x='tSNE-X', y='tSNE-Y', color='failed:N').properties(width=600, height=400).interactive()

In [18]:
cols = ['failed_num_14d', 'updated_num_56c', 'distance', 'num_commits']

alt.Chart(norm_pdf.sample(n=500, random_state=0)).mark_circle().encode(
    alt.X(alt.repeat("column"), type='quantitative'),
    alt.Y(alt.repeat("row"), type='quantitative'),
    color='failed:N'
).properties(width=200, height=200).repeat(row=cols, column=cols)

In [19]:
from sklearn.ensemble import RandomForestRegressor
from boruta import BorutaPy

rf = RandomForestRegressor(n_jobs=-1, max_depth=5)
rf.fit(X, y)
print('SCORE with ALL Features: %1.2f\n' % rf.score(X, y))

rf = RandomForestRegressor(n_jobs=-1, max_depth=5)
fs = BorutaPy(rf, n_estimators='auto', random_state=0)
fs.fit(X.values, y.values)

selected = fs.support_
print('Selected Features: %s' % ','.join(X.columns[selected]))

X_selected = X[X.columns[selected]]
rf = RandomForestRegressor(n_jobs=-1, max_depth=5)
rf.fit(X_selected, y)
print('SCORE with selected Features: %1.2f' % rf.score(X_selected, y))

SCORE with ALL Features: 0.97

Selected Features: updated_num_14d,num_dels,num_chgs,failed_num_7d,failed_num_14d,failed_num_28d,failed_num_28c,total_failed_num,path_difference,distance
SCORE with selected Features: 0.97
