In [1]:
# Mount Google Drive at /content/drive so the files under
# '/content/drive/My Drive/...' used by later cells are reachable.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import numpy as np
import pandas as pd
from scipy import stats
import shelve
import csv

# Directory holding the word-regression shelve files read by the analysis cells.
load_path = '/content/drive/My Drive/Word regression/word_regr_objects/'
# Output directory. NOTE(review): not referenced by the cells visible here,
# which hard-code their own output paths.
out_path = "/content/drive/My Drive/Word regression"

# Metrics implementation

In [0]:
def load_word_regression_data(path, user_id):
  """Fetch one user's stored entry from a shelve database.

  The shelf at `path` is opened, the value stored under str(user_id) is
  read, and the shelf is closed again before the value is returned.
  A missing key raises KeyError.
  """
  key = str(user_id)
  shelf = shelve.open(path)
  try:
    return shelf[key]
  finally:
    # Always release the shelf, even when the key lookup fails.
    shelf.close()

In [0]:
def get_common_words(A, B):
  """Return the keys of A that are also keys of B, in A's iteration order."""
  keys_of_B = B.keys()
  return [word for word in A.keys() if word in keys_of_B]

def get_common_word_count(A, B):
  """Count the keys that A and B have in common."""
  shared_words = get_common_words(A, B)
  return len(shared_words)

In [0]:
from gensim.matutils import jaccard_distance

def get_jaccard_similarity(A, B):
  """Jaccard similarity of A and B, taken as 1 minus gensim's jaccard_distance."""
  distance = jaccard_distance(A, B)
  return 1 - distance


def get_jaccard_dist(A, B):
  """Jaccard distance between A and B; thin wrapper over gensim's jaccard_distance."""
  distance = jaccard_distance(A, B)
  return distance

In [0]:
def get_overlap_ratio(A,B):
  """Measure how strongly the key sets of two dictionaries overlap.

  Returns a tuple (overlap_ratio, jaccard_sim): overlap_ratio is the number
  of shared keys divided by the mean of the two key counts, and jaccard_sim
  is get_jaccard_similarity applied to the two key views. When both
  dictionaries are empty, (0.0, 1.0) is returned directly.
  """
  keys_A, keys_B = A.keys(), B.keys()
  size_A, size_B = len(keys_A), len(keys_B)

  # Guard: with no keys on either side the mean size below would be zero.
  if size_A == 0 and size_B == 0:
    return 0.0, 1.0

  jaccard_sim = get_jaccard_similarity(keys_A, keys_B)

  mean_size = (size_A + size_B) / 2
  shared_count = get_common_word_count(A, B)
  return shared_count / mean_size, jaccard_sim

In [0]:
from collections import Counter

def count_frequent_words(list_of_words):
  """Tally how often each word occurs; returns a collections.Counter."""
  tally = Counter()
  tally.update(list_of_words)
  return tally

In [0]:
def get_changed_ratio(A, B):
  """Fraction of A's keywords that do not appear in B.

  Args:
    A: dict keyed by keyword (the baseline side).
    B: dict keyed by keyword (the side compared against).

  Returns:
    (len(A) - number of shared keys) / len(A), or the string "N/A" when A
    has no keys, the same undefined-ratio marker get_keyword_num_ratio uses.
  """
  total = len(A.keys())
  # An empty A has no meaningful ratio; report the marker instead of
  # raising ZeroDivisionError.
  if total == 0:
    return "N/A"
  changed_num = total - get_common_word_count(A, B)
  return changed_num / total

In [0]:
def get_keyword_num_ratio(A, B):
  """Ratio of A's key count to B's key count, or "N/A" when B has no keys."""
  size_B = len(B.keys())
  if size_B != 0:
    return len(A.keys()) / size_B
  return "N/A"

def get_change_in_coefficients(A, B):
  """Differences B[word] - A[word] over the keys shared by A and B, as a NumPy array."""
  deltas = [B[word] - A[word] for word in get_common_words(A, B)]
  return np.array(deltas)

In [0]:
def get_stats_on_coeffChange(A, B):
  """scipy.stats.describe summary of the coefficient changes over keys shared by A and B."""
  coeff_changes = get_change_in_coefficients(A, B)
  return stats.describe(coeff_changes)

def get_stats_on_all_coeff(A):
  """scipy.stats.describe summary of all values stored in dictionary A."""
  coefficients = np.array(list(A.values()))
  return stats.describe(coefficients)

# RQ2: Overlap

This addresses RQ2: Do developers build similar expertise on two different collaborative platforms, Stack Overflow and GitHub? Is there knowledge transfer from one platform to another?

In [0]:
def get_overlap(A_wordRegrObj_path, B_wordRegrObj_path, num_users=83550):
  """Compute per-user overlap ratio and Jaccard similarity between two shelves.

  Args:
    A_wordRegrObj_path: path of the first shelve file.
    B_wordRegrObj_path: path of the second shelve file.
    num_users: user ids 0 .. num_users - 1 are processed; each id must be
      present (as a string key) in both shelves, otherwise KeyError is raised.

  Returns:
    (overlap_ratios, jaccard_sims): two NumPy arrays with one entry per
    user, holding the values returned by get_overlap_ratio.
  """
  list_of_all_overlap_ratios = []
  list_of_all_jaccard_sims = []
  # Open each shelf once for the whole run instead of re-opening both
  # files for every user.
  with shelve.open(A_wordRegrObj_path) as A_db, shelve.open(B_wordRegrObj_path) as B_db:
    for user_i in range(num_users):
      if user_i % 10000 == 0:
        print(user_i)  # coarse progress indicator
      user_key = str(user_i)

      overlap_ratio, jaccard_sim = get_overlap_ratio(A_db[user_key], B_db[user_key])

      list_of_all_overlap_ratios.append(overlap_ratio)
      list_of_all_jaccard_sims.append(jaccard_sim)
  return np.asarray(list_of_all_overlap_ratios), np.asarray(list_of_all_jaccard_sims)

## Find out the overlap ratio in the GH_past - SO_past comparison

In [13]:
# GH_past vs SO_past: compute the per-user overlap ratio and Jaccard
# similarity, then summarize each array with scipy.stats.describe.
load_path = '/content/drive/My Drive/Word regression/word_regr_objects/'
A_path = load_path + "RQ2_GH_past_word_regression.shlf"
B_path = load_path + "RQ2_SO_past_word_regression.shlf"

overlap_ratios_np_arr1, jaccard_sim_np_arr1 = get_overlap(A_path, B_path)

overlap_ratio_stats1 = stats.describe(overlap_ratios_np_arr1)
jaccard_sim_stats1 = stats.describe(jaccard_sim_np_arr1)

0
10000
20000
30000
40000
50000
60000
70000
80000


In [0]:
# Summary statistics for the GH_past vs SO_past comparison.
print(overlap_ratio_stats1)
print(jaccard_sim_stats1)

DescribeResult(nobs=83550, minmax=(0.0, 1.0), mean=0.2881332983793958, variance=0.07420673211292472, skewness=1.344423571618871, kurtosis=0.9565861282399744)
DescribeResult(nobs=83550, minmax=(0.0, 1.0), mean=0.20907949640817658, variance=0.06844685208564713, skewness=2.026988946505363, kurtosis=3.3658878057576276)


## Find out the overlap ratio in the GH_recent - SO_recent comparison

In [0]:
# GH_recent vs SO_recent: compute the per-user overlap ratio and Jaccard
# similarity, then summarize each array with scipy.stats.describe.
load_path = "/content/drive/My Drive/Word regression/word_regr_objects/"
A_path = load_path + "RQ2_GH_recent_word_regression.shlf"
B_path = load_path + "RQ2_SO_recent_word_regression.shlf"

overlap_ratios_np_arr2, jaccard_sim_np_arr2 = get_overlap(A_path, B_path)

overlap_ratio_stats2 = stats.describe(overlap_ratios_np_arr2)
jaccard_sim_stats2 = stats.describe(jaccard_sim_np_arr2)

0
10000
20000
30000
40000
50000
60000
70000
80000


In [0]:
# Summary statistics for the GH_recent vs SO_recent comparison.
print(overlap_ratio_stats2)
print(jaccard_sim_stats2)

DescribeResult(nobs=83550, minmax=(0.0, 1.0), mean=0.3318606871326628, variance=0.11715920547125891, skewness=1.0572646463416364, kurtosis=-0.3746921998364092)
DescribeResult(nobs=83550, minmax=(0.0, 1.0), mean=0.26930786025849196, variance=0.12072818479150024, skewness=1.386437457289118, kurtosis=0.3188416472035356)


Write experiment results to file

In [0]:
# Assemble the per-user GH_past vs SO_past results into one table.
# NOTE(review): list_of_common_words1 is assigned only by the later RQ3 cell
# that calls get_frequent_common_words, so on a fresh top-to-bottom run this
# cell raises NameError. Run that cell first or move this export below it.
df1 = pd.DataFrame()
df1["user_id"] = range(0, 83550)
df1["overlap_ratios"] = overlap_ratios_np_arr1
df1["jaccard_similarity"] = jaccard_sim_np_arr1
df1["common_words"] = list_of_common_words1

# Disabled: the equivalent table for the GH_recent vs SO_recent comparison.
#df2 = pd.DataFrame()
#df2["user_id"] = range(0, 83550)
#df2["overlap_ratios"] = overlap_ratios_np_arr2
#df2["jaccard_similarity"] = jaccard_sim_np_arr2
#df2["common_words"] = list_of_common_words2

In [0]:
# Export the GH_past vs SO_past table to CSV without the index column;
# `h` supplies the header names written for the four columns.
h = ["user_id", "overlap_ratios", "jaccard_similarity", "common_words"]
df1.to_csv(path_or_buf = "/content/drive/My Drive/Word regression/new_RQ2_3_GH_SO_past_word_regr_data.csv", header = h, index=False)
# Disabled: the matching export for the GH_recent vs SO_recent table.
#df2.to_csv(path_or_buf = "/content/drive/My Drive/Word regression/new_RQ2_3_GH_SO_recent_word_regr_data.csv", header = h, index=False)

# RQ3: Frequent common words

This addresses RQ3: What knowledge is transferred/transferable from one platform to another?

In [0]:
def get_frequent_common_words(A_wordRegrObj_path, B_wordRegrObj_path, num_users=83550, top_n=50):
  """Collect each user's shared keywords and count them across all users.

  Args:
    A_wordRegrObj_path: path of the first shelve file.
    B_wordRegrObj_path: path of the second shelve file.
    num_users: user ids 0 .. num_users - 1 are processed; each id must be
      present (as a string key) in both shelves, otherwise KeyError is raised.
    top_n: how many of the most frequent shared words to print.

  Returns:
    (counter, list_of_common_words): the frequency counter built by
    count_frequent_words over every user's shared words, and a list holding
    one list of shared words per user (indexed by user id).
  """
  list_of_common_words = []
  list_of_all_common_words = []
  # Open each shelf once for the whole run instead of re-opening both
  # files for every user.
  with shelve.open(A_wordRegrObj_path) as A_db, shelve.open(B_wordRegrObj_path) as B_db:
    for user_i in range(num_users):
      if user_i % 10000 == 0:
        print(user_i)  # coarse progress indicator
      user_key = str(user_i)

      common_words = get_common_words(A_db[user_key], B_db[user_key])
      list_of_common_words.append(common_words)
      list_of_all_common_words.extend(common_words)

  counter = count_frequent_words(list_of_all_common_words)

  print('Top %d most common words:' % top_n)
  for word, count in counter.most_common(top_n):
      print('%s = %d' % (word, count))
  return counter, list_of_common_words

## Find out which are the most common words in GH_past - SO_past comparison

In [16]:
# GH_past vs SO_past: count the keywords shared per user and print the most
# frequent ones; list_of_common_words1 keeps each user's shared-word list.
load_path = '/content/drive/My Drive/Word regression/word_regr_objects/'
A_path = load_path + "RQ2_GH_past_word_regression.shlf"
B_path = load_path + "RQ2_SO_past_word_regression.shlf"

freq_counter1, list_of_common_words1 = get_frequent_common_words(A_path, B_path)

0
10000
20000
30000
40000
50000
60000
70000
80000
Top 50 most common words:
library = 43123
code = 37621
simple = 32762
type = 30948
javascript = 30044
project = 26255
web = 25083
tool = 24967
https = 24738
file = 24737
html = 22333
github = 21317
script = 20318
source = 20266
language = 19855
base = 16798
implementation = 16670
client = 15833
test = 15723
http = 15674
page = 15423
game = 13890
website = 13255
package = 12982
repository = 11690
add = 11421
method = 11421
line = 11252
api = 11144
datum = 10980
open = 10684
make = 9767
change = 9767
remove = 9767
function = 9767
work = 9767
case = 9767
check = 9767
comment = 9767
return = 9767
fix = 9767
good = 9767
error = 9767
google = 9406
application = 9322
programming = 8857
heroku = 8229
repo = 7977
org = 7977
markdown = 7977


## Find out which are the most common words in GH_recent - SO_recent comparison

In [0]:
# GH_recent vs SO_recent: count the keywords shared per user and print the
# most frequent ones; list_of_common_words2 keeps each user's shared-word list.
load_path = "/content/drive/My Drive/Word regression/word_regr_objects/"
A_path = load_path + "RQ2_GH_recent_word_regression.shlf"
B_path = load_path + "RQ2_SO_recent_word_regression.shlf"

freq_counter2, list_of_common_words2 = get_frequent_common_words(A_path, B_path)

0
10000
20000
30000
40000
50000
60000
70000
80000
Top 50 most common words:
test = 56302
simple = 51226
library = 44781
app = 43750
api = 35881
base = 26075
client = 25381
code = 22052
file = 21538
application = 19620
https = 16764
ruby = 15764
rail = 15764
 = 15764
ember = 15764
gem = 15764
heroku = 15764
buildpack = 15764
cli = 15764
rb = 15764
activerecord = 15764
rspec = 15764
active = 15764
github = 15297
web = 12890
add = 12849
change = 12849
remove = 12849
check = 12849
make = 11231
line = 11231
work = 11231
case = 11231
comment = 11231
return = 11231
set = 11231
type = 11064
javascript = 11040
function = 9673
method = 9555
project = 9426
user = 7713
extension = 4781
source = 4662
image = 4649
google = 4427
android = 3194
mobile = 3194
shell = 3010
script = 2950


# RQ4: Changed Ratio

This addresses RQ4: Does developers' expertise change over time? If so, how does the evolution of expertise differ between SO and GH?

In [0]:
def get_word_regr_analyze_data(A_name, B_name, A_path, B_path, num_users=83550,
                               out_dir="/content/drive/My Drive/Word regression/"):
  """Write per-user change metrics between two word-regression shelves to CSV.

  One row is written per user with: the changed ratio, the Jaccard distance
  between the key sets, the number of shared keywords, both key counts, the
  A/B key-count ratio, and the mean / variance / min / max of the coefficient
  change over shared keywords. Undefined values are written as "N/A".

  Args:
    A_name: label of the first data set; used in the header and file name.
    B_name: label of the second data set; used in the header and file name.
    A_path: path of the first shelve file.
    B_path: path of the second shelve file.
    num_users: user ids 0 .. num_users - 1 are processed; each id must be
      present (as a string key) in both shelves, otherwise KeyError is raised.
    out_dir: directory that receives
      '<A_name>_to_<B_name>RQ4_word_regr_data.csv'.

  Returns:
    None. The result is the CSV file; progress is printed to stdout.
  """
  if not out_dir.endswith('/'):
    out_dir += '/'
  header = ['internal_userID', 'changed_ratio', 'Jaccard Distance', 'numberOf_common_keywords', 'lengthOf_' + A_name, 'lengthOf_' + B_name,
            'num_keywords_ratio', 'coeff_change.mean', 'coeff_change.variance', 'coeff_change.min', 'coeff_change.max']

  # NOTE(review): there is no separator between B_name and 'RQ4' in the file
  # name; kept as-is because existing outputs and readers may depend on it.
  out_file = out_dir + A_name + '_to_' + B_name + 'RQ4_word_regr_data.csv'

  # newline='' lets the csv module control line endings itself. Both shelves
  # are opened once for the whole run instead of once per user.
  with open(out_file, 'w', newline='') as csvfile, \
       shelve.open(A_path) as A_db, shelve.open(B_path) as B_db:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(header) # write header
    print(A_name + '_to_' + B_name + ' :')
    for user_i in range(num_users):
      if user_i % 10000 == 0:
        print(user_i)  # coarse progress indicator
      user_key = str(user_i)
      A_regr = A_db[user_key]
      B_regr = B_db[user_key]
      lengthOf_A = len(A_regr.keys())
      lengthOf_B = len(B_regr.keys())

      # 1. Ratio of changed words to total words in A (undefined for empty A)
      if lengthOf_A > 0:
        changed_ratio = get_changed_ratio(A_regr, B_regr)
      else:
        changed_ratio = "N/A"

      # 2. Jaccard distance between the two keyword sets
      jaccard_dist = get_jaccard_dist(A_regr.keys(), B_regr.keys())

      # 3. Number of common keywords
      common_keywords_num = get_common_word_count(A_regr, B_regr)

      # 4. Ratio of the number of keywords in A to the number in B
      num_keywords_ratio = get_keyword_num_ratio(A_regr, B_regr)

      content = [user_i, changed_ratio, jaccard_dist, common_keywords_num,
                 lengthOf_A, lengthOf_B, num_keywords_ratio]

      if common_keywords_num > 0:
        # 5. Statistics of the coefficient change over the common keywords.
        # NOTE(review): with a single common keyword the sample variance is
        # undefined (nan) and numpy emits a RuntimeWarning — presumably the
        # source of the warnings seen when this runs; confirm.
        coeff_change_stats = get_stats_on_coeffChange(A_regr, B_regr)
        content += [coeff_change_stats.mean, coeff_change_stats.variance,
                    coeff_change_stats.minmax[0], coeff_change_stats.minmax[1]]
      else:
        content += ["N/A"] * 4
      csv_writer.writerow(content)


## Find out if expertise changes over time when comparing GH_past - GH_recent

In [20]:
# GH_past vs GH_recent: write the per-user change metrics to CSV.
# Relies on load_path having been set by an earlier cell.
A_path = load_path + "RQ4_GH_past_word_regression.shlf"
B_path = load_path + "RQ4_GH_recent_word_regression.shlf"

get_word_regr_analyze_data("GH_past", "GH_recent", A_path, B_path)

GH_past_to_GH_recent :
0


  **kwargs)
  ret = ret.dtype.type(ret / rcount)


10000
20000
30000
40000
50000
60000
70000
80000


## Find out if expertise changes over time when comparing SO_past - SO_recent

In [0]:
# SO_past vs SO_recent: write the per-user change metrics to CSV.
# Relies on load_path having been set by an earlier cell.
A_path = load_path + "RQ4_SO_past_word_regression.shlf"
B_path = load_path + "RQ4_SO_recent_word_regression.shlf"

get_word_regr_analyze_data("SO_past", "SO_recent", A_path, B_path)

SO_past_to_SO_recent :
0


  **kwargs)
  ret = ret.dtype.type(ret / rcount)


10000
20000
30000
40000
50000
60000
70000
80000


# Other

Potential metrics:

1.   Number of common keywords
2.   Average change in the common keywords' coefficients
3.   Past/Present ratio of number of keywords
4.   Statistical summaries of the coefficients (percentiles, means, medians,min max)
5.   Research and figure out how to compare distributions from past and present


In [0]:
def get_coeff_distr(A_name, B_name, A_path, B_path):
  path = "/content/drive/My Drive/Word regression/"
  header = ['user_id', A_name + '_coeff.numObs', A_name + '_coeff.mean', 
            A_name + '_coeff.variance', A_name + '_coeff.min', A_name + '_coeff.max', 
            B_name + '_coeff.numObs', B_name + '_coeff.mean', B_name + '_coeff.variance', 
            B_name + '_coeff.min',  B_name + '_coeff.max']

  with open(path + A_name + '_to_' + B_name + '_coeff_distr.csv','w') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(header) # write header
    
    for user_i in range(0, 83550):
      if user_i % 10000 == 0:
        print(user_i)
      A_regr = load_word_regression_data(A_path, user_i)
      B_regr = load_word_regression_data(B_path, user_i)

      A_coeff_stats = get_stats_on_all_coeff(A_regr)
      B_coeff_stats = get_stats_on_all_coeff(B_regr)

      content = [user_i, A_coeff_stats.nobs, A_coeff_stats.mean, A_coeff_stats.variance,
                A_coeff_stats.minmax[0], A_coeff_stats.minmax[1],
                B_coeff_stats.nobs, B_coeff_stats.mean, B_coeff_stats.variance,
                B_coeff_stats.minmax[0], B_coeff_stats.minmax[1] ]
      csv_writer.writerow(content)