In [1]:
import pandas as pd
import numpy as np

# Prep Golden Mapping

In [2]:
def remove_empty(x):
    result = sorted(list(set(x)))
    try:
        result.remove('')
    except:
        pass
    return result

golden = pd.read_csv('./2020_repologue_data/golden_set_v3.csv')

golden['alias'] = golden['alias'].replace(np.nan, '', regex=True)
golden['alias'] = golden['alias'].apply(lambda x: x+',')
golden['our_sub_topics'] = golden['our_sub_topics'].replace(np.nan, '', regex=True)

golden['free'] = golden['alias']+golden['our_sub_topics']
golden = golden[golden.free != ","]
golden['free'] = golden['free'].replace(np.nan, '', regex=True)
golden['free'] = golden['free'].apply(lambda x: x.split(','))
golden['free'] = golden['free'].apply(lambda x: sorted(list(set(x))))
golden['free'] = golden['free'].apply(lambda x: remove_empty(x))
golden['free_str'] = golden['free'].apply(lambda x: ','.join(x))

golden.rename(columns={'Topics':'topic'}, inplace=True)
golden.drop(['alias','our_sub_topics'], axis=1, inplace=True)

frees = sorted(list(set([x for row in list(golden.free) for x in row])))

# Prep Mapped Data

In [3]:
df = pd.read_csv('./2020_repologue_data/repos_152k_top228_topics_compressed_2.csv')

df['featured_topics'] = df['featured_topics'].replace(np.nan, '', regex=True)

df['original_topics'] = df['original_topics'].apply(lambda x: x.split(','))
df['featured_topics'] = df['featured_topics'].apply(lambda x: x.split(','))
df['augmented_top228topics'] = df['augmented_top228topics'].apply(lambda x: x.split(','))

df['featured_topics'] = df['featured_topics'].apply(lambda x: [] if x==[""] else x)

original_topics = sorted(list(set([x for row in list(df.original_topics) for x in row])))
featured_topics = sorted(list(set([x for row in list(df.featured_topics) for x in row])))
augmented_top228topics = sorted(list(set([x for row in list(df.augmented_top228topics) for x in row])))

# Map originals based on the Golden Mapping

def get_full_mapping(x,frees=frees,golden=golden):
    result = []
    for item in x:
        if item in frees:
            result.append(golden[golden['free_str'].str.contains(item)]['topic'].values[0])
    return result

df.insert(4,'mapped_topics', df['original_topics'].apply(get_full_mapping))

df['mapped_topics'] = df['featured_topics']+df['augmented_top228topics']+df['mapped_topics']
df['mapped_topics'] = df['mapped_topics'].apply(lambda x: sorted(list(set(x))))

mapped_topics = sorted(list(set([x for row in list(df.mapped_topics) for x in row])))

# Reports

df.insert(6, 'len_original_topics', df['original_topics'].apply(lambda x : len(x)))
df.insert(7, 'len_featured_topics', df['featured_topics'].apply(lambda x : len(x)))
df.insert(8, 'len_augmented_top228topics', df['augmented_top228topics'].apply(lambda x : len(x)))
df.insert(9, 'len_mapped_topics', df['mapped_topics'].apply(lambda x : len(x)))

print('Topic Count')
print(f"original_topics\t\t{len(original_topics)}\nfeatured_topics\t\t{len(featured_topics)}\
        \naugmented_top228topics\t{len(augmented_top228topics)}\nmapped_topics\t\t{len(mapped_topics)}")

print()

print('Averages Number of Topics Per Repository')
print(f"original_topics\t\t{df['len_original_topics'].mean()}\nfeatured_topics\t\t{df['len_featured_topics'].mean()}\
        \naugmented_top228topics\t{df['len_augmented_top228topics'].mean()}\nmapped_topics\t\t{df['len_mapped_topics'].mean()}")

Topic Count
original_topics		117865
featured_topics		344        
augmented_top228topics	228
mapped_topics		347

Averages Number of Topics Per Repository
original_topics		6.024346424655544
featured_topics		1.5948361272936233        
augmented_top228topics	2.0719031955508
mapped_topics		2.466862177835903


# Saving Prepped Data

In [4]:
to_drop = ['readme_raw']

df.drop(to_drop, axis=1, inplace=True)

df.original_topics = df.original_topics.apply(lambda x: ','.join(x))
df.featured_topics = df.featured_topics.apply(lambda x: ','.join(x))
df.augmented_top228topics = df.augmented_top228topics.apply(lambda x: ','.join(x))
df.mapped_topics = df.mapped_topics.apply(lambda x: ','.join(x))

df.to_csv('./2020_repologue_data/prepped_data.csv', index=None)

In [5]:
df = pd.read_csv('./2020_repologue_data/prepped_data.csv')

In [6]:
df

Unnamed: 0,repo_name,original_topics,featured_topics,augmented_top228topics,mapped_topics,len_original_topics,len_featured_topics,len_augmented_top228topics,len_mapped_topics,name_processed,desc_processed,readme_processed,wiki_processed,filename_processed
0,ui-router/react,"javascript,react,router,spa,state-machine,type...","javascript,typescript,react","aurelia,javascript,typescript,react","aurelia,javascript,react,typescript,unity",7,3,4,5,ui router react,ui router react,ui router react ui router provides extremely f...,,deep transition stable nest hook ref child u r...
1,novoda/gradle-android-command-plugin,"adb,gradle,gradle-plugin,novoda,open-source","gradle,open-source",gradle,"android,gradle,open-source",5,2,1,3,novoda gradle android command plugin,handy command testing android ci,gradle android command plugin build license bi...,development development branch project make wa...,monkey dpland aware suffix valuessw android va...
2,aryarohit07/GlideFaceDetectionTransformation,"android,detect-faces,face-detection,glide,visi...",android,"api,android","android,api,google",5,1,2,3,glide face detection,memory efficient android image transformation ...,glide face detection transformation download a...,,detector value glide glide android transformat...
3,tidyverse/modelr,"modelling,r",r,r,r,2,1,1,1,tidyverse mode lr,helper function modelling,badge start lifecycle stable travis build stat...,,pipe toc problem resample geom r height data n...
4,robclancy/presenter,"decorator,laravel,php,presenter","php,laravel","php,laravel","laravel,php",4,2,2,2,,decorate object using presenter primarily keep...,presenter decorate object using presenter prim...,,password authenticated route pagination cooky ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151393,brightec/KBarcode,"android,android-barcode,android-development,ba...","firebase,android","firebase,android,library","android,firebase,library",7,2,3,3,barcode,library help implement barcode scanning,bintray library help implement barcode scannin...,troubleshooting android support library versio...,calendar detekt pmd screen mockitoextensions c...
151394,icoxfog417/magenta_session,"machine-learning,music,webaudio-api",machine-learning,"machine-learning,api","api,machine-learning",3,1,2,2,ico fog session,musical keyboard music session google,magenta session code session magenta midi inst...,,target raw bundle sequence keyboard process to...
151395,a514514772/DISE-Domain-Invariant-Structure-Ext...,"computer-vision,cvpr2019,deep-learning,domain-...","computer-vision,deep-learning","computer-vision,deep-learning","computer-vision,deep-learning",5,2,2,2,domain structure extraction,pytorch implementation structure structural in...,domain invariant structure extraction pytorch ...,,cityscape psp label performance text python ne...
151396,nsqio/libnsq,"c,client-library,nsq",c,"library,c","c,google,library",3,1,2,3,io library,asynchronous c client library nsq,asynchronous c client library nsq status curre...,,upd c message http connection nsq look ut
