In [1]:
import pandas as pd
import numpy as np

# Load and Prep Data

In [2]:
# Read the prepared repository table (path is relative to the working directory).
df = pd.read_csv('./2020_repologue_data/prepped_data.csv')

# Text fields that are concatenated into the single model input string.
input_cols = [
    'name_processed',
    'desc_processed',
    'readme_processed',
    'wiki_processed',
    'filename_processed'
]
# Column holding each repo's topics as one comma-separated string.
target_col = 'mapped_topics'
# target_col = 'augmented_top228topics'

# Minimum number of whitespace-separated tokens a repo's input must contain.
MIN_INPUT_TOKENS = 10

# Missing text becomes '' and every value is coerced to str so the fields
# can be joined without NaN/float surprises.
for col in input_cols:
    df[col] = df[col].fillna('').astype(str)

# Repos without any topic cannot serve as labelled examples.
# .copy() detaches the filtered frame from the original one, so the column
# assignments that follow do not hit pandas' chained-assignment warning.
df[target_col] = df[target_col].fillna('').astype(str)
df = df[df[target_col] != ''].copy()

# One space-joined, stripped input string per repo, inserted at column
# position 9 (position kept from the original layout).
df.insert(9, 'input', df[input_cols].apply(' '.join, axis=1).str.strip())

# Dropping repos with little input

df['len_input'] = df['input'].str.split().str.len()

df = df[df['len_input'] >= MIN_INPUT_TOKENS].copy()

# Dropping topics assigned to fewer than 100 repos

from collections import Counter

# A topic must be attached to at least this many repos to be kept.
MIN_REPOS_PER_TOPIC = 100

# Split the comma-separated topic string of every repo into a list of tokens.
topic_lists = df[target_col].str.split(',')

# Number of repos carrying each topic. Counting exact tokens (each repo counted
# once per topic via set()) replaces the former per-topic `str.contains` scan,
# which interpreted topic names as regular expressions and therefore broke on
# names containing characters such as '+', '.' or '*'.
topic_repo_counts = Counter(
    topic for repo_topics in topic_lists for topic in set(repo_topics)
)
target_topics = sorted(topic_repo_counts)

# Kept as a list in sorted topic order; later code deletes this name.
to_exclude = [t for t in target_topics if topic_repo_counts[t] < MIN_REPOS_PER_TOPIC]

# Remove the rare topics from every repo; remaining topics are de-duplicated
# and sorted.
rare_topics = set(to_exclude)
kept_topics = topic_lists.apply(lambda repo_topics: sorted(set(repo_topics) - rare_topics))

# assign() returns a new frame, so nothing is written into a filtered view.
df = df.assign(**{
    target_col: kept_topics,
    target_col + '_str': kept_topics.apply(','.join),
})

# Repos left without any topic are dropped.
df = df[df[target_col + '_str'] != ''].copy()

# Labels (list of topics per repo) and the raw text columns used as features.
y_raw = df[target_col].copy()
X_raw = df[['repo_name'] + input_cols + ['input']].copy()

# Report

# Distinct topics that are still present in the filtered data, in sorted order.
target_topics = sorted({topic for repo_topics in df[target_col] for topic in repo_topics})

# Number of topics attached to each repo.
df['len_' + target_col] = df[target_col].map(len)

# Same five output lines as before, emitted with a single print call.
print(
    'Topic Count',
    f"{target_col}\t\t{len(target_topics)}",
    '',
    'Averages Number of Topics Per Repository',
    f"{target_col}\t\t{np.round(df['len_' + target_col].mean(), 2)}",
    sep='\n',
)

# Garbage disposal

del to_exclude, target_col, df

# Saving results

import os
import pickle

# exist_ok=True tolerates an already-existing directory while still surfacing
# real failures (e.g. permission errors) that a bare `except: pass` would hide.
os.makedirs('./learning_data/', exist_ok=True)

# Persist the raw features and raw label lists for later stages.
with open('./learning_data/X_raw.pkl', 'wb') as f:
    pickle.dump(X_raw, f)
with open('./learning_data/y_raw.pkl', 'wb') as f:
    pickle.dump(y_raw, f)

Topic Count
mapped_topics		236

Averages Number of Topics Per Repository
mapped_topics		2.45


# One-hot Encode the Labels

In [3]:
from sklearn.preprocessing import MultiLabelBinarizer

# Turn each repo's list of topics into a binary indicator row
# (one column per topic).
mlb = MultiLabelBinarizer()

y_onehot = mlb.fit_transform(y_raw)

# Topic names in the column order of y_onehot.
topics = list(mlb.classes_)

# Saving results

import os
import pickle

# exist_ok=True tolerates an already-existing directory while still surfacing
# real failures (e.g. permission errors) that a bare `except: pass` would hide.
os.makedirs('./learning_data/', exist_ok=True)

with open('./learning_data/y_onehot.pkl', 'wb') as f:
    pickle.dump(y_onehot, f)
with open('./learning_data/topics.pkl', 'wb') as f:
    pickle.dump(topics, f)

# Train/Test: 80/20

In [4]:
# True: multi-label iterative stratification; False: plain random split.
stratified = True

# Seed shared by both branches so the split can be reproduced.
SEED = 42

if stratified:
    from skmultilearn.model_selection.iterative_stratification import iterative_train_test_split
    # iterative_train_test_split takes no random_state argument here, so the
    # global NumPy RNG is seeded instead.
    # NOTE(review): this assumes the stratifier draws from np.random - confirm
    # against the installed scikit-multilearn version.
    np.random.seed(SEED)
    # The splitter is given a plain array, so the results are wrapped back into
    # DataFrames with X_raw's own column labels.
    X_train_raw, y_train, X_test_raw, y_test = iterative_train_test_split(X_raw.values, y_onehot, test_size=0.2)
    X_train_raw = pd.DataFrame(X_train_raw, columns=X_raw.columns)
    X_test_raw = pd.DataFrame(X_test_raw, columns=X_raw.columns)
else:
    from sklearn.model_selection import train_test_split
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(X_raw, y_onehot, test_size=0.2, random_state=SEED)

# Saving results

import os
import pickle

# exist_ok=True tolerates an already-existing directory while still surfacing
# real failures (e.g. permission errors) that a bare `except: pass` would hide.
os.makedirs('./learning_data/', exist_ok=True)

with open('./learning_data/X_train_raw.pkl', 'wb') as f:
    pickle.dump(X_train_raw, f)
with open('./learning_data/y_train.pkl', 'wb') as f:
    pickle.dump(y_train, f)
with open('./learning_data/X_test_raw.pkl', 'wb') as f:
    pickle.dump(X_test_raw, f)
with open('./learning_data/y_test.pkl', 'wb') as f:
    pickle.dump(y_test, f)

# TF/IDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF over the concatenated input text: English stop words
# removed, accents stripped, tokens of at least two word characters, and the
# vocabulary capped at 30000 features.
tfidf_vectorizer = TfidfVectorizer(stop_words='english',
                                   sublinear_tf=True,
                                   strip_accents='unicode',
                                   analyzer='word',
                                   token_pattern=r'\w{2,}',
#                                    ngram_range=(1,2),
                                   max_features=30000)

# The vocabulary is fitted on the training split only; the test split is
# merely transformed with it.
X_train = tfidf_vectorizer.fit_transform(X_train_raw['input'].values)
X_test = tfidf_vectorizer.transform(X_test_raw['input'].values)

# Saving results

import os
import pickle

# exist_ok=True tolerates an already-existing directory while still surfacing
# real failures (e.g. permission errors) that a bare `except: pass` would hide.
os.makedirs('./learning_data/', exist_ok=True)

with open('./learning_data/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)
with open('./learning_data/X_train.pkl', 'wb') as f:
    pickle.dump(X_train, f)
with open('./learning_data/X_test.pkl', 'wb') as f:
    pickle.dump(X_test, f)

# OVR LR

In [6]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

# One class-weight-balanced logistic regression per topic (one-vs-rest).
# NOTE(review): n_jobs=-1 is set on both the wrapper and the inner estimator;
# the inner one is probably redundant for the per-topic binary fits - confirm.
ovr_clf = OneVsRestClassifier(LogisticRegression(class_weight='balanced', 
#                                                  solver='sag', # the model gets way too soft, predicts avg 47 topics per repo
                                                 n_jobs=-1), n_jobs=-1)
ovr_clf.fit(X_train,y_train)

# Saving results

import os
import pickle

# exist_ok=True tolerates an already-existing directory while still surfacing
# real failures (e.g. permission errors) that a bare `except: pass` would hide.
os.makedirs('./learning_data/', exist_ok=True)

with open('./learning_data/ovr_clf.pkl', 'wb') as f:
    pickle.dump(ovr_clf, f)

# Results (Threshold = 50%)

In [7]:
from sklearn.metrics import classification_report

# Per-topic precision / recall / F1 on the training split, using the hard
# predictions returned by ovr_clf.predict.
ovr_pred_train = ovr_clf.predict(X_train)
print(f"\n{'*' * 25}Training Results:")
print(classification_report(
    y_train,
    ovr_pred_train,
    target_names=topics,
    digits=4,
    zero_division=0,
))

# Same report for the held-out test split.
ovr_pred_test = ovr_clf.predict(X_test)
print(f"\n{'*' * 25}Testing Results")
print(classification_report(
    y_test,
    ovr_pred_test,
    target_names=topics,
    digits=4,
    zero_division=0,
))


*************************Training Results:
                        precision    recall  f1-score   support

                    3d     0.3505    1.0000    0.5190      1990
               actions     0.4046    1.0000    0.5761       369
                    ai     0.2646    1.0000    0.4185      1055
                  ajax     0.3189    1.0000    0.4836       170
             algorithm     0.3473    0.9996    0.5154      2454
               android     0.6571    0.9350    0.7718      9251
               angular     0.7593    0.9996    0.8630      2360
               ansible     0.7917    1.0000    0.8837       707
                 antlr     0.1944    1.0000    0.3255      1577
                   api     0.4015    0.9297    0.5608      7920
             archlinux     0.3896    1.0000    0.5607       150
               arduino     0.7454    1.0000    0.8541       896
                aspnet     0.5136    1.0000    0.6786       473
                  atom     0.6623    1.0000    0.7969      

# Label Powerset LR

In [8]:
# from skmultilearn.problem_transform import LabelPowerset
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import classification_report

# lps_clf = LabelPowerset(classifier=LogisticRegression(n_jobs=-1, class_weight='balanced'))
# lps_clf.fit(X_train, y_train)

# # Saving results

# import os
# import pickle

# try:
#     os.makedirs('./learning_data/')
# except:
#     pass

# with open('./learning_data/lps_clf.pkl','wb') as f:
#     pickle.dump(lps_clf,f)
    
# # Checking results

# lps_pred_train = lps_clf.predict(X_train)

# print('\n'+'*'*25+'Training Results:')
# print(classification_report(y_train, lps_pred_train, zero_division=0))

# lps_pred_test = lps_clf.predict(X_test)

# print('\n'+'*'*25+'Testing Results')
# print(classification_report(y_test, lps_pred_test, zero_division=0))

# Classifier Chain LR

In [9]:
# from skmultilearn.problem_transform import ClassifierChain
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import classification_report


# cc_clf = ClassifierChain(classifier=LogisticRegression(n_jobs=-1, class_weight='balanced'))
# cc_clf.fit(X_train, y_train)

# # Saving results

# import os
# import pickle

# try:
#     os.makedirs('./learning_data/')
# except:
#     pass

# with open('./learning_data/cc_clf.pkl','wb') as f:
#     pickle.dump(cc_clf,f)
    
# # Checking results

# cc_pred_train = cc_clf.predict(X_train)

# print('\n'+'*'*25+'Training Results:')
# print(classification_report(y_train, cc_pred_train, zero_division=0))

# cc_pred_test = cc_clf.predict(X_test)

# print('\n'+'*'*25+'Testing Results')
# print(classification_report(y_test, cc_pred_test, zero_division=0))