# Load Data

In [1]:
import pickle
import pandas as pd
import numpy as np

def _read_pickle(path):
    """Return the object deserialized from the pickle file at ``path``."""
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# The two X_* pickles hold an object exposing a `readme_processed` attribute
# (presumably a DataFrame column of preprocessed README text -- confirm
# against the notebook that produced them); only its `.values` are kept.
# The y_* and topics pickles are used exactly as stored.
X_train_raw = _read_pickle('./learning_data/X_train_raw.pkl').readme_processed.values
y_train = _read_pickle('./learning_data/y_train.pkl')
X_test_raw = _read_pickle('./learning_data/X_test_raw.pkl').readme_processed.values
y_test = _read_pickle('./learning_data/y_test.pkl')
topics = _read_pickle('./learning_data/topics.pkl')

# Prep Training Data (according to approach, 100 repositories per topic)

In [2]:
from random import Random, sample  # `sample` stays imported in case other cells use it

# Upper bound on the number of training documents drawn for each topic.
SAMPLES_PER_TOPIC = 100
# Fixed seed so that Restart & Run All rebuilds the same training subset.
SAMPLING_SEED = 42

# Private generator: seeding it does not disturb the global `random` state.
_topic_rng = Random(SAMPLING_SEED)

X_train = []
y_train_label = []

X_test = list(X_test_raw)

for i in range(len(topics)):
    topic = topics[i]

    # Rows of X_train_raw whose entry in column i of y_train equals 1,
    # i.e. the documents selected for topics[i].
    X_topic = list(X_train_raw[y_train[:, i] == 1])

    # Down-sample only when the topic has at least SAMPLES_PER_TOPIC documents;
    # smaller topics contribute everything they have. This replaces a bare
    # `except:` that relied on sample() raising for undersized populations and
    # would also have hidden unrelated errors.
    if len(X_topic) >= SAMPLES_PER_TOPIC:
        X_topic = _topic_rng.sample(X_topic, SAMPLES_PER_TOPIC)

    # extend() appends in place instead of rebuilding the lists every iteration.
    X_train.extend(X_topic)
    y_train_label.extend([topic] * len(X_topic))

# Vectors

In [3]:
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

# Word-level TF-IDF vectorizer over in-memory document strings.
#  * input='content' is the documented value for passing raw strings; the
#    previous 'train' is not one of 'filename' / 'file' / 'content'.
#  * stop_words='english' selects scikit-learn's built-in English stop-word
#    list. The previous {'english'} was a set holding the single token
#    "english", so that one word was the only thing being filtered out.
tfidf_vectorizer = TfidfVectorizer(input='content', stop_words='english', lowercase=True, analyzer='word')
tfidf_transformer = TfidfTransformer()

# Vocabulary and IDF weights are learned from the training documents only.
X_train_vectors = tfidf_vectorizer.fit_transform(X_train)
# NOTE(review): TfidfVectorizer already emits TF-IDF weights, so this second
# TfidfTransformer pass re-weights an already weighted matrix. It is kept
# because the fitted transformer is pickled alongside the vectorizer in this
# notebook; confirm with downstream consumers before dropping it.
X_train_tfidf = tfidf_transformer.fit_transform(X_train_vectors)

# The test documents reuse the fitted objects (transform only, no refit).
X_test_vectors = tfidf_vectorizer.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_vectors)

# Saving Results


import os
import pickle

# Create the output directory if it is missing. exist_ok=True tolerates an
# existing directory while still surfacing real failures (e.g. permission
# errors) that a bare `except: pass` would have silently swallowed.
os.makedirs('./learning_data/', exist_ok=True)

# Persist the fitted vectorizer and transformer as separate pickle files.
with open('./learning_data/tfidf_vectorizer_mnb.pkl','wb') as f:
    pickle.dump(tfidf_vectorizer,f)
with open('./learning_data/tfidf_transformer_mnb.pkl','wb') as f:
    pickle.dump(tfidf_transformer,f)

# MNB Model

In [4]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default hyper-parameters, fitted on the
# training TF-IDF matrix against one topic label per row.
mnb_clf = MultinomialNB()
mnb_clf.fit(X_train_tfidf, y_train_label)

# Saving Results

import os
import pickle

# Create the output directory if it is missing. exist_ok=True tolerates an
# existing directory while still surfacing real failures (e.g. permission
# errors) that a bare `except: pass` would have silently swallowed.
os.makedirs('./learning_data/', exist_ok=True)

# Persist the fitted classifier.
with open('./learning_data/mnb_clf.pkl','wb') as f:
    pickle.dump(mnb_clf,f)


# Probability estimates from the fitted classifier, computed first for the
# training matrix and then for the test matrix.
train_predictions, test_predictions = [
    mnb_clf.predict_proba(features)
    for features in (X_train_tfidf, X_test_tfidf)
]