In [None]:
%matplotlib inline

from code import nlp
from code import utils
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
import pandas as pd
import numpy as np
from sklearn.externals import joblib
import matplotlib.pyplot as plt

# Label set of the classifier. The full list is handed to every
# `partial_fit` call further down so the model knows all classes up front.
CATEGORIES = [
    "Personal info",
    "Internal",
    "Customer info",
    "Public",
]

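To apply machine learning to our data, we first need to vectorize the text. The next cell sets up a TF-IDF vectorizer for that purpose, together with the Bernoulli naive Bayes classifier that will be trained on the resulting features.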

In [None]:
# TF-IDF text features: sub-linear term-frequency scaling, terms occurring in
# more than half of the documents are ignored, vocabulary capped at 8000 terms.
vectorizer = TfidfVectorizer(
    max_features=8000,
    max_df=0.5,
    sublinear_tf=True,
)
# Bernoulli naive Bayes model that is trained on the vectorized text.
clf = BernoulliNB()

In [None]:
# Load the file listing together with its labels. Later cells treat an
# empty-string `label` as "not labelled yet".
files_df = pd.read_pickle("data/custom_files_labels_done.pkl")

# Quick sanity check: overall shape followed by the first ten rows.
for overview in (files_df.shape, files_df.head(10)):
    print(overview)

In [None]:
# Run every file name through `nlp.stringtofeatures` and keep the result in a
# new column; this column is the text the vectorizer is fitted on later.
name_features = files_df["name"].apply(nlp.stringtofeatures)
files_df["name_transformed"] = name_features
print(files_df.head(10))

In [None]:
# Hold out 13% of the rows that already carry a label as a fixed validation
# set (seeded for reproducibility); every other row goes into the train set.
has_label = files_df["label"] != ''
validation_set = files_df[has_label].sample(frac=.13, random_state=42)
train_set = files_df.drop(validation_set.index)

# Report the size of both splits, validation first.
for subset in (validation_set, train_set):
    print(len(subset))

In [None]:
# Self-training (semi-supervised) loop.
#
# 1. Train on the rows of `train_set` that already have a label.
# 2. Predict the unlabelled rows and adopt every prediction whose class
#    probability is exactly 1 as a new label.
# 3. Retrain on the enlarged labelled set and score it on `validation_set`.
# 4. Repeat until a pass adopts no new label.
#
# Globals produced for the cells below: `clf`, `vectorizer` (refitted),
# `progress_stats`, `predicted_stats`, `accuracy`, `predicted_files`,
# `non_predicted_files`, and the updated `train_set["label"]` column.

CHUNK_SIZE = 100  # number of rows handed to each `partial_fit` call


def labelled_rows(frame):
    """Return the rows of ``frame`` whose ``label`` is not the empty string."""
    return frame[frame["label"] != ""]


def fit_classifier(frame):
    """Refit the shared ``vectorizer`` and train a fresh ``BernoulliNB``.

    Only the labelled rows of ``frame`` are used. The module-level
    ``vectorizer`` is refitted in place on their ``name_transformed`` text,
    then the classifier is trained chunk by chunk via ``partial_fit``.

    Args:
        frame: DataFrame with ``label`` and ``name_transformed`` columns.

    Returns:
        The newly trained ``BernoulliNB`` instance.
    """
    labelled = labelled_rows(frame)
    vectorizer.fit(labelled["name_transformed"].values)
    model = BernoulliNB()
    for chunk in utils.chunker(labelled, CHUNK_SIZE):
        model.partial_fit(
            vectorizer.transform(chunk["name_transformed"]),
            chunk["label"],
            classes=CATEGORIES,
        )
    return model


def validation_accuracy(model):
    """Return the accuracy of ``model`` on the held-out ``validation_set``.

    The validation text is taken from ``name_transformed`` -- the same
    pre-processed column the vectorizer is fitted on -- so that training and
    evaluation see identically prepared input.
    """
    features = vectorizer.transform(validation_set["name_transformed"])
    return model.score(features, validation_set["label"])


progress_stats = []   # per-iteration label counts per category
predicted_stats = []  # [initial number of labelled rows, then newly adopted labels per pass]
accuracy = []         # validation accuracy: initial model, then one entry per pass

initially_labelled = labelled_rows(train_set)
predicted_stats.append(len(initially_labelled))
progress_stats.append(initially_labelled["label"].value_counts())
print(initially_labelled["label"].value_counts())
print("{} out of {}".format(len(initially_labelled), len(train_set)))

# Baseline model trained on the manually labelled rows only.
clf = fit_classifier(train_set)
accuracy.append(validation_accuracy(clf))
print(accuracy)

progress = 10  # any non-zero value, so that the loop body runs at least once
while progress != 0:
    # `clf` is already trained on the current labels (either the baseline
    # above or the refit at the end of the previous pass), so no refit here.
    unlabelled = train_set[train_set["label"] == ""]

    predicted_files = 0
    if len(unlabelled) > 0:
        features = vectorizer.transform(unlabelled["name_transformed"])
        # Keep only predictions where some class has probability exactly 1.
        confident = (clf.predict_proba(features) == 1).any(axis=1)
        predicted_files = int(confident.sum())
        if predicted_files:
            train_set.loc[unlabelled.index[confident], "label"] = clf.predict(features)[confident]
    non_predicted_files = len(unlabelled) - predicted_files

    print("Number of predicted files with probability of 1: {}".format(predicted_files))

    # Retrain on the enlarged labelled set and track its validation accuracy.
    clf = fit_classifier(train_set)
    acc = validation_accuracy(clf)
    print(acc)
    accuracy.append(acc)

    progress_stats.append(labelled_rows(train_set)["label"].value_counts())
    predicted_stats.append(predicted_files)
    progress = predicted_files

print("No more progress")
finally_labelled = labelled_rows(train_set)
print(finally_labelled["label"].value_counts())
print("{} out of {}".format(len(finally_labelled), len(train_set)))


In [None]:
# One row per recorded iteration, one column per label value, built from the
# `value_counts()` snapshots collected during self-training.
progress_frame = pd.DataFrame(data=progress_stats)

# Figure 1: files added per iteration (log scale, left axis) against the
# validation accuracy (right axis). `predicted_stats[0]` is the number of
# initially labelled rows, the remaining entries are newly adopted labels.
# Only one figure is created here; an additional bare `plt.figure()` would
# leave an empty figure behind that `plt.show()` also renders.
fig, ax = plt.subplots()
ax.set_xlabel("Iteration")
ax.set_title("Logarithmic view of learning rate compared to accuracy")
ax2 = ax.twinx()
ax.set_yscale('log')
ax.plot(predicted_stats, color='#267f8c', linewidth=2., label="N files predicted")
ax2.plot(accuracy, color='#abc433', linewidth=2., label="Accuracy")

ax.legend(loc=2)
ax2.legend()
fig.savefig('output/Logarithmic_view_of_learning_rate.png', dpi=1000)
plt.show()


# Figure 2: label counts per category before (first snapshot, left axis) and
# after (last snapshot, right axis) self-training, drawn as side-by-side bars.
# Only one figure is created; a preceding bare `plt.figure()` would leave an
# empty extra figure that `plt.show()` renders as well.
bar_colors = ['#267f8c', '#7db686', '#abc433', '#d3d724']

fig, ax = plt.subplots()
ax.set_title("Knowledge in classifier before/after semi supervised learning")
ax2 = ax.twinx()
progress_frame.iloc[0].plot(rot=45, color=bar_colors, ylim=[0,1800], position=0.6, kind="bar", width=0.4, ax=ax)
progress_frame.iloc[-1].plot(rot=45, color=bar_colors, ylim=[0,1800], position=0.4, kind="bar", width=0.4, ax=ax2)
# Leave room at the bottom for the rotated category tick labels.
fig.subplots_adjust(bottom=0.3)
fig.savefig('output/Knowledge_in_classifier.png', dpi=1000)

plt.show()



# Figure 3: number of labelled items per category over the iterations.
# The axes are created explicitly and passed to pandas; calling
# `plt.figure()` first and letting `DataFrame.plot` open its own figure
# leaves an empty extra figure behind.
fig, ax = plt.subplots()
progress_frame.plot(color=['#267f8c', '#7db686', '#abc433', '#d3d724'], linewidth=2., ax=ax)
ax.xaxis.set_visible(False)
ax.set_title("Knowledge gain per category")
ax.set_ylabel("Number of items in classifier")
fig.savefig('output/Knowledge_gain_per_category.png', dpi=1000)
plt.show()

In [None]:
# Counters left over from the last self-training pass: rows that stayed
# unlabelled first, rows that received a label second.
for counter in (non_predicted_files, predicted_files):
    print(counter)

# Per-iteration label counts per category.
print(progress_frame)



In [None]:
# Persist the final classifier together with the vectorizer it was trained
# with, so both can be reloaded as a matching pair.
clf_path = "data/joblib/clf_final_iter.pkl"
vectorizer_path = "data/joblib/vectorizer_final_iter.pkl"

joblib.dump(clf, clf_path)
joblib.dump(vectorizer, vectorizer_path)

In [None]:
# The self-training loop stores its predicted labels in `train_set`, which
# was created by `files_df.drop(...)` and is therefore a separate object:
# `files_df` itself still holds the original labels. Copy the labels back
# (aligned on the index) so the saved "final_iter" frame actually contains
# the result of the iterations. Rows of the validation set are not part of
# `train_set` and keep their labels.
# NOTE(review): assumes the index of `files_df` is unique -- confirm.
files_df.loc[train_set.index, "label"] = train_set["label"]
files_df.to_pickle("data/custom_files_labels_final_iter.pkl")