In [16]:
from flask import Flask, render_template, redirect, url_for
from flask_bootstrap import Bootstrap
from flask_wtf import FlaskForm
from wtforms import StringField, SubmitField
from wtforms.validators import DataRequired
from data import ACTORS
from modules import get_names, get_actor, get_id

def init_ml():   
    print("Initing ML")
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split
    import re, nltk, string
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.multiclass import OneVsRestClassifier
    from gensim.models import word2vec
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.feature_extraction.text import CountVectorizer
    from keras.utils import np_utils
    min_word_frequency_word2vec = 5
    embed_size_word2vec = 200
    context_window_word2vec = 5

    numCV = 10
    max_sentence_len = 50
    min_sentence_length = 15
    rankK = 10
    batch_size = 32
    
    min_bugs_solved = 500
    
    df=pd.read_csv('classifier_data_10.csv')
    


    filtered = df.groupby('owner')['owner'].filter(lambda x: len(x) >= min_bugs_solved)
    f = df[df['owner'].isin(filtered)]
    df = f

    df.dropna(inplace=True)
    df.isnull().sum()
    len(f['owner'].unique())    
    X = df.description
    y = df.owner
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)



    train_data = []
    train_owner = []
    test_data = []
    test_owner = []

    all_data_unfiltered = []

    def purge_string(text):
        current_desc = text.replace('\r', ' ')    
        current_desc = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', current_desc)    
        start_loc = current_desc.find("Stack trace:")
        current_desc = current_desc[:start_loc]    
        current_desc = re.sub(r'(\w+)0x\w+', '', current_desc)
        current_desc = current_desc.lower()
        current_desc_tokens = nltk.word_tokenize(current_desc)
        current_desc_filter = [word.strip(string.punctuation) for word in current_desc_tokens]
        current_data = current_desc_filter
        return current_data

    for item in X_train:
        current_data = purge_string(item)
        all_data_unfiltered.append(current_data)     
        train_data.append(filter(None, current_data)) 

    for item in y_train:
        train_owner.append(item)

    for item in X_test:
        current_data = purge_string(item)
        test_data.append(filter(None, current_data)) 

    for item in y_test:
        test_owner.append(item)
    print("train_data length = "+str(len(train_data)))
    print("train_owner length = "+str(len(train_owner)))
    print("test_data length = "+str(len(test_data)))
    print("test_owner length = "+str(len(test_owner)))
    model  = word2vec.Word2Vec(min_count=min_word_frequency_word2vec, vector_size=embed_size_word2vec, window=context_window_word2vec)
    model.init_sims(replace=True)
    model.build_vocab(all_data_unfiltered, progress_per=100000)
    vocabulary = model.wv.key_to_index
    vocab_size = len(vocabulary)

    updated_train_data = []    
    updated_train_data_length = []    
    updated_train_owner = []
    final_test_data = []
    final_test_owner = []
    for j, item in enumerate(train_data):
        current_train_filter = [word for word in item if word in vocabulary]
        if len(current_train_filter)>=min_sentence_length:  
          updated_train_data.append(current_train_filter)
          updated_train_owner.append(train_owner[j])  

    for j, item in enumerate(test_data):
        current_test_filter = [word for word in item if word in vocabulary]  
        if len(current_test_filter)>=min_sentence_length:
          final_test_data.append(current_test_filter)    	  
          final_test_owner.append(test_owner[j]) 
    unique_train_label = list(set(updated_train_owner))
    classes = np.array(unique_train_label)

    X_train = np.empty(shape=[len(updated_train_data), max_sentence_len, embed_size_word2vec], dtype='float32')
    Y_train = np.empty(shape=[len(updated_train_owner),1], dtype='int32')

    for j, curr_row in enumerate(updated_train_data):
        sequence_cnt = 0         
        for item in curr_row:
            if item in vocabulary:
                X_train[j, sequence_cnt, :] = model.wv[item] 
                sequence_cnt = sequence_cnt + 1                
                if sequence_cnt == max_sentence_len-1:
                        break                
        for k in range(sequence_cnt, max_sentence_len):
            X_train[j, k, :] = np.zeros((1,embed_size_word2vec))        
        Y_train[j,0] = unique_train_label.index(updated_train_owner[j])

    X_test = np.empty(shape=[len(final_test_data), max_sentence_len, embed_size_word2vec], dtype='float32')
    Y_test = np.empty(shape=[len(final_test_owner),1], dtype='int32')

    for j, curr_row in enumerate(final_test_data):
        sequence_cnt = 0          
        for item in curr_row:
            if item in vocabulary:
                X_test[j, sequence_cnt, :] = model.wv[item] 
                sequence_cnt = sequence_cnt + 1                
                if sequence_cnt == max_sentence_len-1:
                    break                
        for k in range(sequence_cnt, max_sentence_len):
            X_test[j, k, :] = np.zeros((1,embed_size_word2vec))        
        Y_test[j,0] = unique_train_label.index(final_test_owner[j])

    y_train = np_utils.to_categorical(Y_train, len(unique_train_label))
    y_test = np_utils.to_categorical(Y_test, len(unique_train_label))    

    train_data = []
    for item in updated_train_data:
        train_data.append(' '.join(item))

    test_data = []
    for item in final_test_data:
        test_data.append(' '.join(item))

    vocab_data = []
    for item in vocabulary:
        vocab_data.append(item)

    tfidf_transformer = TfidfTransformer(use_idf=False)
    count_vect = CountVectorizer(min_df=1, vocabulary= vocab_data,dtype=np.int32)

    train_counts = count_vect.fit_transform(train_data)       
    train_feats = tfidf_transformer.fit_transform(train_counts)
    print (train_feats.shape)

    test_counts = count_vect.transform(test_data)
    test_feats = tfidf_transformer.transform(test_counts)
    print (test_feats.shape)
    print ("=======================")

    classifierModel = MultinomialNB(alpha=0.01)        
    classifierModel = OneVsRestClassifier(classifierModel).fit(train_feats, updated_train_owner)
    predict = classifierModel.predict_proba(test_feats)  
    classes = classifierModel.classes_  
    
    print("ML inited")
    
    
    
app = Flask(__name__)

# Flask-WTF requires an enryption key - the string can be anything
app.config['SECRET_KEY'] = 'C2HWGVoMGfNTBsrYQg8EcMrdTimkZfAb'

# Flask-Bootstrap requires this line
Bootstrap(app)

# with Flask-WTF, each web form is represented by a class
# "NameForm" can change; "(FlaskForm)" cannot
# see the route for "/" and "index.html" to see how this is used
class NameForm(FlaskForm):
    name = StringField('Which actor is your favorite?', validators=[DataRequired()])
    submit = SubmitField('Submit')


# all Flask routes below

@app.route('/', methods=['GET', 'POST'])
def index():
    names = get_names(ACTORS)
    # you must tell the variable 'form' what you named the class, above
    # 'form' is the variable name used in this template: index.html
    form = NameForm()
    message = ""
    if form.validate_on_submit():
        name = form.name.data
        if name.lower() in names:
            # empty the form field
            form.name.data = ""
            id = get_id(ACTORS, name)
            # redirect the browser to another route and template
            return redirect( url_for('actor', id=id) )
        else:
            message = "That actor is not in our database."
    return render_template('index.html', names=names, form=form, message=message)

@app.route('/actor/<id>')
def actor(id):
    # run function to get actor data based on the id in the path
    id, name, photo = get_actor(ACTORS, id)
    if name == "Unknown":
        # redirect the browser to the error template
        return render_template('404.html'), 404
    else:
        # pass all the data for the selected actor to the template
        return render_template('actor.html', id=id, name=name, photo=photo)

# 2 routes to handle errors - they have templates too

@app.errorhandler(404)
def page_not_found(e):
    return render_template('404.html'), 404

@app.errorhandler(500)
def internal_server_error(e):
    return render_template('500.html'), 500


# keep this as is
if __name__ == '__main__':
    init_ml()
    app.run(debug=False)


Initing ML
train_data length = 6203
train_owner length = 6203
test_data length = 2659
test_owner length = 2659


  model.init_sims(replace=True)


(5422, 7726)
(2351, 7726)
ML inited
 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [31/Oct/2021 14:18:04] "[37mGET / HTTP/1.1[0m" 200 -


AssertionError: A name collision occurred between blueprints <flask.blueprints.Blueprint object at 0x000001C7A148E220> and <flask.blueprints.Blueprint object at 0x000001C7A19127C0>. Both share the same name "bootstrap". Blueprints that are created on the fly need unique names.