In [None]:
# Import packages
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from stop_words import get_stop_words
from sklearn.metrics import classification_report
import csv
from ipywidgets import widgets
from IPython.display import display
from playsound import playsound

%config InlineBackend.figure_format = 'retina'

In [None]:
# Change working directory to get the file.
# Two alternative locations are tried in turn. Only a failed chdir (OSError,
# e.g. the folder does not exist) triggers the fallback, so KeyboardInterrupt
# and unrelated errors are no longer silently swallowed.
try:
    os.chdir('/Users/pkg/Documents/OneDrive - Elektroforeningen (EFO)/Python/Intermediate DS')
except OSError:
    os.chdir('/Users/paal/OneDrive - Elektroforeningen (EFO)/Python/Intermediate DS')

# Open pickled file from the data wrangling section
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles that were produced by a trusted source.
with open('data_wrangling.pickle', 'rb') as handle:
    df = pickle.load(handle)

# Set the working directory used by the rest of the notebook (same fallback as above)
try:
    os.chdir('/Users/pkg/Documents/OneDrive - Elektroforeningen (EFO)/Python/Intermediate DS/Intermediate Data Science with Python/Python_Capstone')
except OSError:
    os.chdir('/Users/paal/OneDrive - Elektroforeningen (EFO)/Python/Intermediate DS/Intermediate Data Science with Python/Python_Capstone')

In [None]:
# Order the rows by ETIM class, then by ENG (ascending is the default),
# ahead of the clf.predict_proba calls further down
df = df.sort_values(by=['ETIM_class', 'ENG'])

In [None]:
# Split the data set into the target column and the explanatory columns
labels = df['ETIM_class']
ex_var = df[['ENG', 'Technical_description']]

# Quick sanity check: dimensions, container types and the first record
print("labels'  shape:", labels.shape)
print("ex_var's shape:", ex_var.shape)
print("")
print("labels is of type", type(labels))
print("ex_var is of type", type(ex_var))
print("")
print("first label after sorting:", labels.iat[0])
print("first ENG after sorting:", ex_var['ENG'].iat[0])
print("first technical description after sorting:", ex_var['Technical_description'].iat[0])

In [None]:
# Importing stop words that will be ignored
stop_words = get_stop_words('norwegian')

# Creating a Pandas Series of the technical descriptions
text = df['Technical_description']

# Creating the corpus: unigrams and bigrams that occur in at least two documents
vectorizer = CountVectorizer(stop_words=stop_words, ngram_range=(1,2), min_df=2)

# Build the vocabulary and convert the text to a bag of words in ONE pass.
# Calling fit() and then transform() on the same corpus tokenises every
# document twice; fit_transform() yields the same fitted vectorizer and the
# same document-term counts with a single pass.
# The result is a sparse matrix, which suits data that is primarily zeroes.
x = vectorizer.fit_transform(text)

In [None]:
# Features (bag-of-words matrix) and target (ETIM classes)
X, y = x, labels

# Hold out part of the data for testing; the fixed seed keeps the split reproducible
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    y,
    random_state=99,
)

In [None]:
# Fit a multinomial Naive Bayes model on the training split
# (alpha is set to a tiny value, i.e. practically no additive smoothing)
clf = MultinomialNB(alpha=1e-10)
clf = clf.fit(xtrain, ytrain)
#NB_train_accuracy = clf.score(xtrain, ytrain)
#NB_test_accuracy = clf.score(xtest, ytest)

# Accuracy scores for both the training and test sets
#print("Training accuracy:", round(NB_train_accuracy, 2))
#print("Testing accuracy", round(NB_test_accuracy, 2))

In [None]:
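# Creating the classification report
# (scikit-learn's signature is classification_report(y_true, y_pred), so the
# true labels go first and the predictions second)
#y_pred_train = clf.predict(xtrain)
#y_pred_test = clf.predict(xtest)

#print(classification_report(ytrain, y_pred_train))
#print(classification_report(ytest, y_pred_test))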

In [None]:
# Lookup table from ETIM class code (first CSV column, used as index)
# to its description in Norwegian (second CSV column)
ETIM_dict = pd.read_csv(
    'ETIM7.csv',
    sep=';',
    header=None,
    index_col=0,
)[1].to_dict()
#ETIM_dict

In [None]:
# Text field for querying a single technical description
inputText = widgets.Text(value='Fyll inn teknisk beskrivelse her')

# Create query
def predict_ETIM(sender):
    """Print up to three ETIM class suggestions for the text in ``inputText``.

    Intended as the submit callback of the ``inputText`` widget. The text is
    vectorised with the fitted ``vectorizer`` and scored by ``clf``; the three
    most probable classes are printed in descending order of probability,
    stopping at the first one whose probability is below 10 %.

    Parameters
    ----------
    sender : object
        Whatever the widget passes to the callback. It is not used; the text
        is always read from the module-level ``inputText`` widget.

    Returns
    -------
    None
        The suggestions are written to standard output.

    Raises
    ------
    KeyError
        If a printed class code has no entry in ``ETIM_dict``.
    """
    # Technical description to be evaluated by the model
    text = inputText.value

    # Vectorise once and get one probability per class known to the classifier
    probabilities = clf.predict_proba(vectorizer.transform([text])).flatten()

    # predict_proba columns follow clf.classes_, so pair the two directly
    # instead of rebuilding the class order from the training labels.
    # .tolist() turns the class labels into plain Python values.
    # The sort is stable, so ties keep the class order.
    ranked = sorted(
        zip(clf.classes_.tolist(), probabilities),
        key=lambda pair: pair[1],
        reverse=True,
    )[:3]

    print('Tekst:', text)
    print('')

    for rank, (etim_class, probability) in enumerate(ranked, start=1):
        # Candidates are sorted, so everything after the first one below 10 % is also too weak
        if probability < 0.1:
            break
        print('Forslag', rank, ':')
        print('Forventet ETIM-klasse:', etim_class, ETIM_dict[etim_class])
        # Shown as an integer percentage
        print('Forventet treffsikkerhet:', "%.0f" % (probability * 100), '%')
        print('')
    
# Register predict_ETIM as the submit callback of the text field
inputText.on_submit(predict_ETIM)
# Last expression of the cell: display the widget
inputText

In [None]:
## Importing Excel-file with technical descriptions that need to be classified

# Assign spreadsheet filename to 'file'
file = 'import_mw.xlsx'

# Load spreadsheet; the context manager closes the workbook as soon as the
# sheet has been parsed instead of leaving the file handle open
with pd.ExcelFile(file) as workbook:
    xl = workbook.parse('Sheet1')

# See first technical description and length of file
print(xl.iloc[0,:])
len(xl)

In [None]:
# Create an empty list to store the analysis results
# (predict_ETIM_raw appends one single-entry dict per suggestion to it)
rows_list = []

# Predict ETIM-classes for a number of technical descriptions (for use in analyze_xl, not standalone use)
# For standalone use, see predict_ETIM()
def predict_ETIM_raw(elnr, text):
    """Append the top ETIM suggestions for one product to ``rows_list``.

    The text is vectorised with the fitted ``vectorizer`` and scored by
    ``clf``. The three most probable classes are walked in descending order
    of probability, stopping at the first one below 10 %. Each accepted
    suggestion is printed and appended to the module-level ``rows_list``.

    Parameters
    ----------
    elnr : hashable
        Identifier of the product; used as the single key of every appended dict.
    text : str
        Technical description to classify.

    Returns
    -------
    None
        Results are delivered through ``rows_list`` as dicts of the form
        ``{elnr: [text, class code, class name, integer percentage as str]}``.

    Raises
    ------
    KeyError
        If an accepted class code has no entry in ``ETIM_dict``.
    """
    # Vectorise once and get one probability per class known to the classifier
    probabilities = clf.predict_proba(vectorizer.transform([text])).flatten()

    # predict_proba columns follow clf.classes_, so pair the two directly
    # instead of rebuilding the class order from the training labels.
    # .tolist() turns the class labels into plain Python values.
    # The sort is stable, so ties keep the class order.
    ranked = sorted(
        zip(clf.classes_.tolist(), probabilities),
        key=lambda pair: pair[1],
        reverse=True,
    )[:3]

    for etim_class, probability in ranked:
        # Candidates are sorted, so everything after the first one below 10 % is also too weak.
        # Checking before the ETIM_dict lookup means rejected candidates are never looked up.
        if probability < 0.1:
            break

        single_prod = {elnr: [text, etim_class, ETIM_dict[etim_class], "%.0f" % (probability * 100)]}
        print(single_prod)
        rows_list.append(single_prod)
           

In [None]:
# Run the analysis on the data from the Excel-file
def analyze_xl(file):
    """Run ``predict_ETIM_raw`` on every row of a table of descriptions.

    Parameters
    ----------
    file : pandas.DataFrame
        Despite its name, the table to analyse (the notebook calls
        ``analyze_xl(xl)``). The first column is passed on as the product
        identifier and the second column as the technical description.
        Any value that is not a DataFrame falls back to the module-level
        ``xl`` frame, which is what this function always used before.

    Returns
    -------
    None
        Results are produced by ``predict_ETIM_raw``.
    """
    # Honour the argument instead of silently reading the global frame
    frame = file if isinstance(file, pd.DataFrame) else xl

    # Find text from the table to analyze, row by row
    for j in range(len(frame)):
        predict_ETIM_raw(frame.iloc[j, 0], frame.iloc[j, 1])

In [None]:
# Run the analysis (prints results and writes to rows_list)
# Empty the accumulator in place first, so that re-running this cell does not
# append every suggestion a second time
rows_list.clear()
analyze_xl(xl)

In [None]:
# Build a dataframe from the collected single-entry dicts and flip it,
# so that the dict keys end up as the row index
output_df = pd.DataFrame(rows_list).transpose()
output_df

In [None]:
# Squeeze the values to the left
def squeeze_nan(x):
    """Return a copy of the Series with its non-missing values packed first.

    The index labels and their order are kept; the non-missing values are
    moved, in their original order, onto the leading labels and the remaining
    labels are filled with NaN.
    """
    labels = x.index.tolist()

    # Keep only the present values and relabel them with the first labels
    kept = x.dropna()
    kept.index = labels[:len(kept)]

    # Restore the full set of labels; the ones without a value become NaN
    return kept.reindex(labels, fill_value=np.nan)

# Pack every row to the left, then discard the columns that are left
# completely empty
output_df = (
    output_df
    .apply(squeeze_nan, axis=1)
    .dropna(axis=1, how='all')
)

# The number of names must match the number of columns that remain
# (see output_df above)
output_df.columns = ['anbf1', 'anbf2', 'anbf3']
output_df

In [None]:
# For each suggestion column: keep the original column and place its list
# elements next to it as separate columns prefixed with 'nr_'
output_df1, output_df2, output_df3 = (
    pd.concat(
        [output_df[col], output_df[col].apply(pd.Series).add_prefix('nr_')],
        axis=1,
    )
    for col in ('anbf1', 'anbf2', 'anbf3')
)

In [None]:
# Put the three expanded suggestion frames side by side and make every
# column label a string
output_df = pd.concat([output_df1, output_df2, output_df3], axis='columns')
output_df.columns = output_df.columns.astype(str)
output_df

In [None]:
# Delete the original (unexpanded) list columns
# `axis` is passed by keyword: pandas 2.0 removed the positional form,
# so drop([...], 1) raises a TypeError there
output_df = output_df.drop(['anbf1', 'anbf2', 'anbf3'], axis=1)

output_df

In [None]:
# Rename columns: the same four headings, repeated once per suggestion
suggestion_headings = ['Tekst', 'ETIM-kode', 'Navn', 'Forventet sannsynlighet']
output_df.columns = suggestion_headings * 3
output_df

In [None]:
# Export the file to an Excel-file
# The context manager saves and closes the workbook when the block is left;
# ExcelWriter.save() was removed in pandas 2.0. sheet_name is passed by
# keyword because positional use is deprecated in newer pandas versions.
with pd.ExcelWriter('output_mw.xlsx', engine='xlsxwriter') as writer:
    #worksheet.conditional_format('')
    output_df.to_excel(writer, sheet_name='Sheet1')

# Finished!
playsound('fanfare_ff2.mp3')