Machine Learning
=====

##### Important concepts
* ENG - Electrical Number Group - a grouping used to divide products into rough categories.
* ETIM class - a grouping used to divide products into detailed categories.
* Technical description - a text field describing each product.

In [1]:
# Import packages
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import scipy.sparse as sp
from stop_words import get_stop_words
from sklearn.metrics import classification_report
from collections import Counter
from sklearn.metrics import confusion_matrix
import csv
from ipywidgets import widgets
from IPython.display import display

%config InlineBackend.figure_format = 'retina'

# Machine-specific locations, named once so they are easy to adjust on another machine.
PICKLE_DIR = '/Users/pkg/Springboard'
PROJECT_DIR = '/Users/pkg/Springboard/Intermediate Data Science with Python/Python_Capstone'

# Load the object pickled by the data wrangling section. The file is opened by
# its full path, so no temporary change of working directory is needed.
# NOTE(review): pickle.load can execute arbitrary code - only load pickle files you created yourself.
with open(os.path.join(PICKLE_DIR, 'data_wrangling.pickle'), 'rb') as handle:
    df = pickle.load(handle)

# The remaining cells read and write their files relative to the project directory.
os.chdir(PROJECT_DIR)

In [2]:
# Order the rows by ETIM class, then by ENG (ascending is the default),
# ahead of the clf.predict_proba lookups later on
df = df.sort_values(by=['ETIM_class', 'ENG'])

In [3]:
# Split the target column (ETIM class) from the explanatory columns
ex_var = df[['ENG', 'Technical_description']]
labels = df['ETIM_class']

# Summarise both pieces: shape, Python type and the first row after sorting
print("labels'  shape:", labels.shape)
print("ex_var's shape:", ex_var.shape)
print()
print("labels is of type", type(labels))
print("ex_var is of type", type(ex_var))
print()
print("first label after sorting:", labels.iloc[0])
print("first ENG after sorting:", ex_var['ENG'].iloc[0])
print("first technical description after sorting:", ex_var['Technical_description'].iloc[0])

labels'  shape: (202216,)
ex_var's shape: (202216, 2)

labels is of type <class 'pandas.core.series.Series'>
ex_var is of type <class 'pandas.core.frame.DataFrame'>

first label after sorting: EC000001
first ENG after sorting: 11
first technical description after sorting: Direkte tilkoblingsklemmer Al/Cu 70/300mm2,sett à 4 stk. L1/L2/L3/PEN


In [4]:
# Norwegian stop words, which the vectorizer will ignore
stop_words = get_stop_words('norwegian')

# The technical descriptions as a pandas Series
text = df['Technical_description']

# Bag-of-words model over single words and word pairs (ngram_range=(1,2));
# min_df=2 is the minimum document frequency a term needs to enter the vocabulary
vectorizer = CountVectorizer(stop_words=stop_words, ngram_range=(1,2), min_df=2)

# Build the vocabulary and convert the text to a bag of words in a single pass.
# fit_transform gives the same result as fit() followed by transform(), but the
# corpus is only tokenised once. The result is a sparse matrix, which is
# suitable for a matrix that is primarily made up of zeroes.
# NOTE(review): the vocabulary is learned from every row, including the rows that
# later end up in the test set - consider fitting on the training rows only.
x = vectorizer.fit_transform(text)

In [5]:
# Feature matrix and target vector under their conventional names
X, y = x, labels

# Hold out part of the rows for testing; the fixed random_state keeps the split repeatable
xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=99)

In [6]:
# Fit a multinomial naive Bayes model on the training rows, using a very small alpha
clf = MultinomialNB(alpha=1e-10)
clf.fit(xtrain, ytrain)

# Score the model on the rows it was trained on and on the held-out rows
NB_train_accuracy = clf.score(xtrain, ytrain)
NB_test_accuracy = clf.score(xtest, ytest)

# Report both scores, rounded to two decimals
print("Training accuracy:", round(NB_train_accuracy, 2))
print("Testing accuracy", round(NB_test_accuracy, 2))

Training accuracy: 0.9
Testing accuracy 0.84


In [7]:
# Predicted classes for the training rows and for the held-out test rows
y_pred_train, y_pred_test = clf.predict(xtrain), clf.predict(xtest)

# Classification reports, currently disabled
# (classification_report takes the true labels first, then the predictions):
#print(classification_report(ytrain, y_pred_train))
#print(classification_report(ytest, y_pred_test))

In [8]:
# A sample technical description and the model's probability for every class.
# NOTE: this rebinds `text`, which held the Series of descriptions in the vectorizer cell.
text = 'dimmer'
example = clf.predict_proba(vectorizer.transform([text])).ravel()

In [9]:
# The single class the model predicts for the sample description
test = clf.predict(vectorizer.transform([text]))
test[0]

'EC001744'

In [10]:
# Pair every class found in the training labels (in sorted order) with its probability
example_dict = {etim_class: probability
                for etim_class, probability in zip(sorted(ytrain.unique()), example)}
#example_dict

In [11]:
# Rank the classes once by probability (highest first) and keep the best five;
# the dict keeps that ranking order, so its keys double as the ordered list
top5 = dict(sorted(example_dict.items(), key=lambda item: item[1], reverse=True)[:5])
print(top5)
list(top5)

{'EC001744': 0.20987080851972142, 'EC000025': 0.19697617181854982, 'EC002710': 0.12423120593071767, 'EC002706': 0.069734923591760034, 'EC001094': 0.044266738890940675}


['EC001744', 'EC000025', 'EC002710', 'EC002706', 'EC001094']

In [12]:
# The probabilities must be matched against the classes present in the training
# set, not against the classes of the whole data set: the two counts differ,
# as displayed beneath.
print(labels.unique().shape, len(example_dict))
len(ytrain.unique())

(1814,) 1771


1771

In [13]:
# Lookup table from ETIM class code (first CSV column) to its description in
# Norwegian (second CSV column)
ETIM_dict = pd.read_csv('ETIM7.csv', sep=';', header=None, index_col=0)[1].to_dict()
#ETIM_dict

In [14]:
# Text box for entering a technical description; its initial value is a
# Norwegian prompt ("Fyll inn teknisk beskrivelse her")
inputText = widgets.Text(value='Fyll inn teknisk beskrivelse her')

# Callback that classifies the text currently in the box
def predict_ETIM(sender, top_n=5, min_probability=0.05):
    """Print the most likely ETIM classes for the text in ``inputText``.

    Meant to be used as the callback of the ``inputText`` widget. The text is
    vectorised with ``vectorizer`` and scored with ``clf``; the best
    suggestions are printed (in Norwegian) with their class code, class
    description and probability as a whole percentage.

    Parameters
    ----------
    sender : object
        Whatever the widget passes to its callback; not used.
    top_n : int, default 5
        Maximum number of suggestions to print.
    min_probability : float, default 0.05
        Suggestions with a lower probability are not printed.
    """
    text = inputText.value

    # One probability per class, as a flat array
    probabilities = clf.predict_proba(vectorizer.transform([text])).flatten()

    # The columns of predict_proba follow clf.classes_, so pair them directly
    # instead of re-deriving the sorted training classes on every call.
    # tolist() turns the labels into plain Python values.
    ranked = sorted(
        zip(clf.classes_.tolist(), probabilities),
        key=lambda pair: pair[1],
        reverse=True,
    )[:top_n]

    print('Tekst:', text)
    print('')

    for position, (etim_code, probability) in enumerate(ranked, start=1):

        # The ranking is descending, so nothing further down can pass either
        if probability < min_probability:
            break

        print('Forslag', position, ':')
        # A class without an entry in ETIM_dict is printed with an empty description
        print('Forventet ETIM-klasse:', etim_code, ETIM_dict.get(etim_code, ''))
        print('Forventet treffsikkerhet:', "%.0f" % (probability * 100), '%')
        print('')
    
# Register predict_ETIM as the widget's submit callback, then display the widget
# NOTE(review): on_submit is reportedly deprecated in newer ipywidgets releases - confirm against the installed version
inputText.on_submit(predict_ETIM)
inputText

Text(value='Fyll inn teknisk beskrivelse her')

In [15]:
## Importing Excel-file with technical descriptions that need to be classified

# Name of the spreadsheet to classify
file = 'testimport.xlsx'

# Read sheet 'Sheet1' with the 'elnr' column as index. The ExcelFile is used as
# a context manager so the workbook is closed again once the sheet is parsed.
with pd.ExcelFile(file) as workbook:
    xl = workbook.parse('Sheet1', index_col='elnr')

# See first technical description and the number of rows
print(xl.iloc[0,0])
len(xl)

FASTNØKKEL 1000V 23MM


103

In [16]:
# Show the shape and the Python type of the loaded sheet
print(xl.shape, type(xl))

(103, 1) <class 'pandas.core.frame.DataFrame'>


In [17]:
# Create an empty list to store the analysis results
# (predict_ETIM_raw appends to it, so re-run this cell to start from an empty list)
rows_list = []

# Predict ETIM-classes for a number of technical descriptions (for use in analyze_xl, not standalone use)
# For standalone use, see predict_ETIM()
def predict_ETIM_raw(text, top_n=5, min_probability=0.05):
    """Collect the most likely ETIM classes for one technical description.

    The text is vectorised with ``vectorizer`` and scored with ``clf``. Every
    suggestion that passes the threshold is printed and appended to the
    module-level ``rows_list`` as ``{text: [ETIM code, description, percent]}``,
    where percent is the probability as a whole-number string.

    Parameters
    ----------
    text : str
        The technical description to classify.
    top_n : int, default 5
        Maximum number of suggestions to record.
    min_probability : float, default 0.05
        Suggestions with a lower probability are not recorded.
    """
    # One probability per class, as a flat array
    probabilities = clf.predict_proba(vectorizer.transform([text])).flatten()

    # The columns of predict_proba follow clf.classes_, so pair them directly
    # instead of re-deriving the sorted training classes on every call.
    # tolist() turns the labels into plain Python values.
    ranked = sorted(
        zip(clf.classes_.tolist(), probabilities),
        key=lambda pair: pair[1],
        reverse=True,
    )[:top_n]

    for etim_code, probability in ranked:

        # Check the threshold before touching ETIM_dict, so classes that are
        # not reported never need a description. The ranking is descending,
        # so nothing further down can pass either.
        if probability < min_probability:
            break

        # A class without an entry in ETIM_dict gets an empty description
        suggestion = {
            text: [etim_code, ETIM_dict.get(etim_code, ''), "%.0f" % (probability * 100)]
        }
        print(suggestion)
        rows_list.append(suggestion)
           

In [None]:
# Run the analysis on the data from the Excel-file
def analyze_xl(file):
    """Run predict_ETIM_raw on every technical description of a spreadsheet.

    Parameters
    ----------
    file : pandas.DataFrame or path
        Either an already loaded sheet whose first column holds the technical
        descriptions, or the path of an Excel file; a path is read from sheet
        'Sheet1' with the 'elnr' column as index.

    Notes
    -----
    The argument itself is analysed; the function does not read the global
    ``xl``. predict_ETIM_raw is called once per row, in row order.
    """
    if isinstance(file, pd.DataFrame):
        descriptions = file
    else:
        with pd.ExcelFile(file) as workbook:
            descriptions = workbook.parse('Sheet1', index_col='elnr')

    # Find text from the sheet to analyze: the first column, row by row
    for description in descriptions.iloc[:, 0]:
        predict_ETIM_raw(description)
        


In [None]:
# Run the analysis (prints results and writes to rows_list)
# The DataFrame loaded from the Excel file is passed as the argument.
analyze_xl(xl)

{'FASTNØKKEL 1000V 23MM': ['EC002130', 'Fastnøkkel', '100']}
{'Vannpumpetang 117mm': ['EC002223', 'Vannpumpetang', '49']}
{'Vannpumpetang 117mm': ['EC011919', 'Vannpumpetang', '41']}
{'Vannpumpetang 117mm': ['EC011817', 'Tangsett', '7']}
{'SKJØTEHYLSE, SKRU 630': ['EC000910', 'Endeavslutning', '100']}
{'Dimmer LED 120W Tapas S100': ['EC000025', 'Dimmer', '100']}
{'BRENDA Taklampe Matt Nikkel': ['EC001743', 'Pendelarmatur', '92']}
{'Fib.panel 12xLCD SM Preterm': ['EC000748', 'Fiber pigtail', '100']}
{'Doro Comfort 3000': ['EC001744', 'Spotlight', '85']}
{'Doro Comfort 3000': ['EC002892', 'Tak og veggarmatur', '6']}
{'Målebånd 10m 13mm stål': ['EC000149', 'Verktøysett', '100']}
{'Heiseøre EVA-09 galv.': ['EC002600', 'Boks/kapslinger for montering på vegg/tak', '61']}
{'Heiseøre EVA-09 galv.': ['EC000447', 'Koblingsklemmer', '22']}
{'Heiseøre EVA-09 galv.': ['EC001005', 'Montasjemateriell for kabelføringssystem', '17']}
{'Cat 5e installasjonskabel 305m': ['EC001262', 'Patchekabel, kobber'

In [None]:
# Inspect the collected suggestions: one {description: [ETIM code, name, percent]} dict per suggestion
rows_list

In [None]:
# Build a DataFrame from the list of suggestion dicts and transpose it, so that
# every technical description becomes a row
output_df = pd.DataFrame(rows_list).transpose()
output_df

In [None]:
# Squeeze the values to the left
def squeeze_nan(x):
    """Return a copy of Series ``x`` with its non-missing values moved to the front.

    The result keeps the labels of ``x`` in their original order: the first
    labels carry the non-missing values (in their original order), and the
    remaining labels are filled with NaN.
    """
    labels = x.index.tolist()

    # Keep only the values that are present and give them the leading labels
    present = x.dropna()
    present.index = labels[:len(present)]

    # Restore the full label set; labels without a value become NaN
    return present.reindex(labels, fill_value=np.nan)

# Left-align every row, then drop the columns in which no row has a value left.
# The number of columns that remain equals the largest number of suggestions
# made for any single description.
output_df = output_df.apply(squeeze_nan, axis=1).dropna(axis=1, how='all')
output_df

In [None]:
# Expand every suggestion column into three named columns. Each cell of
# output_df holds a [ETIM code, name, probability] list, or NaN where a
# description has fewer suggestions. The loop works for however many suggestion
# columns are present, instead of assuming a fixed number of them.
# NOTE: this overwrites output_df, so re-run the cells above before re-running this one.
FIELD_NAMES = ['ETIM-kode', 'Navn', 'Forventet sannsynlighet']

suggestion_frames = []
for rank in output_df.columns:
    # Missing suggestions become a row of NaN so that every row has three fields
    records = [
        list(cell) if isinstance(cell, (list, tuple)) else [np.nan] * len(FIELD_NAMES)
        for cell in output_df[rank]
    ]
    suggestion_frames.append(
        pd.DataFrame(records, index=output_df.index, columns=FIELD_NAMES)
    )

# Put the per-suggestion frames side by side; the three column names repeat
# once per suggestion. pd.concat cannot take an empty list, hence the guard.
if suggestion_frames:
    output_df = pd.concat(suggestion_frames, axis=1)
else:
    output_df = pd.DataFrame(index=output_df.index)

output_df

In [None]:
# Export the result to an Excel file. The writer is used as a context manager,
# which saves and closes the workbook on exit (ExcelWriter.save() is no longer
# available in current pandas); sheet_name is passed by keyword.
with pd.ExcelWriter('testoutput.xlsx', engine='xlsxwriter') as writer:
    output_df.to_excel(writer, sheet_name='Sheet1')