Machine Learning
=====

##### Important concepts
* ENG - Electrical Number Group - a grouping used to divide product into rough categories.
* ETIM class - a grouping used to divide products into detailed categories.
* Technical description - a text field describing each product.

In [1]:
# Import packages
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import scipy.sparse as sp
from stop_words import get_stop_words
from sklearn.metrics import classification_report
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE, SMOTENC
from collections import Counter
from sklearn.metrics import confusion_matrix

%config InlineBackend.figure_format = 'retina'

# Change working directory to get the file
os.chdir('/Users/pkg/Springboard')

# Open pickled file from the data wrangling section and set working directory
with open('data_wrangling.pickle', 'rb') as handle:
    df = pickle.load(handle)
    
os.chdir('/Users/pkg/Springboard/Intermediate Data Science with Python/Python_Capstone')

In [2]:
# Sort by df by ETIM_classes for the clf.predict_proba-method later
df = df.sort_values(['ETIM_class', 'ENG'], ascending=True)

In [3]:
# Separating the labels from the rest of the data set
labels = df['ETIM_class']
ex_var = df[['ENG', 'Technical_description']]

# Display some attributes of the dataset
print("labels'  shape:", labels.shape)
print("ex_var's shape:", ex_var.shape)
print("")
print("labels is of type", type(labels))
print("ex_var is of type", type(ex_var))
print("")
print("first label after sorting:", labels.iloc[0])
print("first ENG after sorting:", ex_var.iloc[0,0])
print("first technical description after sorting:", ex_var.iloc[0,1])

labels'  shape: (202216,)
ex_var's shape: (202216, 2)

labels is of type <class 'pandas.core.series.Series'>
ex_var is of type <class 'pandas.core.frame.DataFrame'>

first label after sorting: EC000001
first ENG after sorting: 11
first technical description after sorting: Direkte tilkoblingsklemmer Al/Cu 70/300mm2,sett à 4 stk. L1/L2/L3/PEN


In [4]:
# Importing stop words that will be ignored
stop_words = get_stop_words('norwegian')

# Creating a Pandas Series of the technical descriptions
text = df['Technical_description']

# Creating the corpus
vectorizer = CountVectorizer(stop_words=stop_words, ngram_range=(1,2), min_df=2)

# Build the vocabulary
vectorizer.fit(text)

# Convert text to a bag of words, returns a Compressed Sparse Row matrix
# This is suitable for a matrix that is primarily made up of zeroes.
x = vectorizer.transform(text)

In [5]:
# Setting up X and y
X = x
y = labels

# Create the test and training sets
xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=99)

In [6]:
# Train the classifier over the training set, and test on the test set
clf = MultinomialNB(alpha=0.0000000001).fit(xtrain, ytrain)
NB_train_accuracy = clf.score(xtrain, ytrain)
NB_test_accuracy = clf.score(xtest, ytest)

# Accuracy scores for both the training and test sets
print("Training accuracy:", round(NB_train_accuracy, 2))
print("Testing accuracy", round(NB_test_accuracy, 2))

Training accuracy: 0.9
Testing accuracy 0.84


In [7]:
# Creating the classification report
y_pred_train = clf.predict(xtrain)
y_pred_test = clf.predict(xtest)

print(classification_report(y_pred_train, ytrain))
print(classification_report(y_pred_test, ytest))

  'recall', 'true', average, warn_for)


              precision    recall  f1-score   support

    EC000001       0.95      0.72      0.82       124
    EC000003       0.96      0.99      0.97       436
    EC000005       1.00      1.00      1.00       473
    EC000006       1.00      1.00      1.00         3
    EC000007       0.95      0.99      0.97       982
    EC000008       0.40      0.80      0.53         5
    EC000009       1.00      1.00      1.00         3
    EC000010       0.94      0.85      0.89       239
    EC000011       0.99      0.97      0.98       372
    EC000012       0.86      0.99      0.92       287
    EC000013       1.00      1.00      1.00         1
    EC000014       0.83      1.00      0.91        24
    EC000016       1.00      1.00      1.00        45
    EC000017       1.00      1.00      1.00        26
    EC000018       0.95      0.98      0.96       567
    EC000019       0.99      0.99      0.99       196
    EC000020       1.00      1.00      1.00         3
    EC000022       0.94    

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

    EC000001       0.82      0.65      0.73        49
    EC000003       0.93      0.97      0.95       143
    EC000005       0.99      0.98      0.99       149
    EC000006       0.00      0.00      0.00         0
    EC000007       0.90      0.98      0.93       294
    EC000008       0.17      1.00      0.29         1
    EC000009       0.50      1.00      0.67         1
    EC000010       0.90      0.85      0.87        73
    EC000011       0.92      0.93      0.93       127
    EC000012       0.85      0.97      0.91        91
    EC000013       1.00      1.00      1.00         2
    EC000014       0.50      0.83      0.62         6
    EC000016       1.00      0.92      0.96        12
    EC000017       0.89      1.00      0.94         8
    EC000018       0.85      0.89      0.87       193
    EC000019       0.91      0.96      0.93        67
    EC000020       0.00      0.00      0.00         1
    EC000022       0.88    

In [8]:
# Creating technical description to be evaluated by the model
text = 'dimmer'
example = clf.predict_proba(vectorizer.transform([text])).flatten()

In [9]:
# Display the predicted class
test = clf.predict(vectorizer.transform([text]))
test[0]

'EC001744'

In [10]:
# Create a dictionary for  the classes and probabilities
example_dict = dict(zip(sorted(ytrain.unique()), example))
#example_dict

In [11]:
# Display the top 5 predictions, with classes and probabilities
top5 = {k: example_dict[k] for k in sorted(example_dict, key=example_dict.__getitem__, reverse=True)[:5]}
print(top5)
sorted(example_dict, key=example_dict.__getitem__, reverse=True)[:5]

{'EC001744': 0.20987080851972142, 'EC000025': 0.19697617181854982, 'EC002710': 0.12423120593071767, 'EC002706': 0.069734923591760034, 'EC001094': 0.044266738890940675}


['EC001744', 'EC000025', 'EC002710', 'EC002706', 'EC001094']

In [12]:
# For the sorting to find the correct classes, it's vital that the classes in the
# training set is used, not the labels for the whole dataset. The difference
# is displayed beneath.
print(labels.unique().shape, len(list(example_dict.values())))
len(ytrain.unique())

(1814,) 1771


1771

The following are miscellaneous code snippets that were tested but not included in the extended model.

In [13]:
'''
# Find which ETIM_classes are rare
ETIM_counts = df['ETIM_class'].value_counts()

# Last ETIM-class with more than 15 products
print((ETIM_counts > 15)[1046])
print((ETIM_counts > 15)[1047])

ETIM_counts_index = np.array(ETIM_counts[0:1046].index)

# Create boolean array which indicates which products that belong to an ETIM-class
# that occurs more than 15 times
keep_EC = []

for i in range(len(df)):
    if df['ETIM_class'].iloc[i] in ETIM_counts_index:
        keep_EC.append(1)
    else:
        keep_EC.append(0)

keep_EC = np.array(keep_EC, dtype='bool')

# Keep only those products that belong to an ETIM-class that occurs more than 15 times
df = df[keep_EC]
'''

"\n# Find which ETIM_classes are rare\nETIM_counts = df['ETIM_class'].value_counts()\n\n# Last ETIM-class with more than 15 products\nprint((ETIM_counts > 15)[1046])\nprint((ETIM_counts > 15)[1047])\n\nETIM_counts_index = np.array(ETIM_counts[0:1046].index)\n\n# Create boolean array which indicates which products that belong to an ETIM-class\n# that occurs more than 15 times\nkeep_EC = []\n\nfor i in range(len(df)):\n    if df['ETIM_class'].iloc[i] in ETIM_counts_index:\n        keep_EC.append(1)\n    else:\n        keep_EC.append(0)\n\nkeep_EC = np.array(keep_EC, dtype='bool')\n\n# Keep only those products that belong to an ETIM-class that occurs more than 15 times\ndf = df[keep_EC]\n"

In [14]:
'''
# Let's take a look at the first technical description
print("After the vectorization, there are", x[0].sum(), "words in the first technical description.")

# This is how the vectorizer has counted the frequency of the words in the first technical description.
# The first column constains the order of the almost 100 000 words in the corpus, the second column contains the
# word in the technical description and the third is the frequency of that word in the first technical description.
# After the stop-word clean-up, there are only 23 words left. The words "med", "av", "og", "for", "å", "kan", "en"
# and "på" are removed from the first technical description (see below).
# Also note how "VDF/EMC" is vectorized to "vdf" and "emc", and "0,6/1Kv" is vectorized to just "1kv" and
# how all letters are in lower case.

first = []
for i in range(x[0].sum()):
    for key, value in vectorizer.vocabulary_.items():
        if value == x.indices[i]:
            first.append({'order': value, 'frequency': x.data[i], 'word': key})
            
first = pd.DataFrame(first)[['order','word', 'frequency']].set_index('order')
print(first)
text.iloc[0]
'''

'\n# Let\'s take a look at the first technical description\nprint("After the vectorization, there are", x[0].sum(), "words in the first technical description.")\n\n# This is how the vectorizer has counted the frequency of the words in the first technical description.\n# The first column constains the order of the almost 100 000 words in the corpus, the second column contains the\n# word in the technical description and the third is the frequency of that word in the first technical description.\n# After the stop-word clean-up, there are only 23 words left. The words "med", "av", "og", "for", "å", "kan", "en"\n# and "på" are removed from the first technical description (see below).\n# Also note how "VDF/EMC" is vectorized to "vdf" and "emc", and "0,6/1Kv" is vectorized to just "1kv" and\n# how all letters are in lower case.\n\nfirst = []\nfor i in range(x[0].sum()):\n    for key, value in vectorizer.vocabulary_.items():\n        if value == x.indices[i]:\n            first.append({\'orde

In [15]:
'''
# Adding the ENGs as dummy variables
dummies = pd.get_dummies(df['ENG'])
dummies = sp.csr_matrix(dummies)

print(type(x), type(dummies))

x = sp.hstack((x, dummies))
type(x)

# Adding the ENG dummies to the df, removing the categorical variable ENG afterwards
#df = pd.concat([df, dummies], axis=1)
#del df['ENG']
#df.head()
'''

"\n# Adding the ENGs as dummy variables\ndummies = pd.get_dummies(df['ENG'])\ndummies = sp.csr_matrix(dummies)\n\nprint(type(x), type(dummies))\n\nx = sp.hstack((x, dummies))\ntype(x)\n\n# Adding the ENG dummies to the df, removing the categorical variable ENG afterwards\n#df = pd.concat([df, dummies], axis=1)\n#del df['ENG']\n#df.head()\n"