Machine Learning
=====

##### Important concepts
* ENG - Electrical Number Group - a grouping used to divide product into rough categories.
* ETIM class - a grouping used to divide products into detailed categories.
* Technical description - a text field describing each product.

In [1]:
# Import packages
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import scipy.sparse as sp

%config InlineBackend.figure_format = 'retina'

# Change working directory to get the file
os.chdir('/Users/pkg/Springboard')

# Open pickled file from the data wrangling section and set working directory
with open('data_wrangling.pickle', 'rb') as handle:
    df = pickle.load(handle)
    
os.chdir('/Users/pkg/Springboard/Intermediate Data Science with Python/Python_Capstone')

In [2]:
# Separating the labels from the rest of the data set
labels = df['ETIM_class']
ex_var = df[['EN_group', 'Technical_description']]

print("labels'  shape:", labels.shape)
print("ex_var's shape:", ex_var.shape)
print("")
print("labels is of type", type(labels))
print("ex_var is of type", type(ex_var))
print("")
print("first label:", labels.iloc[0])
print("first ENG:", ex_var.iloc[0,0])
print("first technical description:", ex_var.iloc[0,1])

labels'  shape: (202216,)
ex_var's shape: (202216, 2)

labels is of type <class 'pandas.core.series.Series'>
ex_var is of type <class 'pandas.core.frame.DataFrame'>

first label: EC003251
first ENG: 10
first technical description: VDF/EMC frekvensomformerkabel med symmetrisk jordleder. Dobbel skjerming bestående av folie og flettet fortinnet kobberskjerm. Laget for å gi lavest mulig koblingsimpedans. Kan benyttes utendørs. For spenning 0,6/1Kv med en testspenning på 4000V


In [3]:
# Creating a Pandas Series of the technical descriptions
text = df['Technical_description']

# Creating the corpus
vectorizer = CountVectorizer()

# Build the vocabulary
vectorizer.fit(text)

# Convert text to a bag of words, returns a Compressed Sparse Row matrix
# This is suitable for a matrix that is primarily made up of zeroes.
x = vectorizer.transform(text)

# Convert CSR_matrix to a dense matrix for easier slicing where necessary
#x_mat = x.todense()

# Convert to array
#x = x.toarray()

In [4]:
# Let's take a look at the first technical description
print("After the vectorization, there are", x[0].sum(), "words in the first technical description.")

After the vectorization, there are 32 words in the first technical description.


In [5]:
# This is how the vectorizer has counted the frequency of the words in the first technical description.
# The first column constains the order of the almost 100 000 words in the corpus, the second column contains the
# word in the technical description and the third is the frequency of that word in the first technical description.
# Note that since two words occurs twice, only the 30 first rows are needed (instead of 32).
# Also note how "VDF/EMC" is vectorized to "vdf" and "emc", and "0,6/1Kv" is vectorized to just "1kv" and
# how all letters are in lower case.

first = []
for i in range(30):
    for key, value in vectorizer.vocabulary_.items():
        if value == x.indices[i]:
            first.append({'order': value, 'frequency': x.data[i], 'word': key})
            
first = pd.DataFrame(first)[['order','word', 'frequency']].set_index('order')
print(first)
text.iloc[0]

                        word  frequency
order                                  
4832                     1kv          1
11396                  4000v          1
21095                     av          1
23010               benyttes          1
23327              bestående          1
29895                 dobbel          1
32279                    emc          1
32344                     en          1
36393                flettet          1
36714                  folie          1
36747                    for          2
37598              fortinnet          1
37965  frekvensomformerkabel          1
39633                     gi          1
47542              jordleder          1
48683                    kan          1
50231           kobberskjerm          1
50332       koblingsimpedans          1
53403                  laget          1
53897                 lavest          1
58234                    med          2
60915                  mulig          1
63434                     og          1


'VDF/EMC frekvensomformerkabel med symmetrisk jordleder. Dobbel skjerming bestående av folie og flettet fortinnet kobberskjerm. Laget for å gi lavest mulig koblingsimpedans. Kan benyttes utendørs. For spenning 0,6/1Kv med en testspenning på 4000V'

In [6]:
'''
word_features = vectorizer.get_feature_names()
print(word_features.index('vdf'))
print(x[0, word_features.index('vdf')])
print(x[0, vectorizer.vocabulary_['vdf']])
print(type(x))
print(x.indices[0])
'''

"\nword_features = vectorizer.get_feature_names()\nprint(word_features.index('vdf'))\nprint(x[0, word_features.index('vdf')])\nprint(x[0, vectorizer.vocabulary_['vdf']])\nprint(type(x))\nprint(x.indices[0])\n"

In [7]:
'''
# All tokens in corpus
print("Number of products:", x.shape[0])
print("Number of tokens in the corpus:", x.shape[1])

print("")
print("Tokens:",vectorizer.get_feature_names())
'''

'\n# All tokens in corpus\nprint("Number of products:", x.shape[0])\nprint("Number of tokens in the corpus:", x.shape[1])\n\nprint("")\nprint("Tokens:",vectorizer.get_feature_names())\n'

In [8]:
print(x.shape)
print(df['ETIM_class'].shape)

print(type(x))
print(type(df['ETIM_class']))

(202216, 94697)
(202216,)
<class 'scipy.sparse.csr.csr_matrix'>
<class 'pandas.core.series.Series'>


In [9]:
x_df = pd.DataFrame(x.toarray(), index=df.index)

print(x_df.head(), df['EN_group'].head())

x_df.insert(0, 'ENG', df['EN_group'])

               0      1      2      3      4      5      6      7      8      \
ProductNumber                                                                  
1000000            0      0      0      0      0      0      0      0      0   
1000001            0      0      0      0      0      0      0      0      0   
1000003            0      0      0      0      0      0      0      0      0   
1000004            0      0      0      0      0      0      0      0      0   
1000005            0      0      0      0      0      0      0      0      0   

               9      ...    94687  94688  94689  94690  94691  94692  94693  \
ProductNumber         ...                                                      
1000000            0  ...        0      0      0      0      0      0      0   
1000001            0  ...        0      0      0      0      0      0      0   
1000003            0  ...        0      0      0      0      0      0      0   
1000004            0  ...        0     

In [10]:
print(x_df.head())
print(x_df.shape)

              ENG  0  1  2  3  4  5  6  7  8  ...    94687  94688  94689  \
ProductNumber                                 ...                          
1000000        10  0  0  0  0  0  0  0  0  0  ...        0      0      0   
1000001        10  0  0  0  0  0  0  0  0  0  ...        0      0      0   
1000003        10  0  0  0  0  0  0  0  0  0  ...        0      0      0   
1000004        10  0  0  0  0  0  0  0  0  0  ...        0      0      0   
1000005        10  0  0  0  0  0  0  0  0  0  ...        0      0      0   

               94690  94691  94692  94693  94694  94695  94696  
ProductNumber                                                   
1000000            0      0      0      0      0      0      0  
1000001            0      0      0      0      0      0      0  
1000003            0      0      0      0      0      0      0  
1000004            0      0      0      0      0      0      0  
1000005            0      0      0      0      0      0      0  

[5 rows x 9

In [11]:
import sys
def sizeof_fmt(num, suffix='b'):
    ''' By Fred Cirera, after https://stackoverflow.com/a/1094933/1870254'''
    for unit in ['',' K',' M',' G',' T',' P',' E',' Z']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)

for name, size in sorted(((name, sys.getsizeof(value)) for name,value in locals().items()),
                         key= lambda x: -x[1])[:10]:
    print("{:>30}: {:>8}".format(name,sizeof_fmt(size)))

                          x_df: 142.7 Gb
                            df: 110.9 Mb
                        ex_var:  98.3 Mb
                          text:  86.9 Mb
                        labels:  24.1 Mb
                         first:   2.4 Kb
               CountVectorizer:   1.4 Kb
               TfidfVectorizer:   1.0 Kb
                 MultinomialNB:   1.0 Kb
                           _i5:   934.0b


In [None]:
x_df = x_df.tocsc()

In [None]:
# Setting up X and y
X = x_df
y = labels

# Create the test and training sets
xtrain, xtest, ytrain, ytest = train_test_split(X, y)

# Train the classifier over the training set, and test on the test set
clf = MultinomialNB().fit(xtrain, ytrain)
NB_train_accuracy = clf.score(xtrain, ytrain)
NB_test_accuracy = clf.score(xtest, ytest)

# Accuracy scores for both the training and test sets
print("Training accuracy:", round(NB_train_accuracy, 2))
print("Testing accuracy", round(NB_test_accuracy, 2))

In [None]:
words = np.array(vectorizer.get_feature_names())
z = np.eye(x.shape[1])

In [None]:
probs = clf.predict_log_proba(z)[:, 0] # takes 6,5min

In [None]:
ind = np.argsort(probs)

print(words[ind[:10]])
print(words[ind[-10:]])

print(probs[ind[:10]])
print(probs[ind[-10:]])

In [None]:
top_ten_view = pd.DataFrame(top_ten_view)[['order','word', 'frequency']].set_index('order')
top_ten_view

In [None]:
z = np.eye(x.shape[1])
probs = clf.predict_log_proba(z)[:, 0]
ind = np.argsort(probs)

In [None]:
good_words = words[ind[:10]]
bad_words = words[ind[-10:]]

good_prob = probs[ind[:10]]
bad_prob = probs[ind[-10:]]

print("Good words\t     P(fresh | word)")
for w, p in zip(good_words, good_prob):
    print("{:>20}".format(w), "{:.2f}".format(1 - np.exp(p)))
    
print("Bad words\t     P(fresh | word)")
for w, p in zip(bad_words, bad_prob):
    print("{:>20}".format(w), "{:.2f}".format(1 - np.exp(p)))


    
    
'''
text

text



vectorizer = TfidfVectorizer()
response = vectorizer.fit_transform(df['Technical_description'])

print(vectorizer.get_feature_names())

print(response)

dictionary = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))

print(max(dictionary, key=dictionary.get), dictionary[max(dictionary, key=dictionary.get)])

response

df = pd.DataFrame(response.toarray(), columns= vectorizer.get_feature_names())

df['00000'].value_counts()
'''