In [1]:
import numpy as np
import pandas as pd
import sklearn

In [2]:
print(sklearn.__version__)

0.22.2.post1


In [3]:
df = pd.read_csv('data/training_data.csv').convert_dtypes()

In [4]:
df

Unnamed: 0,Skill_Knowledge,Label
0,.NET,.Net
1,.NET CORE,.Net
2,A.I.,Artificial Intelligence
3,A-I,Artificial Intelligence
4,ACCESS,Access
...,...,...
144,VOICE RECOGNITION,Voice Recognition
145,WEB APPLICATION DEVELOPMENT,Web Development
146,WEB APPLICATIONS DEVELOPMENT,Web Development
147,WEB DEV,Web Development


In [5]:
df.dtypes

Skill_Knowledge    string
Label              string
dtype: object

**REMINDER:** Machine learning models in scikit-learn expect the input data to always be two dimensional and will error if given one-dimensional data. These models allow the output data to be one-dimensional.

In [6]:
X_train = np.array(df.Skill_Knowledge.tolist())
y_train_text = np.array(df.Label.tolist())

In [7]:
X_train

array(['.NET', '.NET CORE', 'A.I.', 'A-I', 'ACCESS', 'ACCESS VBA',
       'ACTIVPLANT', 'AI', 'ASK', 'ASK NETWORK', 'ASP.NET',
       'AZURE STUDIO', 'BI', 'B.I.', 'BIG DATA', 'B.O.', 'BO',
       'BUSINESS INTELLIGENCE', 'BUSINESS OBJECTS', 'C', 'C#', 'C++',
       'CAE', 'CAE SIMULATION', 'CATIA', 'CORE MQ', 'COREMQ',
       'CRYSTAL REPORTS', 'CSS', 'DATABASE DEVELOPMENT', 'DB2', 'DBA',
       'DEEP LEARNING', 'DISCRETE EVENT SIMULATION', 'DL', 'ELP GALC',
       'ENOVIA', 'ETL', 'EXCEL', 'EXCEL VBA', 'FAILURE FORECASTING',
       'FORECASTING', 'GALC', 'GIQ', 'GLOBAL INTELLIGENCE OF QUALITY',
       'HADOOP', 'HAM GALC', 'HCM GALC', 'HIVE', 'HMA GALC', 'HMI GALC',
       'HMIN GALC', 'HTML', 'IBM DB2', 'IBM QMF', 'IBM SPSS',
       'IMAGE RECOGNITION', 'JAVA', 'JAVASCRIPT', 'JUPITER', 'JUPYTER',
       'KERAS', 'KNIME', 'L.E.T.', 'L-E-T', 'LET', 'LOGISTICS',
       'MACHINE INTELLIGENCE', 'MACHINE LEARNING', 'MARIA DB', 'MARIADB',
       'MAXIMO', 'MICROSOFT ACCESS', 'MICROSOFT ACC

In [8]:
y_train_text

array(['.Net', '.Net', 'Artificial Intelligence',
       'Artificial Intelligence', 'Access', 'VBA', 'ActivPlant',
       'Artificial Intelligence', 'ASK Network', 'ASK Network',
       'Web Development', 'Azure Studio', 'Business Intelligence',
       'Business Intelligence', 'Big Data', 'Business Objects',
       'Business Objects', 'Business Intelligence', 'Business Objects',
       'C', 'C#', 'C++', 'Computer Aided Engineering',
       'Computer Aided Engineering', 'Catia', 'Core MQ', 'Core MQ',
       'Crystal Reports', 'Web Development', 'Database Development',
       'DB2', 'Database Development', 'Deep Learning', 'Simulation',
       'Deep Learning', 'GALC', 'Enovia', 'ETL', 'Excel', 'VBA',
       'Forecasting', 'Forecasting', 'GALC', 'GiQ', 'GiQ', 'Hadoop',
       'GALC', 'GALC', 'Hive', 'GALC', 'GALC', 'GALC', 'Web Development',
       'DB2', 'QMF', 'SPSS', 'Image Recognition', 'Java',
       'Web Development', 'Jupyter', 'Jupyter', 'Keras', 'KNIME',
       'L.E.T.', 'L.E.T.'

In [9]:
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

In [10]:
label_enc = LabelEncoder()
Y = label_enc.fit_transform(y_train_text)

In [11]:
Y

array([ 0,  0,  4,  4,  2, 70,  3,  4,  1,  1, 72,  5,  7,  7,  6,  8,  8,
        7,  8,  9, 10, 11, 13, 13, 12, 14, 14, 15, 72, 17, 16, 17, 18, 63,
       18, 23, 20, 19, 21, 70, 22, 22, 23, 24, 24, 25, 23, 23, 26, 23, 23,
       23, 72, 16, 53, 58, 27, 28, 72, 29, 29, 31, 30, 32, 32, 32, 33, 34,
       34, 35, 35, 36,  2, 70, 21, 70, 47, 48, 48, 60, 34, 37, 37,  2, 70,
       21, 48, 48, 60, 38, 39, 40, 40, 40, 39, 73, 41, 41, 41, 42, 44, 45,
       45, 43, 46, 46, 47, 48, 47, 48, 43, 49, 50, 52, 51, 54, 53, 55, 56,
       57, 61, 62, 62, 63, 61, 64, 49, 49, 65, 58, 59, 60, 66, 66, 59, 67,
       68, 69, 69, 69, 22, 59, 59, 70, 71, 72, 72, 72, 72], dtype=int64)

In [12]:
Y.shape

(149,)

In [13]:
X_train.shape

(149,)

In [14]:
classifier = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=0.0001, random_state=42,
                          max_iter=100, tol=None))])

In [15]:
classifier.fit(X_train, Y)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                ('clf',
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                     

In [16]:
X_test = np.array(['MS EXCEL',
                   'AI',
                   'TF',
                   'MICROSOFT EXCEL',
                   'TIME SERIES FORECASTING'
                  ])

In [17]:
predicted = classifier.predict(X_test)
all_labels = label_enc.inverse_transform(predicted)

In [18]:
all_labels

array(['Excel', 'Artificial Intelligence', 'TensorFlow', 'Excel',
       'Forecasting'], dtype='<U27')

In [19]:
for item, label in zip(X_test, all_labels):
    if item in X_train:
        print(f"{item} => {label}")
    else:
        print(f"{item} => {item} - No label used, not in training data")

MS EXCEL => Excel
AI => Artificial Intelligence
TF => TensorFlow
MICROSOFT EXCEL => Excel
TIME SERIES FORECASTING => Forecasting
