In [1]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
import os
import numpy as np
import pandas as pd
from collections import Counter
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.corpus import stopwords
stop = stopwords.words('english')
import string
punctuations = list(string.punctuation)
stop = stop + punctuations

In [2]:

def ModPosTag(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# def simple_clean(words):
#     output_words = [w.lower() for w in words if not w.lower() in stop]
#     return output_words

def simple_clean(words):
    output_words = [w for w in words if not w in stop]
    return output_words


# def complex_clean(words):
#     output_words = []
#     for w in words:
#         if w.lower() not in stop:
#             pos = pos_tag([w])
#             clean_w = lemmatizer.lemmatize(w, pos = ModPosTag(pos[0][1]))
#             output_words.append(clean_w.lower())
#     return output_words

# remove lemetize
def complex_clean(words):
    output_words = []
    for w in words:
        if w not in stop:
            pos = pos_tag([w])
            clean_w = lemmatizer.lemmatize(w, pos = ModPosTag(pos[0][1]))
            output_words.append(clean_w)
    return output_words


def read_words(words_dir):

    files = [os.path.join(words_dir, fi) for fi in os.listdir(words_dir)]

    #features_matrix = np.zeros((len(files), 3000))
    docID = 0;
    all_words = []
    all_words_heading = []
    major_words_abstract = []
    for fil in files:
        with open(fil) as fi:
            words_abstract = []
            for i, line in enumerate(fi):
                if (i == 0):
                    temp = line.split()
                    temp1 = simple_clean(temp)
                    
                    words_abstract += temp1
                    all_words_heading += temp1
                else:
                    temp = line.split()
                    temp1 = complex_clean(temp)
                    print(fil)
                    print()
                    print(temp1)
                    words_abstract += temp1
                    all_words += temp1

            major_words_abstract.append(words_abstract)
            len(all_words)
    return major_words_abstract, all_words, all_words_heading

def tag_updater(df, variable_code):
    for key, values in variable_code.items():
        temp = str(key)
        area = variable_code[key]
        for val in area:
            df.at[val-1, temp] = 1

    return df


def read_tags(words_dir):
    files = [os.path.join(words_dir, fi) for fi in os.listdir(words_dir)]

    # features_matrix = np.zeros((len(files), 3000))
    docID = 0;
    tagss = set()
    for fil in files:
        docID+=1
        with open(fil) as fi:
            for i, line in enumerate(fi):
                temp = line.split(',')
                tagss.update(temp)
    return docID, tagss




In [3]:
mydict = {}

def read_files(words_directory, tags_directory):

    wo = [os.path.join(words_directory, wi) for wi in os.listdir(words_directory)]
    ta = [os.path.join(tags_directory, ti) for ti in os.listdir(tags_directory)]
    ctr=0
    for (ab, t) in zip(wo, ta):
        ctr +=1
        with open(t) as su:
            for i, line in enumerate(su):
                tag = line.split(',')
                for q in tag:
                    if q not in mydict:
                        mydict[q] = []
                    mydict[q].append(ctr)

    return mydict,ctr

def extract_features(Words, features):
    feature_matrix = np.zeros((len(Words), len(features)))
    docID = 0
    for doc in Words:
        for word in doc:
            for i in range(len(features)):
                if features[i] == word:
                    wordID = i
                    feature_matrix[docID, wordID] += 1
        docID = docID + 1
    np.shape(feature_matrix)
    print(feature_matrix)
    return feature_matrix

In [4]:
words_dir = 'abstract'
tags_directory = 'tags'
Words, all_words, all_words_heading = read_words(words_dir)


abstract\1.txt

['There', 'many', 'study', 'researcher', 'attempt', 'classify', 'student', 'attentiveness.', 'Many', 'approach', 'depend', 'qualitative', 'analysis', 'lack', 'quantitative', 'analysis.', 'Therefore,', 'work', 'focus', 'bridging', 'gap', 'qualitative', 'quantitative', 'approach', 'classify', 'student', 'attentiveness.', 'Thus,', 'research', 'applies', 'machine', 'learn', 'algorithm', '(K-means', 'SVM)', 'automatically', 'classify', 'student', 'attentive', 'inattentive', 'use', 'data', 'consumer', 'RGB-D', 'sensor.', 'Results', 'research', 'use', 'improve', 'teach', 'strategy', 'instructor', 'level', 'aid', 'instructor', 'implement', 'personalize', 'learn', 'systems,', 'National', 'Academy', 'Engineering', 'Grand', 'Challenge.', 'This', 'research', 'applies', 'machine', 'learn', 'algorithm', 'educational', 'setting.', 'Data', 'algorithm', 'use', 'instructor', 'provide', 'valuable', 'feedback', 'effectiveness', 'instructional', 'strategy', 'pedagogies.', 'Instructors', 'us

abstract\19.txt

['This', 'paper', 'investigates', 'improve', 'fuzzy', 'multicategory', 'support', 'vector', 'machine', 'classifier', '(IFMSVM).', 'It', 'us', 'knowledge', 'ambiguity', 'associate', 'membership', 'data', 'sample', 'give', 'class', 'relative', 'location', 'origin,', 'improve', 'classification', 'performance', 'high', 'generalization', 'capability.', 'In', 'aspects,', 'classify', 'accuracy', 'new', 'algorithm', 'well', 'classical', 'support', 'vector', 'classification', 'algorithms.', 'Numerical', 'simulation', 'show', 'feasibility', 'effectiveness', 'algorithm']
abstract\2.txt

['This', 'paper', 'firstly', 'analysis', 'actual', 'underwriting', 'method', 'Chinese', 'life', 'insurance', 'companies,', 'point', 'merit', 'shortcoming', 'methods.', 'Then', 'incomplete', 'database', 'insurance', 'company', 'mine', 'data', "mining's", 'association', 'rule', 'algorithm.', 'Thirdly', 'support', 'vector', 'machine', '(SVM)', 'apply', 'underwriting', 'process', 'classify', 'applican

abstract\28.txt

['The', 'rapid', 'converge', 'big', 'data', 'IoT', '(Internet', 'Things)', 'technology', 'provide', 'opportunity', 'area', 'road', 'traffic', 'applications.', 'In', 'paper,', 'discus', 'timeline', 'visualization', 'tool', 'enables', 'u', 'well', 'understand', 'traffic', 'behavior', 'road', 'traffic', 'big', 'data.']
abstract\29.txt

['In', 'paper,', 'describe', 'ongoing', 'research', 'aim', 'define', 'Requirements', 'Engineering', 'Artefact', 'Model', '(REAM)', 'context', 'Big', 'Data', 'software', 'applications.', 'This', 'model', 'aim', 'provide', '“big', 'picture”', 'Requirements', 'Engineering', 'work', 'product', 'create', 'use', 'Big', 'Data', 'software', 'development', 'projects.', 'REAM', 'important', 'tool', 'use', 'reference', 'definition', 'domain-specific', 'RE', 'models,', 'system', 'life-cycle', 'process', 'artefact-centered', 'processes,', 'currently', 'bereft', 'Big', 'Data', 'Software', 'Engineering', 'research.']
abstract\3.txt

['Decision', 'tree', '

abstract\39.txt

['Centers', 'Medicare', 'Medicaid', 'Services', '(CMS)', 'publishes', 'Medicare', 'Part', 'C', 'Star', 'Ratings', 'year', 'measure', 'quality', 'care', 'Medicare', 'Advantage', '(MA)', 'contracts.', 'One', 'key', 'measure', 'Complaints', 'Health', 'Plan,', 'capture', 'Complaints', 'Tracking', 'Module', '(CTM).', 'Complaints', 'result', 'CTM', 'rare', 'events:', 'MA', 'contract', '2-5', 'star', 'ratings,', 'number', 'complaint', 'every', '1,000', 'member', 'range', '.10', '1.84', 'last', '5', 'years.', 'Reducing', 'number', 'complaint', 'extremely', 'important', 'MA', 'plan', 'impact', 'CMS', 'reimbursement', 'MA', 'plans.', 'Forecasting', 'reduce', 'complaint', 'extremely', 'technically', 'challenge', 'task,', 'involves', 'ethic', 'consideration', "patients'", 'right', 'privacy.', 'In', 'research,', 'construct', 'big', 'data', 'analytics', 'framework', 'forecasting', 'rare', 'customer', 'complaints.', 'First,', 'built', 'big', 'data', 'ingestion', 'pipeline', 'Hadoop',

abstract\46.txt

['Big', 'Data', 'constitutes', 'opportunity', 'company', 'empower', 'analysis.', 'However,', 'moment', 'standard', 'way', 'approach', 'Big', 'Data', 'projects.', 'This,', 'couple', 'complex', 'nature', 'Big', 'Data,', 'cause', 'many', 'Big', 'Data', 'project', 'fail', 'rarely', 'obtain', 'expect', 'return', 'investment.', 'In', 'paper,', 'present', 'methodology', 'tackle', 'Big', 'Data', 'project', 'systematic', 'way,', 'avoid', 'aforementioned', 'problems.', 'To', 'end,', 'review', 'state', 'art,', 'identify', 'prominent', 'problem', 'surround', 'Big', 'Data', 'projects,', 'best', 'practice', 'methods.', 'Then,', 'define', 'methodology', 'describe', 'step', 'step', 'technique', 'could', 'apply', 'combine', 'order', 'tackle', 'problem', 'identify', 'increase', 'success', 'rate', 'Big', 'Data', 'projects.']
abstract\47.txt

['In', 'August', '2015,', 'new', 'seafloor', 'observatory', 'deployed', 'Galway', 'Bay,', 'Ireland.', 'The', 'sensor', 'observatory', 'platform', 'c

abstract\a11.txt

['Support', 'vector', 'machine', '(SVM)', 'become', 'popular', 'tool', 'pattern', 'recognition', 'recent', 'year', 'outstanding', 'learn', 'performance.', 'When', 'deal', 'large-scale', 'learn', 'problems,', 'incremental', 'SVM', 'framework', 'generally', 'use', 'SVM', 'summarize', 'data', 'space', 'concise', 'way.', 'This', 'paper', 'proposes', 'training', 'algorithm', 'incremental', 'SVM', 'recombine', 'method.', 'Considering', 'difference', 'data', 'distribution', 'impact', 'new', 'training', 'data', 'history', 'data,', 'history', 'training', 'dataset', 'new', 'training', 'one', 'divide', 'independent', 'group', 'recombine', 'train', 'classifier.', 'In', 'fact,', 'method', 'implement', 'parallel', 'structure', 'action', 'divide', 'may', 'decrease', 'computation', 'complexity', 'training', 'SVM.', 'Meanwhile,', 'action', 'recombine', 'may', 'weaken', 'potential', 'impact', 'cause', 'difference', 'data', 'distribution.', 'The', 'experiment', 'result', 'text', 'datase

abstract\a20.txt

['Routing', 'optical,', 'especially', 'wavelength', 'division', 'multiplexing', 'networks,', 'hard', 'task.', 'This', 'paper', 'defines', 'new', 'rout', 'algorithm,', 'base', 'Hopfield', 'neural', 'network.', 'It', 'improvement', 'previous', 'research,', 'apply', 'optical', 'communication']
abstract\a21.txt

['The', 'information', 'storage', 'mechanism', 'biological', 'neural', 'network', 'important', 'problem', 'neuroscience.', 'Our', 'observation', 'show', 'structure', 'hippocampus,', 'core', 'memory', 'brain,', 'similar', "Hopfield's", 'neural', 'network.', 'An', 'electronic', 'neuronic', 'model', 'construct', 'simulate', 'dynamic', 'process', 'hippocampal', 'LTP', 'process.', 'The', 'kinetic', 'process', 'post', 'synaptic', 'potential', 'synaptic-synaptic', 'interaction', 'equation', 'also', 'discussed.', 'We', 'propose', 'dual', 'cod', 'theory', 'biological', 'neural', 'information', 'assume', 'messy', 'fiber', 'synaptic', 'glomerulus', 'may', 'chief', 'storage',

abstract\a30.txt

['Summary', 'form', 'given.', 'In', 'area', 'brain,', 'numerous', 'neuron', 'constitute', 'elaborate', 'networks.', 'These', 'network', 'link', 'numerous', 'connections,', 'compose', 'large-scaled', 'neural', 'systems.', 'The', 'neural', 'system', 'generate', 'major', 'brain', 'function', 'movement,', 'cognition,', 'emotion,', 'memory-learning.', 'The', 'brain', 'extensively', 'study', 'anatomically,', 'physiologically', 'chemically,', 'knowledge', 'ever', 'grows', 'cover', 'every', 'detail', 'brain.', 'Important', 'principle', 'activity-dependent', 'synaptic', 'plasticity,', 'multilayered', 'integration', 'neuronal', 'networks,', 'modular', 'organization', 'brain', 'tissue', 'revealed.', 'Yet,', 'shortage', 'knowledge', 'obvious', 'one', 'try', 'reproduce', 'brain', 'function', 'models.', 'While', 'simple', 'perceptron', 'model,', 'adaptive', 'filter', 'model,', 'feedforward', 'adaptive', 'control', 'system', 'model', 'successfully', 'reproduce', 'function', 'cerebel

abstract\a37.txt

['The', 'recent', 'development', 'learn', 'deep', 'representation', 'demonstrate', 'wide', 'application', 'traditional', 'vision', 'task', 'like', 'classification', 'detection.', 'However,', 'little', 'investigation', 'could', 'build', 'deep', 'learn', 'framework', 'weakly', 'supervise', 'setting.', 'In', 'paper,', 'attempt', 'model', 'deep', 'learn', 'weakly', 'supervise', 'learn', '(multiple', 'instance', 'learning)', 'framework.', 'In', 'setting,', 'image', 'follow', 'dual', 'multi-instance', 'assumption,', 'object', 'proposal', 'possible', 'text', 'annotation', 'regard', 'two', 'instance', 'sets.', 'We', 'thus', 'design', 'effective', 'system', 'exploit', 'MIL', 'property', 'deep', 'learn', 'strategy', 'two', 'ends;', 'also', 'try', 'jointly', 'learn', 'relationship', 'object', 'annotation', 'proposals.', 'We', 'conduct', 'extensive', 'experiment', 'prove', 'weakly', 'supervise', 'deep', 'learn', 'framework', 'achieves', 'convincing', 'performance', 'vision', 'tas

abstract\a45.txt

['In', 'paper,', 'introduce', 'distribute', 'deep', 'learn', 'platform,', 'BAIPAS,', 'Big', 'Data', 'AI', 'base', 'Predication', 'Analysis', 'System.', 'In', 'case', 'deep', 'learn', 'use', 'big', 'data,', 'take', 'much', 'time', 'train', 'data.', 'To', 'reduce', 'training', 'time,', 'method', 'us', 'distribute', 'deep', 'learning.', 'When', 'big', 'data', 'exists', 'external', 'storage,', 'training', 'take', 'long', 'time', 'take', 'lot', 'network', 'I/O', 'time', 'data', 'load', 'deep', 'learn', 'operations.', 'We', 'propose', 'data', 'locality', 'management', 'way', 'reduce', 'training', 'time', 'big', 'data.', 'BAIPAS', 'distribute', 'deep', 'learn', 'platform', 'aim', 'provide', 'quick', 'learn', 'big', 'data,', 'easy', 'installation', 'monitoring', 'platform,', 'convenience', 'developer', 'deep', 'learn', 'models.', 'In', 'order', 'provide', 'fast', 'training', 'use', 'big', 'data,', 'data', 'distribute', 'store', 'worker-server', 'storage', 'use', 'data', 'loca

abstract\a53.txt

['Hand', 'gesture', 'recognition', 'one', 'major', 'research', 'area', 'field', 'Human', 'computer', 'interaction', '(HCl).', 'This', 'paper', 'proposes', 'deep', 'reinforcement', 'learn', 'algorithm', 'recognize', 'human', 'arm', 'movement', 'pattern', 'use', 'IoT', 'sensor', 'device.', 'Recent', 'study', 'explore', 'supervise', 'learn', 'base', 'methods,', 'CNN', 'RNN', 'implement', 'HCl', 'device.', 'On', 'hand,', 'deep', 'reinforcement', 'learn', 'approach', 'also', 'investigated.', 'Algorithms', 'use', 'approach,', 'learn', 'pattern', 'sensor', 'use', 'reward', 'feedback', 'class', 'labels.', 'This', 'allows', 'user', 'control', 'IoT', 'device', 'produce', 'desire', 'arm', 'movement', 'pattern', 'without', 'create', 'labels.', 'In', 'paper,', 'performance', 'convolutional', 'neural', 'network', '(CNN)', 'DQN', 'model', 'compare', 'long', 'short-term', 'memory', '(LSTM)', 'model', 'DQN.', 'Results', 'show', 'CNN', 'base', 'DQN', 'model', 'stable', 'compare', 'LSTM

abstract\a63.txt

['Summary', 'form', 'given,', 'follows.', 'A', 'Hopfield', 'model', 'neural', 'network', 'useful', 'form', 'parallel', 'computer.', 'Such', 'neural', 'network', 'may', 'capable', 'arrive', 'problem', 'solution', 'much', 'speed', 'conventional,', 'sequential', 'approaches.', 'This', 'concept', 'apply', 'problem', 'generate', 'control', 'bit', 'multistage', 'interconnection', 'network.', 'A', 'Hopfield', 'model', 'neural', 'network', 'design', 'capable', 'rout', 'set', 'messages.', 'This', 'neural', 'network', 'solution', 'especially', 'useful', 'interconnection', 'network', 'self-routing', 'interconnection', 'network', 'irregular', 'structure.', 'Furthermore,', 'neural', 'network', 'rout', 'scheme', 'fault-tolerant.', 'Results', 'obtain', 'generate', 'route', '4*4', 'Benes', 'interconnection', 'network.<>']
abstract\a64.txt

['A', 'disturbance-rejection', 'neural', 'network', 'control', 'scheme', 'present', 'control', 'unknown', 'nonlinear', 'plant.', 'In', 'scheme,', 

abstract\a72.txt

['A', 'USAF', 'sponsor', 'MITRE', 'research', 'team', 'undertook', 'four', 'separate,', 'domain-specific', 'case', 'study', 'Big', 'Data', 'applications.', 'Those', 'case', 'study', 'initial', 'investigation', 'question', 'whether', 'data', 'quality', 'issue', 'encounter', 'Big', 'Data', 'collection', 'substantially', 'different', 'cause,', 'manifestation,', 'detection', 'data', 'quality', 'issue', 'encounter', 'traditionally', 'size', 'data', 'collections.', 'The', 'study', 'address', 'several', 'factor', 'affect', 'Big', 'Data', 'Quality', 'multiple', 'levels,', 'include', 'collection,', 'processing,', 'storage.', 'Though', 'unexpected,', 'key', 'finding', 'study', 'reinforce', 'primary', 'factor', 'affect', 'Big', 'Data', 'reside', 'limitation', 'complexity', 'involve', 'handle', 'Big', 'Data', 'maintain', 'integrity.', 'These', 'concern', 'high', 'magnitude', 'provenance', 'data,', 'processing,', 'tool', 'use', 'prepare,', 'manipulate,', 'store', 'data.', 'Data', 

abstract\a8.txt

['A', 'new', 'method', 'early', 'fault', 'diagnosis', 'manufacturing', 'system', 'base', 'machine', 'learn', 'presented.', 'It', 'necessary', 'manufacturing', 'enterprise', 'detect', 'state', 'production', 'process', 'real', 'time,', 'order', 'find', 'early', 'fault', 'machines,', 'loss', 'production', 'failure', 'investment', 'facility', 'maintenance', 'minimized.', 'This', 'paper', 'proposes', 'new', 'fault', 'diagnosis', 'model,', 'extract', 'multi-dimension', 'feature', 'detect', 'signal', 'supervise', 'different', 'feature', 'signal', 'simultaneously.', 'Based', 'model,', 'method', 'inductive', 'learn', 'adopt', 'obtain', 'statistical', 'boundary', 'vector', 'signal', 'automatically,', 'normal', 'feature', 'space', 'built,', 'accord', 'abnormal', 'signal', 'detected,', 'consequently', 'fault', 'complicate', 'system', 'found', 'easily.', 'Furthermore,', 'condition', 'without', 'exist', 'fault', 'samples,', 'precise', 'result', 'fault', 'diagnosis', 'also', 'achieve

abstract\a87.txt

['Artificial', 'intelligence,', 'particularly', 'machine', 'learning,', 'use', 'many', 'way', 'research', 'community', 'turn', 'variety', 'diverse', 'even', 'heterogeneous', 'data', 'source', 'high', 'quality', 'fact', 'knowledge,', 'provide', 'premier', 'capability', 'accurate', 'pattern', 'discovery.', 'However,', 'apply', 'machine', 'learn', 'strategy', 'big', 'complex', 'datasets', 'computationally', 'expensive,', 'consumes', 'large', 'amount', 'logical', 'physical', 'resources,', 'data', 'file', 'space,', 'CPU,', 'memory.', 'A', 'sophisticated', 'platform', 'efficient', 'big', 'data', 'analytics', 'become', 'important', 'day', 'data', 'amount', 'generate', 'daily', 'basis', 'exceeds', 'quintillion', 'bytes.', 'Apache', 'Spark', 'MLlib', 'one', 'prominent', 'platform', 'big', 'data', 'analysis', 'offer', 'set', 'excellent', 'functionality', 'different', 'machine', 'learn', 'task', 'range', 'regression,', 'classification,', 'dimension', 'reduction', 'cluster', 'rul

In [5]:
cou, Tags = read_tags(tags_directory)
#df = pd.DataFrame(Tags)
Words = np.array(Words)
# exploring frequency of all words not in heading
import nltk
freq = nltk.FreqDist(all_words + all_words_heading)
common = freq.most_common(3000)
common = list(common)

features = []
features += [w[0] for w in common]
features += [w for w in all_words_heading if w not in common]
print(len(common))
print(len(features))


3000
4147


In [6]:


variable_code, ctr = read_files(words_dir, tags_directory)
columns = list(Tags)
rows = list(range(ctr))
dataFrame = np.zeros((len(rows), len(columns)))
#df = tag_updater(dataFrame, variable_code, columns)

df = pd.DataFrame(data = dataFrame, index = rows, columns = columns, dtype='int64')
# print(variable_code)
Y = tag_updater(df, variable_code)
#print(df.at[1, ' Machine learning '])
# for x in df['Computers']:
#     print(x)

display(Y)
Y.fillna(0, inplace=True)

X = extract_features(Words, features)
X_train = pd.DataFrame(data = X, index = rows, columns = features, dtype='int64')


# using binary relevance
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
# initialize binary relevance multi-label classifier
# with a gaussian naive bayes base classifier
classifier = BinaryRelevance(GaussianNB())
# train
classifier.fit(X_train, Y)
# predict
#predictions = classifier.predict(X_train)
#print(predictions.toarray())
#print(accuracy_score(Y, predictions))

Unnamed: 0,Hilbert spaces,Fuzzy control,ï»¿Handheld computers,Benes interconnection network,Predictive models,output units,Artificial Neural Networ (ANN),flexible structures,data analytics problems,accident risk determination,...,Application software,Optical computing,Accelerometers,maximal margin classification,data sharing,discriminative loss function optimization,Business,video frames,Semantics,deep neural network
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


[[1. 5. 0. ... 0. 0. 0.]
 [4. 7. 3. ... 0. 0. 0.]
 [6. 3. 0. ... 0. 0. 0.]
 ...
 [7. 5. 0. ... 0. 0. 0.]
 [3. 0. 0. ... 0. 0. 0.]
 [0. 1. 4. ... 3. 3. 1.]]


BinaryRelevance(classifier=GaussianNB(priors=None, var_smoothing=1e-09),
        require_dense=[True, True])

In [7]:
words_dir = 'testing'
Words, waste1, waste2 = read_words(words_dir)
X = extract_features(Words, features)
rows = list(range(1))
X_test = pd.DataFrame(data = X, columns = features, dtype='int64')


[[0. 0. 0. ... 0. 0. 0.]]


In [8]:
display(X_test)
predictions = classifier.predict(X_test)

Unnamed: 0,data,learn,The,use,neural,big,machine,Data,deep,network,...,Composition,ï»¿A,method,choose,kernel,function,parameters,support,vector,machines
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
display(predictions)
predictions.shape[0]

<1x1774 sparse matrix of type '<class 'numpy.int64'>'
	with 10 stored elements in Compressed Sparse Column format>

1

In [10]:
from scipy.sparse import csc_matrix
print(type(predictions))
a = predictions.nonzero()
#a.row[a.data]

<class 'scipy.sparse.csc.csc_matrix'>


In [11]:
def find_index(self):
    A = self.tocoo()
    nz_mask = A.data != 0
    return (list(A.col[nz_mask]))

In [12]:
print((predictions.shape[0], len(columns)))
dataFrame5 = np.zeros((predictions.shape[0], len(columns)))
ct = 0
for i in predictions:
    b = find_index(i)
    for j in b:
        dataFrame5[ct,j] +=1
    ct+=1
print(dataFrame5)
for i in dataFrame5:
    print(len(i))

(1, 1774)
[[0. 0. 0. ... 0. 0. 0.]]
1774


In [13]:
print(type(dataFrame))

converting = pd.DataFrame(data = dataFrame5, columns = columns, dtype='int64')
display(converting)

<class 'numpy.ndarray'>


Unnamed: 0,Hilbert spaces,Fuzzy control,ï»¿Handheld computers,Benes interconnection network,Predictive models,output units,Artificial Neural Networ (ANN),flexible structures,data analytics problems,accident risk determination,...,Application software,Optical computing,Accelerometers,maximal margin classification,data sharing,discriminative loss function optimization,Business,video frames,Semantics,deep neural network
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
df.to_csv('file_predicted_skills.csv', sep='\t', encoding='utf-8')

In [15]:
we_have = []
print('------------------------TECHNICAL SKILLS REQUIRED-----------------------------')
for i in b:
    print(columns[i])
    we_have.append(columns[i])
    
#we_have.append('artificial intelligence')
#we_have.append('Machine learning algorithms')
#we_have.append('Support vector machines')
#we_have.append('Kernel')

print(we_have)

------------------------TECHNICAL SKILLS REQUIRED-----------------------------
 Machine learning algorithms 
 Computer science 
 Classification algorithms 
 Training data 
 Learning systems 
 Educational institutions 
 Support vector machines 
 Support vector machine classification 
 Machine learning 
 Data models 
[' Machine learning algorithms ', ' Computer science ', ' Classification algorithms ', ' Training data ', ' Learning systems ', ' Educational institutions ', ' Support vector machines ', ' Support vector machine classification ', ' Machine learning ', ' Data models ']


In [16]:
#this is keyword extractor

In [17]:
#next comes the wiki reader

In [18]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import nltk
import pickle


lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))


def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''


def load_skill_set():

    skill_dict = dict()
    pick_file = open("final_skill_dict.pickle", "rb")
    skill_dict = pickle.load(pick_file)
    pick_file.close()
    return skill_dict


skill_dict = load_skill_set()


def get_skills(skill_dic,topic):

    # print("getting skill set for ",topic)
    skill_set = set()
    for word in topic.split(' '):
        # print("for word=",word)
        if word.strip().isalpha():
            word = word.strip().lower()
            wordnet_pos = get_wordnet_pos(nltk.pos_tag([word])[0][1])
            if wordnet_pos == '':
                word = lemmatizer.lemmatize(word)
            else:
                word = lemmatizer.lemmatize(word, wordnet_pos)

            try:
                curr_skill_set = skill_dic[word]
                # print("skill set=",curr_skill_set)
                if len(skill_set) == 0:
                    skill_set = curr_skill_set
                else:
                    skill_set = skill_set.intersection(curr_skill_set)

                #print("intersection=",skill_set)
            except KeyError as e:
                print("no skill set found",e)

    return skill_set


print(get_skills(skill_dict,'opportunity'))


{'identification and evaluation of market opportunities', 'opportunity management', 'identifying and seizing fast breaking opportunities', 'evaluation of commercial opportunities'}


In [19]:
import urllib.request
from bs4 import BeautifulSoup


def get_page(topic):
    domain = "https://en.wikipedia.org"
    html = urllib.request.urlopen("https://en.wikipedia.org/w/index.php?search="+topic.replace(' ','+')+"&title=Special%3ASearch&go=Go")
    soup = BeautifulSoup(html, features="lxml")
    first_result = soup.find(attrs={"data-serp-pos": "0"})
    if first_result is None:
        print('page-found')
        return soup
    href = first_result.get('href')
    print('opening first-result')
    html = urllib.request.urlopen(domain+href)
    soup = BeautifulSoup(html, features="lxml")
    return soup


def get_first_para(topic):
    soup = get_page(topic)
    text_section = soup.find(attrs={'class': 'mw-parser-output'})
    text = ''
    for child in text_section.children:
        # print('for tag', child.name, child)
        try:
            if child is not None:
                if child.name == 'p':
                    text += child.text.lower()
                elif child.name == 'div' and 'toc' in child['class']:
                    break
        except Exception as e:
            print("exception", e)
    return text

vo_set = set()

In [20]:
import re
# cleaning we _have 
dore = []
for x in we_have:
    regex = re.compile('[^a-zA-Z]')
    #First parameter is the replacement, second parameter is your input string
    #regex.sub('', x)
    dore.append(re.sub("[^a-zA-Z_ ]*", "", x))

for y in dore:
    print(y)

 Machine learning algorithms 
 Computer science 
 Classification algorithms 
 Training data 
 Learning systems 
 Educational institutions 
 Support vector machines 
 Support vector machine classification 
 Machine learning 
 Data models 


In [21]:
import csv
with open('tech2.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writerow(dore)

In [22]:
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize


if __name__ == '__main__':

    #print(get_first_para('computer vision'))
    for original in dore: 
        print(original)
        w1=get_first_para(original)
        w2=word_tokenize(w1)
        
        print(w1)

        for word in w2:                                                                 
            if word not in stop and word.__len__()>3 and word.isalpha():
                w3 = get_skills(skill_dict, word)
                for x in w3:
                    vo_set.add(x)
                
                


 Machine learning algorithms 
page-found
the following outline is provided as an overview of and topical guide to machine learning. machine learning is a subfield of soft computing within computer science that evolved from the study of pattern recognition and computational learning theory in artificial intelligence.[1] in 1959, arthur samuel defined machine learning as a "field of study that gives computers the ability to learn without being explicitly programmed".[2] machine learning explores the study and construction of algorithms that can learn from and make predictions on data.[3] such algorithms operate by building a model from an example training set of input observations in order to make data-driven predictions or decisions expressed as outputs, rather than following strictly static program instructions.
subfields of machine learning
cross-disciplinary fields involving machine learning
applications of machine learning
machine learning hardware
machine learning tools   (list)
ma

no skill set found 'computer'
no skill set found 'emphasizes'
no skill set found 'application'
no skill set found 'considers'
no skill set found 'description'
no skill set found 'computational'
no skill set found 'computer'
no skill set found 'involves'
no skill set found 'interaction'
no skill set found 'considers'
no skill set found 'challenge'
no skill set found 'computer'
no skill set found 'usable'
no skill set found 'accessible'
no skill set found 'early'
no skill set found 'foundation'
no skill set found 'would'
no skill set found 'become'
no skill set found 'computer'
no skill set found 'science'
no skill set found 'predate'
no skill set found 'invention'
no skill set found 'modern'
no skill set found 'computer'
no skill set found 'machine'
no skill set found 'calculate'
no skill set found 'fix'
no skill set found 'numerical'
no skill set found 'abacus'
no skill set found 'exist'
no skill set found 'since'
no skill set found 'antiquity'
no skill set found 'computation'
no skill

no skill set found 'computer'
no skill set found 'usage'
no skill set found 'expert'
no skill set found 'initially'
no skill set found 'computer'
no skill set found 'quite'
no skill set found 'costly'
no skill set found 'degree'
no skill set found 'humanitarian'
no skill set found 'efficient'
no skill set found 'part'
no skill set found 'computer'
no skill set found 'operator'
no skill set found 'computer'
no skill set found 'adoption'
no skill set found 'become'
no skill set found 'widespread'
no skill set found 'affordable'
no skill set found 'less'
no skill set found 'assistance'
no skill set found 'usage'
no skill set found 'despite'
no skill set found 'history'
no skill set found 'formal'
no skill set found 'academic'
no skill set found 'discipline'
no skill set found 'computer'
no skill set found 'science'
no skill set found 'number'
no skill set found 'contribution'
no skill set found 'science'
no skill set found 'fact'
no skill set found 'along'
no skill set found 'electronics'

no skill set found 'computation'
no skill set found 'general'
no skill set found 'principal'
no skill set found 'specific'
no skill set found 'computation'
no skill set found 'practical'
no skill set found 'complementary'
no skill set found 'discipline'
no skill set found 'academic'
no skill set found 'computer'
no skill set found 'science'
no skill set found 'tend'
no skill set found 'depend'
no skill set found 'whether'
no skill set found 'department'
no skill set found 'form'
no skill set found 'mathematical'
no skill set found 'computer'
no skill set found 'science'
no skill set found 'department'
no skill set found 'mathematics'
no skill set found 'numerical'
no skill set found 'orientation'
no skill set found 'consider'
no skill set found 'computational'
no skill set found 'science'
no skill set found 'type'
no skill set found 'department'
no skill set found 'tend'
no skill set found 'effort'
no skill set found 'bridge'
no skill set found 'field'
no skill set found 'educationally

no skill set found 'mean'
no skill set found 'usually'
no skill set found 'safety'
no skill set found 'security'
no skill set found 'utmost'
no skill set found 'importance'
no skill set found 'formal'
no skill set found 'described'
no skill set found 'application'
no skill set found 'fairly'
no skill set found 'broad'
no skill set found 'variety'
no skill set found 'theoretical'
no skill set found 'computer'
no skill set found 'science'
no skill set found 'particular'
no skill set found 'logic'
no skill set found 'calculus'
no skill set found 'formal'
no skill set found 'automaton'
no skill set found 'semantics'
no skill set found 'also'
no skill set found 'type'
no skill set found 'algebraic'
no skill set found 'type'
no skill set found 'hardware'
no skill set found 'specification'
no skill set found 'computer'
no skill set found 'computer'
no skill set found 'computer'
no skill set found 'largely'
no skill set found 'central'
no skill set found 'processing'
no skill set found 'unit'


no skill set found 'processing'
no skill set found 'mark'
no skill set found 'sensitive'
no skill set found 'card'
no skill set found 'paper'
no skill set found 'tape'
no skill set found 'usually'
no skill set found 'select'
no skill set found 'student'
no skill set found 'produce'
no skill set found 'classroom'
no skill set found 'computer'
no skill set found 'become'
no skill set found 'student'
no skill set found 'computer'
no skill set found 'science'
no skill set found 'student'
no skill set found 'importance'
no skill set found 'recognise'
no skill set found 'become'
no skill set found 'compulsory'
no skill set found 'part'
no skill set found 'national'
no skill set found 'curriculum'
no skill set found 'september'
no skill set found 'become'
no skill set found 'entitlement'
no skill set found 'pupil'
no skill set found 'school'
no skill set found 'district'
no skill set found 'decide'
no skill set found 'curriculum'
no skill set found 'provision'
no skill set found 'fracture'
no

no skill set found 'variable'
no skill set found 'independent'
no skill set found 'variable'
no skill set found 'regressors'
no skill set found 'predict'
no skill set found 'know'
no skill set found 'outcome'
no skill set found 'consider'
no skill set found 'possible'
no skill set found 'dependent'
no skill set found 'variable'
no skill set found 'machine'
no skill set found 'observation'
no skill set found 'often'
no skill set found 'know'
no skill set found 'instance'
no skill set found 'explanatory'
no skill set found 'variable'
no skill set found 'feature'
no skill set found 'grouped'
no skill set found 'feature'
no skill set found 'vector'
no skill set found 'possible'
no skill set found 'predict'
no skill set found 'class'
no skill set found 'field'
no skill set found 'different'
no skill set found 'terminology'
no skill set found 'ecology'
no skill set found 'classification'
no skill set found 'normally'
no skill set found 'refers'
no skill set found 'cluster'
no skill set found

page-found
an educational institution is a place where people of different ages gain an education.[1] examples of some institutions are preschools, primary schools, secondary schools, and further and higher education. they provide a large variety of learning environments and learning spaces. the institution can be public, private or unconventional.
the american educational system typically divides learning facilities by an age grade system. students are designated to a grade level based on their age, advancing one grade each year. they are required to learn and do tasks at this level or they will be set back a grade. this designation determines what educational institution would be an appropriate setting for the individual student.

no skill set found 'educational'
no skill set found 'institution'
no skill set found 'place'
no skill set found 'different'
no skill set found 'age'
no skill set found 'gain'
no skill set found 'education'
no skill set found 'institution'
no skill set found

no skill set found 'possible'
no skill set found 'unsupervised'
no skill set found 'require'
no skill set found 'natural'
no skill set found 'cluster'
no skill set found 'form'
no skill set found 'cluster'
no skill set found 'algorithm'
no skill set found 'hava'
no skill set found 'siegelmann'
no skill set found 'vladimir'
no skill set found 'vapnik'
no skill set found 'applies'
no skill set found 'support'
no skill set found 'vector'
no skill set found 'developed'
no skill set found 'support'
no skill set found 'vector'
no skill set found 'machine'
no skill set found 'algorithm'
no skill set found 'categorize'
no skill set found 'unlabeled'
no skill set found 'widely'
no skill set found 'cluster'
no skill set found 'algorithm'
no skill set found 'industrial'
no skill set found 'application'
no skill set found 'citation'
 Machine learning 
page-found
machine learning (ml) is the scientific study of algorithms and statistical models that computer systems use to effectively perform a spe

In [23]:
vo_set

{'risk assessments',
 'conducting procurement',
 'information architecture',
 'direct marketing',
 'technology scope creep management (feature creep)',
 'preserving and enhancing reputation',
 'program management plans',
 'product concept design',
 'financial analysis for program decisions',
 'strategic cost management',
 'self confidence',
 'scientific diagrams',
 'sales force management',
 'strategic decision making',
 'information analysis',
 'using project management tools',
 'mentoring & coaching',
 'recognizing employee performance',
 'communications',
 'advertising strategy and management',
 'organization',
 'managing large accounts',
 'diagrams',
 'diplomacy techniques',
 'tactical decision making',
 'managing productivity',
 'developing project management plans',
 'program priorities',
 'social proof',
 'politics',
 'managing stress',
 'corporate budgets and financial statements',
 'scope management',
 'niche marketing',
 'introducing yourself',
 'risk impact analysis',
 'lead

In [24]:
from nltk.stem import WordNetLemmatizer
import pickle


lemmatizer = WordNetLemmatizer()


def load_skill_set2():

    skill_dict = dict()
    pick_file = open("final_skill_dict_2.pickle", "rb")
    skill_dict = pickle.load(pick_file)
    pick_file.close()
    return skill_dict


def get_skill2(skill_dic,topic):
    return skill_dic[ lemmatizer.lemmatize(topic.strip()) ]


# skill_dict = make_skill_dict()
skill_dict2 = load_skill_set2()

print(get_skill2(skill_dict2,'procurement management'))



facilitation skills


In [25]:

print(len(vo_set))

1037


In [26]:
new_set = set()
for i in vo_set:
    if i in skill_dict2:
        new_set.add(get_skill2(skill_dict2,i))

len(new_set)

114

In [27]:
print('--------------NON- TECHNICAL SKILLS---------------')
for word in new_set:
    print(word)
my_list2 = list(new_set)

--------------NON- TECHNICAL SKILLS---------------
legal & compliance
observing & analyzing
influencing to negotiate
hard bargaining
business & product development
idea formation
leadership of visual communication
persuasion techniques
scope management
sale
legal, risk & compliance
negotiation skills
quality management
program lifecycle
marketing
operation
reporting & communication
perceiving emotions
sales strategy
time management
technology
sales pipeline management
information visualization
targeted communication
negotiation
communication skills
mba skills
organizational behavior
marketing & sales
conflict resolution
personal skills
visual abilities
basic leadership skills
procurement management
artistic abilities
core executive leadership skills
executive management
presentation skills
core skills
managing teams
brand management
problem solving
technology (for it project management)
establishing rapport
interpersonal skills
innovation
emotional intelligence
marketing approaches
ent

In [28]:
import csv
with open('nontech2.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writerow(my_list2)

In [29]:
# trying out reading just the extract for the technical skills
import os
files = [os.path.join('testing', fi) for fi in os.listdir('testing')]
my_text = []
from nltk.tokenize import word_tokenize
for fil in files:
    with open(fil) as fi:
            line = fi.read()
            temp= word_tokenize(line)

In [30]:
temp2 = list(temp)
my_text =  [w.lower() for w in temp2 if not w.lower() in stop]

In [31]:
my_text

['support', 'vector', 'machines']

In [32]:
q_set = set()
for q in my_text:
    if q in skill_dict:
        veu = get_skills(skill_dict,q)
        for qw in veu:
            q_set.add(qw)

In [33]:
for qw in q_set:
    print(qw)

In [34]:
len(q_set)

0

In [35]:
qq_set = set()
for i in q_set:
    qq_set.add(get_skill2(skill_dict2,i))

In [36]:
temp = []
for ele in qq_set:
    temp = temp + (word_tokenize(ele))

In [37]:
dict123 = {}
my_final = {}
my_final['others'] = set()
print(temp)
for i in temp:
    if i in dict123:
        dict123[i] = dict123[i]+1
    else:
        dict123[i] = 1
for ele in qq_set:
    w = word_tokenize(ele)
    for i in w:
        if dict123[i]>5:
            if i in my_final:
                my_final[i].add(ele)
            else:
                my_final[i] = set()
                my_final[i].add(ele)
        if dict123[i]<2:
            my_final['others'].add(ele)
len(qq_set)
len(my_final)

[]


1

In [38]:
my_final

{'others': set()}

In [39]:
### --------    Anupams dictionary  --------  ###


# from nltk.stem import WordNetLemmatizer
# import pickle
# import xlrd


# lemmatizer = WordNetLemmatizer()

# # print(sheet.cell_value(0,0))


# def make_skill_dict_final():

#     file = ("test_files/skills_anupam.xlsx")

#     wb = xlrd.open_workbook(file)
#     sheet = wb.sheet_by_index(0)

#     # print(len(lines))
#     skill_dict = {}

#     root_skill = ''
#     for i in range(sheet.nrows):
#         if sheet.cell_value(i,0) != '':
#             root_skill = lemmatizer.lemmatize(sheet.cell_value(i, 0).strip().lower())

#         for j in range(sheet.ncols):

#             if sheet.cell_value(i, j) != '':
#                 key = lemmatizer.lemmatize(sheet.cell_value(i, j).strip().lower())

#                 if key in skill_dict:
#                     skill_dict[key].add(root_skill)
#                 else:
#                     skill_dict[key] = {root_skill}

#     print("printing dict.......")
#     for key in skill_dict:
#         print(str(key).ljust(40) + str(skill_dict[key]).rjust(40))

#     pick_file = open("anupam_skill_dict.pickle","wb")
#     pickle.dump(skill_dict, pick_file)
#     pick_file.close()

#     return skill_dict


# def load_skill_set_final():

#     pick_file = open("anupam_skill_dict.pickle", "rb")
#     skill_dict = pickle.load(pick_file)
#     pick_file.close()
#     return skill_dict


# def get_skill_final(skill_dic,topic):
#     topic = lemmatizer.lemmatize(topic.strip().lower())
#     if topic in skill_dic:
#         return skill_dic[topic]
#     else:
#         return None


# # skill_dict = make_skill_dict()
# skill_dict_final = load_skill_set()

# print(get_skill_final(skill_dict_final,'motivate'))
# # while True:
# #     inp = input("topic:")
# #     print(get_skill(skill_dict,inp))

In [40]:
print('----------------Technical skills---------------')
for zzz in dore:
    print(zzz)
print()
print('----------------Technical skills---------------')
print()

for i in my_final:
    
    print('--------------------------------')
    print('             --', i , '--')
    print('---------------------------------')
    for j in my_final[i]:
        print(j, ',')
        

----------------Technical skills---------------
 Machine learning algorithms 
 Computer science 
 Classification algorithms 
 Training data 
 Learning systems 
 Educational institutions 
 Support vector machines 
 Support vector machine classification 
 Machine learning 
 Data models 

----------------Technical skills---------------

--------------------------------
             -- others --
---------------------------------
