In [2]:
from __future__ import unicode_literals, print_function

import pandas as pd
from collections import Counter
import json
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from spacy import displacy

from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

import random
import re
from pprint import pprint
import os
import warnings
warnings.filterwarnings('ignore')

import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding

In [3]:
# Load the annotated NER training examples. As the cell output shows, each
# item is [text, {'entities': [[start, end, label], ...]}].
# The encoding is given explicitly so the read does not depend on the
# platform's default locale encoding (the other file reads in this notebook
# also pass encoding='utf-8').
with open("NER_dataset/Ner_Train_Data.json", "r", encoding="utf-8") as f:
    training_data = json.load(f)

# Display the first example to check the structure.
training_data[0]

['Experience in commonly used for data analysis such as Python, R, Julia, or SAS.\r',
 {'entities': [[32, 45, 'SKILL'],
   [54, 60, 'SKILL'],
   [62, 63, 'SKILL'],
   [65, 70, 'SKILL'],
   [75, 78, 'SKILL']]}]

In [None]:
# Where the trained pipeline is saved, and how many passes are made over the
# training data.
output_dir = 'ner_model_Skills_me/'
n_iter = 120
# Load the pretrained English pipeline and list its components.
nlp = spacy.load ( 'en_core_web_sm' )
print( nlp.pipe_names )

# Create a new "ner" component and put it in place of the pipeline's existing one.
ner = nlp.create_pipe("ner")
nlp.replace_pipe(component=ner, name='ner')

# Register every entity label that occurs in the training annotations.
for _, annotations in training_data:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

# Collect the names of all components other than "ner" so they can be
# disabled while training.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):  # only train NER
    # NOTE(review): the return value of begin_training() is discarded and no
    # `sgd` argument is passed to nlp.update() below - confirm that relying on
    # spaCy's default optimizer is intended.
    nlp.begin_training()
    for itn in range(n_iter):
        # Shuffles training_data in place; no random seed is set in this cell.
        random.shuffle(training_data)
        # Passed to nlp.update() for every batch of this iteration, then printed.
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(training_data, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            # Split the batch of (text, annotation) pairs into two parallel tuples.
            texts, annotations = zip(*batch)
            nlp.update(
                texts,  # batch of texts
                annotations,  # batch of annotations
                drop=0.5,  # dropout - make it harder to memorise data
                losses=losses,
            )
        print("Losses", losses)

# save model to output directory
nlp.to_disk(output_dir)
print("Saved model to", output_dir)
        
# trained this model with 100 iterations in google colab using GPU runtime with loss: Losses {'ner': 465.24571515528373} 

['tagger', 'parser', 'ner']
Losses {'ner': 3513.0107858604156}
Losses {'ner': 2241.33879684698}
Losses {'ner': 1890.5465298151692}
Losses {'ner': 1698.7312965243946}
Losses {'ner': 1450.3776055092833}
Losses {'ner': 1433.0762409560195}
Losses {'ner': 1228.2183340281288}


# Load the trained NER model that recognizes SKILL entities

In [4]:
# Load the pipeline stored in 'ner_model_Skills_me/' (the directory the
# training cell writes to). This rebinds `nlp`, which every later cell uses.
nlp = spacy.load('ner_model_Skills_me/')

In [5]:
# Maps entity text -> count. It is filled by the job-description scan (one
# increment per file in which the text is tagged SKILL) and is later rescaled
# in place to node sizes by the "Skill Graph" section.
skill_set = {}

In [6]:
def remove_urls(vTEXT):
    """Return ``vTEXT`` with every URL-like substring deleted.

    A match is an optional ``http``/``https`` scheme, the literal ``://``,
    and then any run of word characters and ``. / ? = & %`` that ends on a
    word boundary. Because the scheme is optional, text starting at a bare
    ``://`` is removed as well. Strings without ``://`` are returned unchanged.
    """
    url_pattern = r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b'
    return re.sub(url_pattern, '', vTEXT, flags=re.MULTILINE)

In [7]:
# Scan every file under ../JD_files, run the NER model on it sentence by
# sentence, and count in `skill_set` how many files mention each SKILL entity
# (an entity is counted at most once per file).
ind = 0
lower_skill_set = set()

for root, dirs, files in os.walk(r"../JD_files"):
    for file in files:
        ind += 1

        # `with` guarantees the handle is closed even if a later step raises;
        # the original only closed it at the very end of the iteration.
        with open(os.path.join(root, file), encoding='utf-8') as my_file:
            my_string = my_file.readlines()

        # Drop the first line of the file, then strip URLs. "www." links are
        # rewritten with a scheme so the second remove_urls() pass catches them.
        doc_sentences = '\n'.join(my_string[1:])
        doc_sentences = remove_urls(doc_sentences)

        doc_sentences = doc_sentences.replace('www.', 'https://www.')
        doc_sentences = remove_urls(doc_sentences)

        # First pass over the whole text is used only for sentence segmentation.
        doc_sentences = nlp(doc_sentences)

        doc_sentence_list = [str(sen).strip() for sen in doc_sentences.sents if len(str(sen).strip()) > 0]

        # Second pass: tag each sentence on its own and keep the SKILL entities.
        file_list = []
        for tmp_sentences in doc_sentence_list:
            doc = nlp(tmp_sentences)
            file_list += [chunk.text for chunk in doc.ents if chunk.root.ent_type_ == 'SKILL']

        # De-duplicate within the file so each entity counts once per file.
        for tem in list(set(file_list)):
            # NOTE(review): this compares the lower-cased text against the
            # case-sensitive keys of skill_set - confirm that is intended.
            if tem.lower() not in skill_set.keys() and len(str(tem).strip()) > 0:
                lower_skill_set.add(tem.lower())

                # Print roughly 1 in 301 newly seen entities as a spot check.
                if random.randint(0, 300) == 0:
                    print(tem)

            skill_set[tem] = skill_set.get(tem, 0) + 1

        # Progress marker every 1000 files.
        if ind % 1000 == 0:
            print('-' * 120, ind)

cloud-based data warehouses
network
relational database
massive datasets
PRIMARY DUTIES
advanced mathematical techniques
priority referrals
Microsoft Corporation
Hadoop
enterprise-grade
VBA
MS
Brand new office
CA
quantitative problem-solving
True North re-imagines
ENVI/IDL
MD/PhD
PROFESSIONAL
framework
SalesPlanning
Dell EMC
MR&A
Microsoft Office
warehouse(s
validate new features
Waste
Pig
Alpha
MATLAB
phone
General Motors
Django
Conducts research
Executive Orders
Brand new office
multi-discipline team
measure results
Linux
banking sectors
Corporate Technology
R
job qualifications
optimization and validation
USA
Comcast Corporation
data science community
Repair Reports
LDA
SQL
Oracle
MAPReduce On-site
CA
TechCrunch
ExperienceEducation
R
Ph.D.
Teradata
personal credibility
NIBR
SDLC
Microsoft Cloud
C++
SQL
NO PHONE
research proposals
SQL
CESR
CA
------------------------------------------------------------------------------------------------------------------------ 1000
North America
AWS

In [10]:
# Keep the 300 entries of skill_set with the highest counts, ordered from most
# to least common.
my_skill_set = {
    skill: count
    for skill, count in Counter(skill_set).most_common(300)
}
my_skill_set

{'Python': 7645,
 'R': 5665,
 'machine learning': 5227,
 'SQL': 5034,
 'Data Scientist': 4213,
 'Computer Science': 3384,
 'Statistics': 3275,
 'color': 3195,
 'religion': 3147,
 'race': 3052,
 'data science': 3004,
 'statistics': 2974,
 'sexual orientation': 2803,
 'Spark': 2559,
 'Hadoop': 2508,
 'Job': 2382,
 'data analysis': 2339,
 'Machine Learning': 2331,
 'Data Science': 2254,
 'data mining': 2211,
 'Java': 2181,
 'Tableau': 2090,
 'sex': 2021,
 'disability': 2011,
 'SAS': 1979,
 'Mathematics': 1954,
 'analysis': 1831,
 'Engineering': 1831,
 'modeling': 1798,
 'origin': 1725,
 'receive consideration': 1645,
 'without regard': 1571,
 'statistical analysis': 1512,
 'computer science': 1503,
 'AWS': 1479,
 'predictive modeling': 1469,
 'analytics': 1405,
 'Scala': 1388,
 'AI': 1359,
 'age': 1322,
 'PhD': 1293,
 'statistical': 1288,
 'algorithms': 1281,
 'data scientist': 1261,
 'optimization': 1235,
 'communication skills': 1221,
 'Hive': 1220,
 'advanced analytics': 1114,
 'quanti

In [11]:
# Lower-cased names of the retained skills, used for case-insensitive lookups.
skill_set_lower = {str(skill).lower() for skill in my_skill_set}

# ============================================================
## Hierarchical relation extraction based on dependency analysis

In [12]:
# Strings treated as sentence boundaries when splitting raw text.
delimiters = (
    "？", "?", "！", "!", ".", "。", "；", ";",
    "\n", "\r", "·", "•", " -", " o",
)
# Escape each delimiter and join them into a single alternation regex.
regexPattern = '|'.join(re.escape(token) for token in delimiters)
regexPattern

'？|\\?|！|!|\\.|。|；|;|\\\n|\\\r|·|•|\\ \\-|\\ o'

In [13]:
def is_father(word, root):
    """Return True if ``root`` is ``word`` itself or one of its ancestors.

    Follows ``.head`` links upwards from ``word``. The walk stops with False
    when it reaches a node that is its own head without having met ``root``.
    """
    node = word
    while not (node == root):
        parent = node.head
        if not (node != parent):
            # Reached a node that is its own head; nothing above it to check.
            return False
        node = parent
    return True

In [14]:
def if_doc_id_is_left_punct(doc, my_id):
    """Return True if some token in ``doc`` has ``idx == my_id`` and a truthy
    ``is_left_punct`` flag; otherwise return False."""
    return any(token.idx == my_id and token.is_left_punct for token in doc)

In [15]:
def if_doc_id_is_right_punct(doc, my_id):
    """Return True if some token in ``doc`` has ``idx == my_id`` and a truthy
    ``is_right_punct`` flag; otherwise return False."""
    return any(token.idx == my_id and token.is_right_punct for token in doc)

In [16]:
def is_conj_or_appos(tem, word):
    """Return True if ``tem`` reaches ``word`` through a 'compound' or 'conj' link.

    Starting at ``tem``, climb the ``.head`` chain. The result is True as soon
    as a node on the chain has ``word`` as its head and its ``dep_`` is
    'compound' or 'conj'. It is False when ``tem`` is ``word`` itself or when
    the chain ends (a node that is its own head) without such a link.
    """
    if tem == word:
        return False

    linking_deps = ('compound', 'conj')
    node = tem
    while node.head != node:
        parent = node.head
        if parent == word and node.dep_ in linking_deps:
            return True
        node = parent

    return False

# Dependency analysis testing

In [17]:
def get_Date_attribute( string ):
    """Print ``string``, parse it with the loaded pipeline, and render the
    dependency parse inline with displaCy."""
    print( 'SENTENCE:',string )
    parsed = nlp( string )

    # Debug helpers kept for reference:
#     print( [ '[' + str(token) + ']' for token in parsed.ents if token.root.ent_type_ == 'SKILL'] )
#     print( [ '[' + str(token) + ']' for token in parsed.ents] )
    displacy.render(parsed, style='dep', jupyter=True)

In [18]:
# Scratch list of example sentences for inspecting dependency parses.
# `ss` is reassigned on every uncommented line, so only the LAST uncommented
# assignment is the one passed to get_Date_attribute(); the earlier ones are
# kept as alternatives to switch to by commenting/uncommenting.
ss = 'Experience with large data sets and distributed computing (Hive/Hadoop) a plus'

ss = 'Experience with Hadoop stack (HIVE, Pig, Hadoop streaming) and/or Spark a plus'
ss = 'Build tools and support structures needed to analyze data, perform elements of data cleaning, feature selection and feature engineering and organize experiments in conjunction with best practices'
# ss = 'Strong programming skills (preferably with Python), and some C++ knowledge and experience'
# ss 
# ss = 'Proficiency in SQL and one of Python/R for programming, data analysis, and modelling'

# ss = 'We are looking for a very motivated technical analyst to help build the framework of data reporting and analysis for our licensing partners – Labels and Publishers'

ss = 'Experience with languages used for querying (like SQL/Hive)'
ss = 'Advanced knowledge of SAS, C ##, Java, Perl, Python, Scala and r programming'
ss = 'Three years’ experience working with one of the following programming languages: Python, Java, C#, Javascript, R'
ss = 'Web Frameworks (like Django, Flask)'

ss = 'Data Scientist role to work on model building and feature engineering for client protection AI/ML Projects'
ss = 'Machine learning algorithms (supervised, unsupervised learning)'
# Active example: this is the value of `ss` when the call below runs.
ss = 'Hadoop stack (HIVE, Pig, Hadoop streaming)'
# ss = 'We are looking for highly motivated Data Scientists with a strong background in applied statistics, Machine Learning and large scale data analysis'

# ss = 'js Basic proficiency in SQL Able to implement and utilize advanced statistical and machine learning techniques (GLMs, PCA, cluster analyses, ARIMA, ETS, decision trees, SVM, neural networks, ensemble methods, etc) Basic familiarity with project management tools (Asana, Basecamp, LiquidPlanner)'
# Print the active sentence and render its dependency parse.
get_Date_attribute(ss)

SENTENCE: Hadoop stack (HIVE, Pig, Hadoop streaming)


# testing on 300 sample

In [19]:
# Sample run of the skill -> parent-skill link extraction.
#
# For each sentence with at least two SKILL entities, the dependency parse is
# used to propose (child, parent) pairs:
#   * "prep -> pobj": the entity is the object of a preposition; the candidate
#     parent is the word the preposition attaches to.
#   * "appos": the entity is an appositive next to a bracket; the candidate
#     parent is its head.
# Pairs are then extended to entities linked to the child by conj/compound,
# filtered against skill_set_lower, de-duplicated case-insensitively, printed,
# and stored in total_link_list / total_link_set.
ind = 0

total_link_list = []
total_link_set = set()

# The delimiters never change, so the split pattern is built once instead of
# once per file.
delimiters = "？","?", "！","!",".","。","；",";","\n","\r","·","•"," -","*"
regexPattern = '|'.join(map(re.escape, delimiters))

# Processing stops once more than this many files have been read.
max_sample_files = 100

for root, dirs, files in os.walk(r"../JD_files"):
    for file in files:
        ind += 1

        # `with` closes the handle even if parsing below raises.
        with open(os.path.join(root, file), encoding='utf-8') as my_file:
            my_string = my_file.readlines()

        # Drop the first line, strip URLs (including bare "www." links).
        doc_sentences = '\n'.join(my_string[1:])
        doc_sentences = remove_urls(doc_sentences)

        doc_sentences = doc_sentences.replace('www.', 'https://www.')
        doc_sentences = remove_urls(doc_sentences)

        # Remove abbreviation dots so they are not taken as sentence ends.
        doc_sentences = doc_sentences.replace('e.g.', 'like').replace('i.e.', 'like').replace('.)', ')')

        sentences = [str(sentence).strip() for sentence in re.split(regexPattern, doc_sentences) if len(str(sentence).strip()) > 0]

        for string in sentences:
            # Drop a leading bullet character left over from the split.
            if (string[0] == '-' or string[0] == '*') and len(string) > 1:
                string = string[1:]

            doc = nlp(str(string))

            # Root token of every SKILL entity, plus root token -> entity span.
            my_chunks_list = []
            roor_chunks_dict = {}
            for chunk in doc.ents:
                if chunk.root.ent_type_ == 'SKILL':  # and chunk.text.lower() in skill_set_lower
                    my_chunks_list.append(chunk.root)
                    roor_chunks_dict[chunk.root] = chunk

            # A link needs at least two skills in the sentence.
            if len(my_chunks_list) <= 1:
                continue

            Total_List = []
            for word in my_chunks_list:
                if word.dep_ == 'pobj' and word.head.dep_ == 'prep':
                    # prep --> pobj
                    root_word = word.head.head
                elif word.dep_ == 'appos' and (if_doc_id_is_left_punct(doc, word.idx-1) or if_doc_id_is_right_punct(doc, word.idx+1)):
                    # appos
                    # NOTE(review): word.idx+1 is the character offset right
                    # after the token's first character, so the right-bracket
                    # test can only succeed for one-character tokens - confirm
                    # whether word.idx + len(word) was intended.
                    root_word = word.head
                else:
                    continue

                if root_word in my_chunks_list:
                    Total_List.append((word, root_word))

                # Also link to skills attached to the same head by compound/conj.
                if 'N' in word.tag_:
                    for tt in my_chunks_list:
                        if tt.head == root_word and (tt.dep_ == 'compound' or tt.dep_ == 'conj'):
                            Total_List.append((word, tt))

            # Propagate each link to the skills chained to its child.
            Total_List_expend = []
            for tup in Total_List:
                for tem in my_chunks_list:
                    if is_conj_or_appos(tem, tup[0]):
                        Total_List_expend.append((tem, tup[1]))

            Total_List = Total_List + Total_List_expend

            print_flag = True
            for tup in Total_List:
                child = str(roor_chunks_dict[tup[0]])
                parent = str(roor_chunks_dict[tup[1]])
                child_l = child.lower()
                parent_l = parent.lower()

                # Both ends must be known skills.
                if child_l not in skill_set_lower or parent_l not in skill_set_lower:
                    continue
                # Education terms are not accepted as parents.
                if any(marker in parent_l for marker in ('degree', 'master', 'phd')):
                    continue
                if parent_l == 'ms' or parent_l == 'bs':
                    continue

                link_key = child_l + '-->' + parent_l
                if link_key in total_link_set:
                    continue

                # Print the sentence header once, before its first new link.
                if print_flag:
                    print('=' * 120)
                    print('SENTENCE:', string)
                    print()
                    print_flag = False

                print(child + '-->' + parent)
                total_link_set.add(link_key)
                total_link_list.append((child, parent))

        if ind % 500 == 0:
            print('=' * 120, ind)

        if ind > max_sample_files:
            break

    # The inner `break` only leaves the per-directory loop; without this check
    # os.walk would keep visiting the remaining directories.
    if ind > max_sample_files:
        break

SENTENCE: Experience with predictive modeling (classification, regression, parameter tuning, feature selection, validation, performance reporting), preferably with multiple techniques

classification-->predictive modeling
regression-->predictive modeling
SENTENCE: Bachelor’s degree in Finance, Mathematics (Economics) or a closely related quantitative field

Economics-->Mathematics
Economics-->quantitative field
SENTENCE: Design, build and support algorithms of data transformation, conversion, computation on Hadoop, Spark and other distributed Big Data Systems

Hadoop-->algorithms
Spark-->algorithms
SENTENCE: Hadoop stack (HIVE, Pig, Hadoop streaming) and MapReduce

HIVE-->Hadoop
HIVE-->MapReduce
Pig-->Hadoop
Pig-->MapReduce
SENTENCE: Knowledge of one or more machine learning or statistical modeling tools such as R, SAS, MATLAB, or Python (scikit-learn, Theano)

R-->machine learning
SAS-->machine learning
MATLAB-->machine learning
Python-->machine learning
SENTENCE: Applied machine lear

SENTENCE: Have experience with Big Data including Hadoop, Spark, Kafka

Hadoop-->Big Data
Spark-->Big Data
Kafka-->Big Data
SENTENCE: MINIMUM EDUCATIONAL REQUIREMENTS Advanced degree in Data Science or Quantitative field (or Bachelor's degree with 5+ years' experience) ESSENTIAL SKILLS REQUIRED Passion for solving complex problems and searching for the best approach to a problem Strong critical thinking skills Strong ability to quickly grasp deep understanding of our business Expert in building machine learning and statistical models (classification, regression, clustering, etc) Profound understanding of statistics and machine learning Advanced proficiency in a high-level programming language such as Python Advanced proficiency working with databases (NoSQL) and applications to create ETL pipelines in SQL and other languages High skills in using data science languages and frameworks such as Python (pandas, scikit-learn, etc) R, etc

programming language-->machine learning
NoSQL-->datab

SENTENCE: Knowledge of the latest ML techniques like deep learning is a plus, but not a requirement

deep learning-->ML
SENTENCE: Expertise in one or more scripting languages such as Perl or Python and one or more programming languages such as Java or Scala

scripting languages-->programming languages
Perl-->scripting languages
Scala-->programming languages
SENTENCE: Able to select several potential appropriate modeling approaches for a given analytic problem (machine learning methods such as ensemble models, decision trees

decision trees-->machine learning
SENTENCE: If you have experience with Salesforce, Marketo, Google Analytics, and/or Tableau… great

Google Analytics-->Tableau
SENTENCE: Python (Scala, Unix)

Scala-->Python
SENTENCE: Applied experience with machine learning on large datasets (Spark)

large datasets-->machine learning
SENTENCE: Roles & Responsibilities Work with the data science team to analyze large data sets and develop custom models/algorithms to uncover trends,

# Extract and print the links from all files

In [20]:
# Full run of the skill -> parent-skill link extraction over every file under
# ../JD_files.
#
# For each sentence with at least two SKILL entities, the dependency parse is
# used to propose (child, parent) pairs:
#   * "prep -> pobj": the entity is the object of a preposition; the candidate
#     parent is the word the preposition attaches to.
#   * "appos": the entity is an appositive next to a bracket; the candidate
#     parent is its head.
# When the candidate parent is not itself a skill, skills attached to it by a
# 'compound' relation are used instead. Pairs are then extended to entities
# linked to the child by conj/compound, filtered against skill_set_lower,
# de-duplicated case-insensitively, printed, and stored in total_link_list /
# total_link_set.
ind = 0

total_link_list = []
total_link_set = set()

# The delimiters never change, so the split pattern is built once instead of
# once per file.
delimiters = "？","?", "！","!",".","。","；",";","\n","\r","·","•"," -","*"
regexPattern = '|'.join(map(re.escape, delimiters))

for root, dirs, files in os.walk(r"../JD_files"):
    for file in files:
        ind += 1

        # `with` closes the handle even if parsing below raises.
        with open(os.path.join(root, file), encoding='utf-8') as my_file:
            my_string = my_file.readlines()

        # Drop the first line, strip URLs (including bare "www." links).
        doc_sentences = '\n'.join(my_string[1:])
        doc_sentences = remove_urls(doc_sentences)

        doc_sentences = doc_sentences.replace('www.', 'https://www.')
        doc_sentences = remove_urls(doc_sentences)

        # Remove abbreviation dots so they are not taken as sentence ends.
        doc_sentences = doc_sentences.replace('e.g.', 'like').replace('i.e.', 'like').replace('.)', ')')

        sentences = [str(sentence).strip() for sentence in re.split(regexPattern, doc_sentences) if len(str(sentence).strip()) > 0]

        for string in sentences:
            # Drop a leading bullet character left over from the split.
            if (string[0] == '-' or string[0] == '*') and len(string) > 1:
                string = string[1:]

            doc = nlp(str(string))

            # Root token of every SKILL entity, plus root token -> entity span.
            my_chunks_list = []
            roor_chunks_dict = {}
            for chunk in doc.ents:
                if chunk.root.ent_type_ == 'SKILL':  # and chunk.text.lower() in skill_set_lower
                    my_chunks_list.append(chunk.root)
                    roor_chunks_dict[chunk.root] = chunk

            # A link needs at least two skills in the sentence.
            if len(my_chunks_list) <= 1:
                continue

            Total_List = []
            for word in my_chunks_list:
                if word.dep_ == 'pobj' and word.head.dep_ == 'prep':
                    # prep --> pobj
                    root_word = word.head.head
                elif word.dep_ == 'appos' and (if_doc_id_is_left_punct(doc, word.idx-1) or if_doc_id_is_right_punct(doc, word.idx+1)):
                    # appos
                    # NOTE(review): word.idx+1 is the character offset right
                    # after the token's first character, so the right-bracket
                    # test can only succeed for one-character tokens - confirm
                    # whether word.idx + len(word) was intended.
                    root_word = word.head
                else:
                    continue

                if root_word in my_chunks_list:
                    Total_List.append((word, root_word))
                elif 'N' in word.tag_:
                    # Parent is not a skill itself: fall back to skills that
                    # modify it as a compound.
                    for tt in my_chunks_list:
                        if tt.head == root_word and (tt.dep_ == 'compound'):
                            Total_List.append((word, tt))

            # Propagate each link to the skills chained to its child.
            Total_List_expend = []
            for tup in Total_List:
                for tem in my_chunks_list:
                    if is_conj_or_appos(tem, tup[0]):
                        Total_List_expend.append((tem, tup[1]))

            Total_List = Total_List + Total_List_expend

            for tup in Total_List:
                child = str(roor_chunks_dict[tup[0]])
                parent = str(roor_chunks_dict[tup[1]])
                child_l = child.lower()
                parent_l = parent.lower()

                # Both ends must be known skills.
                if child_l not in skill_set_lower or parent_l not in skill_set_lower:
                    continue
                # Education terms are not accepted as parents.
                if any(marker in parent_l for marker in ('degree', 'master', 'bachelor', 'phd')):
                    continue
                if parent_l == 'ms' or parent_l == 'bs':
                    continue

                link_key = child_l + '-->' + parent_l
                if link_key in total_link_set:
                    continue

                print(child + '-->' + parent)
                total_link_set.add(link_key)
                total_link_list.append((child, parent))

        # Progress marker every 500 files.
        if ind % 500 == 0:
            print('=' * 120, ind)
            

classification-->predictive modeling
regression-->predictive modeling
Economics-->Mathematics
Hadoop-->algorithms
Spark-->algorithms
HIVE-->Hadoop
Pig-->Hadoop
R-->machine learning
SAS-->machine learning
MATLAB-->machine learning
Python-->machine learning
structured and unstructured data-->machine learning
Hadoop-->Big data technologies
MapReduce-->Big data technologies
Hive-->Big data technologies
Pig-->Big data technologies
AI-->Artificial Intelligence
Spark-->Machine Learning
Tableau-->data visualization
Power BI-->data visualization
SQL-->data visualization
structured and unstructured data-->data visualization
Python-->programming skills
R-->programming languages
Python-->programming languages
SQL-->programming languages
R-->scripting languages
Python-->scripting languages
ML-->Machine learning
Python-->big data technologies
R-->big data technologies
regression-->data mining
Python-->programming language
Hadoop-->Big Data
Spark-->Big Data
Kafka-->Big Data
programming language-->mac

Python-->Spark
Pandas-->data analysis
scikit-learn-->data analysis
Spark-->analysis
SAS-->analysis
Tensorflow-->analysis
Hadoop-->analysis
classification-->machine learning models
clustering-->machine learning models
forecasting-->machine learning models
time series analysis-->machine learning models
Matplotlib-->visualization tools
AWS-->big data technologies
R-->programming skills
data structures-->data mining
predictive modeling-->data mining
design-->quantitative field
implementation-->quantitative field
relational databases-->design
regression-->machine learning
data mining-->big data
statistical analysis-->big data
predictive modeling-->big data
data manipulation-->big data
NLP-->Data Scientist
R-->databases
Python-->databases
MySQL-->databases
Java-->software development
R-->programming language
data structures-->Computer Science
problem solving-->Computer Science
CA-->Data Scientist
communication skills-->Data Scientist
data mining-->data analytics
statistical analysis-->data a

implementation-->AI
analysis-->quantitative field
anomaly detection-->Automation
data-->regression
S3-->Data warehouse
Pandas-->data science
Numpy-->data science
R-->Keras
SAS-->Keras
Tableau-->Keras
Python-->Keras
Perl-->Keras
Hadoop-->large datasets
data mining-->large datasets
Pandas-->data visualization tools
programming languages-->Machine Learning
SQL-->big data technologies
MySQL-->big data technologies
Git-->big data technologies
A/B testing-->experimental design
statistical analysis-->R
predictive modeling-->machine learning
anomaly detection-->machine learning
data visualization-->data analytics
data structures-->SQL
Hadoop-->data manipulation
programming-->data manipulation
Python-->data manipulation
JavaScript-->data manipulation
ETL-->data manipulation
R-->ML
TensorFlow-->machine learning techniques
Pandas-->Hive
Scikit-learn-->Hive
data analytics-->data engineers
reports-->data engineers
SPSS-->large datasets
SAS-->large datasets
Excel-->large datasets
Python-->Statistics

Anomaly Detection-->Data Science
SPSS-->modeling
Tableau-->modeling
data visualization tools-->MatLab
dashboards-->data visualization
decision making-->analytics
statistical analysis-->analytics
Tableau-->analytics
design-->analytics
data analysis-->SAS
HDFS-->Python
patterns-->Data Mining
HBase-->Hadoop
Tableau-->data analytics
Statistical-->SAS
statistics-->Matlab
NoSQL-->Database
Data Science-->Analytics
data warehouse-->data modeling
SQL-->data modeling
Machine Learning-->statistical
Artificial Intelligence-->statistical
data analysis-->SPSS
design-->automation
MongoDB-->SQL
data models-->data modeling
A/B testing-->data pipelines
model development-->data pipelines
time series analysis-->statistical techniques
Extract-->ETL
ML-->data analysis
NLP-->data analysis
machine learning-->data management
SQL-->data analysis
Google Analytics-->Natural language processing
Google Analytics-->text mining
Tableau-->large datasets
Azure-->AWS
prediction-->machine learning techniques
classificati

HBase-->big data technologies
Unix-->programming languages
data scientist-->Reporting
statistical analysis-->statistical methods
R-->statistical methods
SPSS-->statistical methods
Statistics-->Machine Learning
ML-->Analytics
NLP-->AI
Text Mining-->AI
machine learning models-->Feature engineering
data analysis-->statistical modeling
SAS-->scripting languages
statistical software-->relational databases
Hive-->data processing
data science-->Machine Learning
identify-->data mining
analysis-->Building
Python-->Advanced analytics
R-->Advanced analytics
SAS-->Advanced analytics
SPSS-->Advanced analytics
data analysis-->programming languages
NumPy-->R
data mining-->predictive analytics
MapReduce-->Big Data
PIG-->Big Data
JavaScript-->R
development-->databases
analysis-->databases
Redshift-->big data technologies
Analytics-->Data Science
finance-->quantitative discipline
analytical-->Data Scientist
ETL-->automation
C/C++-->scripting languages
statistical modeling-->scripting languages
machine l

Machine Learning-->AI
Scikit-Learn-->machine learning
Data Warehouse-->Agile
Power BI-->Data Scientist
computer science-->R
numPy-->Statistical modeling
Matlab-->Statistical modeling
econometrics-->R
mathematics-->R
machine learning-->clustering
AI-->advanced analytics
git-->software development
Big Data technologies-->engineering
natural language processing-->Python
AI-->Python
clustering-->statistical techniques
ML-->quantitative discipline
relational databases-->A/B testing
SQL-->A/B testing
R-->A/B testing
Python-->A/B testing
data modeling-->SAS
Math-->R
quantitative discipline-->R
Linux-->engineering
programming languages-->engineering
C++-->engineering
analysis-->Statistics
Matlab-->machine learning models
SAS-->machine learning models
marketing-->mathematics
computer science-->mathematics
engineering-->mathematics
classification-->Building
regression-->Building
big data technologies-->Hadoop
big data technologies-->big data technologies
Tableau-->R
natural language processing--

In [21]:
# Edge table: one row per extracted (child, parent) pair.
# The columns are named in the constructor so that 'node1' / 'node2' exist
# even when total_link_list is empty (renaming 0/1 afterwards would leave an
# empty frame with no columns at all).
link_df = pd.DataFrame( total_link_list, columns=['node1', 'node2'] )
link_df = link_df.drop_duplicates()
# Second name bound to the same frame (not a copy).
link_df_temp = link_df
# link_df = link_df.sample(500)
link_df

Unnamed: 0,node1,node2
0,classification,predictive modeling
1,regression,predictive modeling
2,Economics,Mathematics
3,Hadoop,algorithms
4,Spark,algorithms
...,...,...
1512,Python,technology
1513,SQL,technology
1514,ML,data science
1515,SQL,modeling


# Skill Graph

In [22]:
# Shallow copy of skill_set taken before the next cell overwrites its values
# in place with node sizes.
temp_skill_set = skill_set.copy()

In [23]:
# Rescale every value in skill_set, in place, to a node size in [15, 100]
# using min-max scaling.
#
# The previous version started the minimum search at 0, so for positive counts
# the minimum was never updated and the scale was anchored at 0 instead of the
# smallest count. Using the true minimum also makes the cell safe to re-run:
# values that are already in [15, 100] map back to themselves.
# `default=0` keeps both names defined when skill_set is empty.
min_size = min(skill_set.values(), default=0)
max_size = max(skill_set.values(), default=0)
size_span = max_size - min_size

for k, v in skill_set.items():
    if size_span:
        skill_set[k] = 15 + 85 * ((v - min_size) / size_span)
    else:
        # All values are equal: avoid dividing by zero and use the smallest size.
        skill_set[k] = 15.0

In [25]:
# NOTE(review): `patch`, `assert_equal` and `assert_in` are not used in this
# cell; they are kept in case other cells rely on them.
from unittest.mock import patch
from nose.tools import assert_equal, assert_in
from pyecharts import options as opts
from pyecharts.charts import Graph

# Build the node and edge lists for the graph from the link table.
nodes = []
links = []

# Node names in first-seen order, plus a set for O(1) "already added?" checks
# (the list alone made every lookup a linear scan).
nodes_set = []
seen_nodes = set()

for link in link_df.itertuples(index=False):
    # Skip self-links.
    if link.node1 == link.node2:
        continue

    for node_name in (link.node1, link.node2):
        if node_name not in seen_nodes:
            seen_nodes.add(node_name)
            nodes_set.append(node_name)
            # Node size comes from the rescaled values in skill_set.
            nodes.append({'name': node_name, "symbolSize": skill_set[node_name]})

    # Edges point from the parent (node2) to the child (node1).
    links.append({"source": link.node2, "target": link.node1})

Graph().add(
    "",
    nodes,
    links,
    repulsion=8000,
#     linestyle_opts=opts.LineStyleOpts(width=2, curve=0.1, opacity=0.9),
    linestyle_opts=opts.LineStyleOpts(width=0.5, curve=0.3, opacity=0.7),
    label_opts=opts.LabelOpts( ),
    categories=None,
    is_focusnode=True,
#     layout="none",
    is_roam=True

).set_global_opts(title_opts=opts.TitleOpts(title="")).render_notebook()

  super().__init__(init_opts=init_opts)
