# Feature Engineering

### Feature Extraction Methods:

- **Bag of Words** `sklearn.feature_extraction.text.CountVectorizer`
- **TF-IDF**: Term Frequency; Inverse Document Frequency

### Features:
- `requirements`
- `skills`

In [1]:
from imports import *
from prepare import prep_data, basic_clean, lemmatize, remove_stopwords, split
from pprint import pprint
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Getting data
df = pd.read_csv('data_science.csv')

In [3]:
# Initial cleaning
df = prep_data(df)

In [4]:
def clean(text):
    return remove_stopwords(lemmatize(basic_clean(text)), extra_words = ['experience', 'ability', 'skill'])

In [5]:
def clean_skills(text):
    return remove_stopwords(lemmatize(basic_clean(text)), extra_words = ['year', 'experience', 'ability', 'skill', 'programming', 'language'])

In [6]:
df.requirements = df.requirements.apply(clean)

In [7]:
df.requirements

0      bachelor degree minimum year 4 year demonstrates thorough andor proven record success ...
1      identify execute predictive model help internal team masterworks understand artist mar...
2      selfmotivated highly disciplined passionate discovering right therapeutic right patien...
3      2 year python java objectoriented programming language handson understanding objectori...
4      2 year work quantitative analysis tackle business problem strong analytical including ...
                                                 ...                                            
209    participate data team ideation session define analytics reporting project solution wor...
210    demonstrates advanced mastery knowledge data system various analytical tool identify c...
211    associate degree related field two year related andor training equivalent combination ...
212    maintain existing feature troubleshoot bug resolve adhoc request provide support enhan...
213    requires training field

In [8]:
df.skills = df.skills.apply(clean_skills)

In [9]:
train, validate, test = split(df)
train.shape, validate.shape, test.shape

((119, 17), (52, 17), (43, 17))

***

## Bag of Words

In [11]:
cv = CountVectorizer()
bag_of_words = cv.fit_transform(train.requirements)

In [12]:
bag_of_words

<119x2145 sparse matrix of type '<class 'numpy.int64'>'
	with 9691 stored elements in Compressed Sparse Row format>

**Sparse Matrix**

In [13]:
bag_of_words.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [14]:
cv.get_feature_names_out()

array(['10', '100', '10sql', ..., 'youre', 'youve', 'zero'], dtype=object)

In [15]:
cv.vocabulary_

{'bachelor': 195,
 'degree': 530,
 'relevant': 1646,
 'field': 798,
 'year': 2139,
 'work': 2116,
 'strong': 1895,
 'handson': 912,
 'sql': 1849,
 'data': 492,
 'management': 1188,
 'database': 493,
 'design': 555,
 'including': 976,
 'ssa': 1857,
 'ssrs': 1859,
 'power': 1489,
 'bi': 224,
 'excellent': 741,
 'problem': 1538,
 'solving': 1820,
 'communication': 365,
 'dbaedaetl': 507,
 'technical': 1959,
 'desire': 558,
 'learn': 1122,
 'new': 1316,
 'software': 1812,
 'method': 1239,
 'familiar': 784,
 'cloud': 332,
 'service': 1770,
 'offering': 1356,
 'azure': 191,
 'aws': 189,
 'healthcare': 917,
 'pharmacy': 1451,
 'managed': 1187,
 'care': 280,
 'health': 916,
 'plan': 1464,
 'hospital': 938,
 'pharmaceutical': 1450,
 'previous': 1520,
 'engineer': 689,
 'similar': 1789,
 'role': 1697,
 'numerical': 1340,
 'analytical': 101,
 'statistic': 1871,
 'statistical': 1872,
 'modeling': 1266,
 'big': 226,
 'application': 125,
 'process': 1542,
 'dashboarding': 490,
 'powerbisql': 1491,
 

In [17]:
pprint(train.requirements)
# Taking a look at the bag of words transformation for education and diagnostics.
# In practice this is not necesssary and the resulting data might be to big to be reasonably helpful.
bow = pd.DataFrame(bag_of_words.todense(), columns=cv.get_feature_names_out())

85     bachelor's degree relevant field 5 year relevant work strong handson sql data manageme...
33     bachelor's degree technical discipline computer science electrical engineering chemica...
151    b computer science quantitative field eg applied math statistic proficiency data model...
173    master's degree possessing stated degree preferred comcast also may consider applicant...
202    create report data visualization guide decisionmaking across purchasing team drive pro...
140    bachelor degree business analytics data science mathematics statistic quantitative dis...
9      5 year relevant work data modelinganalytics data management data architecture data eng...
170    good balance business acumen technical knowledge must connect dot different requiremen...
176    4 year developing machine learning solution deployed production environment 6 year sof...
206    3 year data analyst sql script snowflake tableaubi tool python preferred aws familiari...
40     5 year professional sof

In [18]:
bow

Unnamed: 0,10,100,10sql,12,1st,1year,2023,2024,23,25,...,xml,yahoo,yammer,yarn,year,yearspython,youll,youre,youve,zero
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,3,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [19]:
bow.apply(lambda row: row / row.sum(), axis=1)

Unnamed: 0,10,100,10sql,12,1st,1year,2023,2024,23,25,...,xml,yahoo,yammer,yarn,year,yearspython,youll,youre,youve,zero
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.012821,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.027027,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.004975,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00885,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.007299,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.022472,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.020408,0.0,0.0,0.0,0.0,0.0


## TF-IDF

In [9]:
tfidf = TfidfVectorizer()
bag_of_words = tfidf.fit_transform(train.requirements)
pd.DataFrame(bag_of_words.todense(), columns=tfidf.get_feature_names_out())

Unnamed: 0,10,100,10sql,12,1st,1year,2023,2024,23,25,...,xml,yahoo,yammer,yarn,year,yearspython,youll,youre,youve,zero
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.043838,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.119235,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.260086,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.075849,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.029824,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.030011,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.075418,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.048174,0.0,0.0,0.0,0.0,0.0


In [10]:
pprint(train.requirements)
pd.Series(dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))).sort_values()

85     bachelor's degree relevant field 5 year relevant work strong handson sql data manageme...
33     bachelor's degree technical discipline computer science electrical engineering chemica...
151    b computer science quantitative field eg applied math statistic proficiency data model...
173    master's degree possessing stated degree preferred comcast also may consider applicant...
202    create report data visualization guide decisionmaking across purchasing team drive pro...
140    bachelor degree business analytics data science mathematics statistic quantitative dis...
9      5 year relevant work data modelinganalytics data management data architecture data eng...
170    good balance business acumen technical knowledge must connect dot different requiremen...
176    4 year developing machine learning solution deployed production environment 6 year sof...
206    3 year data analyst sql script snowflake tableaubi tool python preferred aws familiari...
40     5 year professional sof

data            1.042560
science         1.244197
year            1.254892
python          1.265703
language        1.265703
                  ...   
hundred         5.094345
humanity        5.094345
humancentric    5.094345
huge            5.094345
zero            5.094345
Length: 2145, dtype: float64

## Bag of Ngrams

Setting `ngram_range` parameter for `CountVectorizer` and `TfidVectorizer`

In [14]:
cv = CountVectorizer(ngram_range=(2, 2))
bag_of_words = cv.fit_transform(train.requirements)

pprint(train.requirements)

85     bachelor's degree relevant field 5 year relevant work strong handson sql data manageme...
33     bachelor's degree technical discipline computer science electrical engineering chemica...
151    b computer science quantitative field eg applied math statistic proficiency data model...
173    master's degree possessing stated degree preferred comcast also may consider applicant...
202    create report data visualization guide decisionmaking across purchasing team drive pro...
140    bachelor degree business analytics data science mathematics statistic quantitative dis...
9      5 year relevant work data modelinganalytics data management data architecture data eng...
170    good balance business acumen technical knowledge must connect dot different requiremen...
176    4 year developing machine learning solution deployed production environment 6 year sof...
206    3 year data analyst sql script snowflake tableaubi tool python preferred aws familiari...
40     5 year professional sof

In [15]:
pd.DataFrame(bag_of_words.todense(), columns=cv.get_feature_names_out())

Unnamed: 0,10 year,100 million,10sql statistical,12 year,1st 3rd,1year data,2023 august,2024 knowledge,23 year,25 year,...,year using,year work,year working,year wrangling,year year,yearspython programming,youll bring,youre compelling,youve created,zero four
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


***

## Modeling

#### Spliting X and Y

In [10]:
train.columns

Index(['company', 'location', 'mode', 'type', 'level', 'role', 'requirements',
       'skills', 'associate', 'entry', 'mid_senior', 'analyst', 'engineer',
       'maganer', 'scientist', 'edu_b_dmnt', 'label'],
      dtype='object')

In [11]:
cols = ['requirements', 'skills', 'associate', 'entry', 'analyst', 'engineer','scientist']
X_train = train[cols]
y_train = train.label
X_validate = validate[cols]
y_validate = validate.label
X_test = test[cols]
y_test = test.label

In [12]:
X_train.head()

Unnamed: 0,requirements,skills,associate,entry,analyst,engineer,scientist
85,bachelor's degree relevant field 5 year relevant work strong handson sql data manageme...,sql python microsoft excel microsoft powerpoint r public speaking matlab c microsoft o...,0,1,0,1,0
33,bachelor's degree technical discipline computer science electrical engineering chemica...,sql python machine learning microsoft excel leadership data science c panda software c...,0,1,0,1,0
151,b computer science quantitative field eg applied math statistic proficiency data model...,sql python artificial intelligence ai machine learning tableau r statistical modeling ...,0,0,0,1,0
173,master's degree possessing stated degree preferred comcast also may consider applicant...,python machine learning sql r statistic data analytics leadership deep learning panda ...,0,1,0,0,1
202,create report data visualization guide decisionmaking across purchasing team drive pro...,sql python machine learning javascript microsoft sql server cascading style sheet cs d...,0,1,1,0,0


#### Baseline

In [37]:
train.label.mode()

0    h
Name: label, dtype: object

In [34]:
train['baseline_pred'] = 'h'

In [38]:
baseline_accuracy = (train.label == train.baseline_pred).mean()
print(f'baseline accuracy: {baseline_accuracy:.2%}')

baseline accuracy: 91.60%


#### Decision Tree (bag of words, bigram)

In [13]:
# import classifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix

In [14]:
cv = CountVectorizer(ngram_range=(2, 2))
X_train.requirements = cv.fit_transform(X_train.requirements)
X_train.skills = cv.fit_transform(X_train.skills)

In [15]:
X_validate.requirements = cv.transform(X_validate.requirements)
X_validate.skills = cv.transform(X_validate.skills)

In [16]:
X_train.head()

Unnamed: 0,requirements,skills,associate,entry,analyst,engineer,scientist
85,bachelor's degree relevant field 5 year relevant work strong handson sql data manageme...,sql python microsoft excel microsoft powerpoint r public speaking matlab c microsoft o...,0,1,0,1,0
33,bachelor's degree technical discipline computer science electrical engineering chemica...,sql python machine learning microsoft excel leadership data science c panda software c...,0,1,0,1,0
151,b computer science quantitative field eg applied math statistic proficiency data model...,sql python artificial intelligence ai machine learning tableau r statistical modeling ...,0,0,0,1,0
173,master's degree possessing stated degree preferred comcast also may consider applicant...,python machine learning sql r statistic data analytics leadership deep learning panda ...,0,1,0,0,1
202,create report data visualization guide decisionmaking across purchasing team drive pro...,sql python machine learning javascript microsoft sql server cascading style sheet cs d...,0,1,1,0,0


In [17]:
# max depth vs. model score, comparing training & validate datasets
metrics = []

for i in range(1, 25):
    # Make the model
    tree = DecisionTreeClassifier(max_depth=i, random_state=123)

    # Fit the model (on train and only train)
    tree = tree.fit(X_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
    in_sample_accuracy = tree.score(X_train, y_train)
    
    out_of_sample_accuracy = tree.score(X_validate, y_validate)

    output = {
        "max_depth": i,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy
    }
    
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df.sort_values(by = ['validate_accuracy', 'difference'], ascending = [False, True])

# Visualizing model performance as we change the max depth, check if there's overfitting
plt.figure(figsize=(12, 6))
plt.plot(df.max_depth, df.train_accuracy, marker = 'o')
plt.plot(df.max_depth, df.validate_accuracy, marker = 'o')
plt.title('Overfitting Occurs at Higher Values for Max Depth')
plt.xlabel('Max Depth')
plt.ylabel('Accuracy')
plt.show()

ValueError: could not convert string to float: "bachelor's degree relevant field 5 year relevant work strong handson sql data management database design including ssa ssrs power bi excellent problem solving communication dbaedaetl technical desire learn new software method familiar cloud service offering azure aws healthcare pharmacy managed care health plan hospital pharmaceutical previous data engineer similar role excellent numerical analytical familiar statistic statistical modeling big data application process data dashboarding powerbisql python programming language microsoft excel microsoft powerpoint r programming language public speaking matlab c microsoft office latex"