# Modeling

- **Bag of Words**: raw token counts per document, via `sklearn.feature_extraction.text.CountVectorizer`
- **TF-IDF**: Term Frequency × Inverse Document Frequency

In [16]:
from imports import *
from prepare import prep_data, basic_clean, lemmatize, remove_stopwords, split
from pprint import pprint

In [2]:
# Getting data
# Read the raw dataset from a CSV file in the working directory into a DataFrame.
df = pd.read_csv('data_science.csv')

In [3]:
# Initial cleaning
# prep_data is imported from the local `prepare` module; its steps are not visible here.
# NOTE(review): df is reassigned in place, so re-running this cell feeds
# already-prepped data back through prep_data -- confirm that is safe.
df = prep_data(df)

In [6]:
def clean(text):
    """Clean one text value for modeling.

    The text is passed through ``basic_clean`` and then ``lemmatize`` (both
    imported from ``prepare``). The result is handed to ``remove_stopwords``
    with ``extra_words`` set to 'experience', 'ability' and 'skill'
    (presumably extra stopwords to strip -- confirm against ``prepare``).
    """
    lemmas = lemmatize(basic_clean(text))
    return remove_stopwords(lemmas, extra_words=['experience', 'ability', 'skill'])

In [5]:
def clean_skills(text):
    """Clean one text value using the longer skills-specific word list.

    Same pipeline as ``clean`` (``basic_clean`` -> ``lemmatize`` ->
    ``remove_stopwords``), but ``extra_words`` additionally contains 'year',
    'programming' and 'language' (presumably extra stopwords to strip --
    confirm against ``prepare``).
    """
    skill_extra_words = ['year', 'experience', 'ability', 'skill', 'programming', 'language']
    lemmas = lemmatize(basic_clean(text))
    return remove_stopwords(lemmas, extra_words=skill_extra_words)

In [7]:
# Overwrite the requirements column with its cleaned text.
df['requirements'] = df['requirements'].apply(clean)

In [8]:
# Overwrite the skills column with its cleaned text (skills-specific word list).
df['skills'] = df['skills'].apply(clean_skills)

In [9]:
# Partition the cleaned frame into three subsets and show each subset's shape.
train, validate, test = split(df)
tuple(subset.shape for subset in (train, validate, test))

((119, 13), (52, 13), (43, 13))

***

## Bag of Words

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# Default CountVectorizer settings. The vocabulary is learned from the
# training split only, and the same call encodes that split as token counts.
cv = CountVectorizer()
bag_of_words = cv.fit_transform(train.requirements)

In [12]:
# Show the sparse matrix summary: shape, dtype and number of stored elements.
bag_of_words

<119x2145 sparse matrix of type '<class 'numpy.int64'>'
	with 9691 stored elements in Compressed Sparse Row format>

**Sparse Matrix**

In [13]:
# Expand the sparse counts into a dense matrix for inspection; most entries are zero.
bag_of_words.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [14]:
# Array of the learned vocabulary terms, one per column of the bag-of-words matrix.
cv.get_feature_names_out()

array(['10', '100', '10sql', ..., 'youre', 'youve', 'zero'], dtype=object)

In [15]:
# Dict mapping each vocabulary term to its integer feature (column) index.
cv.vocabulary_

{'bachelor': 195,
 'degree': 530,
 'relevant': 1646,
 'field': 798,
 'year': 2139,
 'work': 2116,
 'strong': 1895,
 'handson': 912,
 'sql': 1849,
 'data': 492,
 'management': 1188,
 'database': 493,
 'design': 555,
 'including': 976,
 'ssa': 1857,
 'ssrs': 1859,
 'power': 1489,
 'bi': 224,
 'excellent': 741,
 'problem': 1538,
 'solving': 1820,
 'communication': 365,
 'dbaedaetl': 507,
 'technical': 1959,
 'desire': 558,
 'learn': 1122,
 'new': 1316,
 'software': 1812,
 'method': 1239,
 'familiar': 784,
 'cloud': 332,
 'service': 1770,
 'offering': 1356,
 'azure': 191,
 'aws': 189,
 'healthcare': 917,
 'pharmacy': 1451,
 'managed': 1187,
 'care': 280,
 'health': 916,
 'plan': 1464,
 'hospital': 938,
 'pharmaceutical': 1450,
 'previous': 1520,
 'engineer': 689,
 'similar': 1789,
 'role': 1697,
 'numerical': 1340,
 'analytical': 101,
 'statistic': 1871,
 'statistical': 1872,
 'modeling': 1266,
 'big': 226,
 'application': 125,
 'process': 1542,
 'dashboarding': 490,
 'powerbisql': 1491,
 

In [17]:
# Print the cleaned training text that was fed to the vectorizer.
pprint(train.requirements)
# Dense DataFrame view of the bag-of-words matrix, for education and diagnostics only.
# In practice this is not necessary, and the resulting frame may be too big to be useful.
# toarray() yields a plain ndarray; todense() returns np.matrix, which NumPy discourages.
bow = pd.DataFrame(bag_of_words.toarray(), columns=cv.get_feature_names_out())

85     bachelor's degree relevant field 5 year relevant work strong handson sql data manageme...
33     bachelor's degree technical discipline computer science electrical engineering chemica...
151    b computer science quantitative field eg applied math statistic proficiency data model...
173    master's degree possessing stated degree preferred comcast also may consider applicant...
202    create report data visualization guide decisionmaking across purchasing team drive pro...
140    bachelor degree business analytics data science mathematics statistic quantitative dis...
9      5 year relevant work data modelinganalytics data management data architecture data eng...
170    good balance business acumen technical knowledge must connect dot different requiremen...
176    4 year developing machine learning solution deployed production environment 6 year sof...
206    3 year data analyst sql script snowflake tableaubi tool python preferred aws familiari...
40     5 year professional sof

In [18]:
# Display the count frame: one row per training document, one column per vocabulary term.
bow

Unnamed: 0,10,100,10sql,12,1st,1year,2023,2024,23,25,...,xml,yahoo,yammer,yarn,year,yearspython,youll,youre,youve,zero
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,3,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [19]:
# Term frequency: divide each document's counts by that document's total count.
# Vectorized replacement for a row-wise apply(lambda row: row / row.sum(), axis=1);
# a row whose counts sum to zero still comes out as NaN, as before.
bow.div(bow.sum(axis=1), axis=0)

Unnamed: 0,10,100,10sql,12,1st,1year,2023,2024,23,25,...,xml,yahoo,yammer,yarn,year,yearspython,youll,youre,youve,zero
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.012821,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.027027,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.004975,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00885,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.007299,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.022472,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.020408,0.0,0.0,0.0,0.0,0.0
