In [1]:
from pathlib import Path
import ast
import numpy as np
import pandas as pd
import ast
from utils.tools import (
    get_word_tokens,
    get_technical_tokens, 
    get_softskills_tokens,
    get_educational_tokens
)


In [2]:
# Relative path to the input CSV; it is resolved against the current working directory.
data_path = Path("gsearch_jobs.csv")

In [3]:
# Load the CSV, using its first column as the row index.
raw_data = pd.read_csv(filepath_or_buffer=data_path, index_col=0)

In [4]:
# Work on a copy so the freshly loaded raw_data frame is left untouched by later edits.
data = raw_data.copy()

In [5]:
# Overview of the loaded columns: dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 54261 entries, 0 to 54260
Data columns (total 26 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   index                54261 non-null  int64  
 1   title                54261 non-null  object 
 2   company_name         54261 non-null  object 
 3   location             54224 non-null  object 
 4   via                  54252 non-null  object 
 5   description          54261 non-null  object 
 6   extensions           54261 non-null  object 
 7   job_id               54261 non-null  object 
 8   thumbnail            34147 non-null  object 
 9   posted_at            54224 non-null  object 
 10  schedule_type        54031 non-null  object 
 11  work_from_home       25651 non-null  object 
 12  salary               9299 non-null   object 
 13  search_term          54261 non-null  object 
 14  date_time            54261 non-null  object 
 15  search_location      54261 non-null  obje

In [6]:
# Columns removed from the working frame before analysis.
drop_cols = [
    "index",
    "thumbnail",
    "posted_at",
    "job_id",
    "search_term",
    "commute_time",
    "search_location",
    "description_tokens",
]

In [7]:
# Derive the trimmed frame from raw_data rather than from data itself, so this
# cell can be re-run without a KeyError for columns that are already gone.
# drop() returns a new frame, so raw_data is not modified.
data = raw_data.drop(columns=drop_cols)

In [8]:
# Exploratory: split the stringified list on every comma into separate columns.
# NOTE(review): this also splits inside values that contain commas themselves
# (e.g. '76,798–130,764 a year'), and `ext` is reassigned in a later cell.
ext = data["extensions"].str.split(pat=",",expand=True)

In [9]:
# Rows whose extensions text contains "No degree mentioned" (case-insensitive,
# literal match), with that text split on commas.
no_degree_mask = data["extensions"].str.contains(pat="No degree mentioned", case=False, regex=False)
data.loc[no_degree_mask, "extensions"].str.split(pat=",")

3        [['12 hours ago',  '15–25 an hour',  'Work fro...
6        [['17 hours ago',  'Work from home',  'Full-ti...
13       [['7 hours ago',  'Full-time',  'No degree men...
16       [['12 hours ago',  'Full-time',  'No degree me...
19       [['17 hours ago',  'Work from home',  'Contrac...
                               ...                        
54217    [['21 hours ago',  'Full-time',  'No degree me...
54218    [['21 hours ago',  'Full-time',  'No degree me...
54245    [['22 hours ago',  'Full-time',  'No degree me...
54254    [['23 hours ago',  'Full-time',  'No degree me...
54255    [['23 hours ago',  'Full-time',  'No degree me...
Name: extensions, Length: 16569, dtype: object

In [10]:
# Position of the row whose extensions string splits into the most pieces.
pieces_per_row = data["extensions"].str.split(",").apply(len)
pieces_per_row.argmax()

np.int64(3386)

In [11]:
# Exploratory: split on the "'," sequence instead, so commas inside a value
# (such as '76,798–130,764 a year') stay intact.
data["extensions"].str.split("',")

0        [['15 hours ago,  '101K–143K a year,  'Work fr...
1        [['12 hours ago,  'Full-time,  'Health insuran...
2                          [['18 hours ago,  'Full-time']]
3        [['12 hours ago,  '15–25 an hour,  'Work from ...
4        [['7 hours ago,  '90K–110K a year,  'Contracto...
                               ...                        
54256    [['23 hours ago,  '76,798–130,764 a year,  'Fu...
54257    [['23 hours ago,  '106,916–182,047 a year,  'F...
54258    [['23 hours ago,  '106,916–182,047 a year,  'F...
54259    [['23 hours ago,  '106,916–182,047 a year,  'F...
54260    [['24 hours ago,  '105,850–158,780 a year,  'F...
Name: extensions, Length: 54261, dtype: object

In [12]:
# Parse each stringified list into a real Python list. literal_eval only
# accepts Python literals, so no arbitrary code is evaluated.
ext = data["extensions"].apply(ast.literal_eval)

In [13]:
def get_first_element(lst):
    """Return the item at index 1 of ``lst``, or ``pd.NA`` when there is none.

    Despite the name, this reads the second item (index 1), not index 0.
    Sequences with zero or one item yield ``pd.NA``.
    """
    return lst[1] if len(lst) > 1 else pd.NA

In [14]:
# Rows of ext for which get_first_element yields NA.
second_item = ext.apply(get_first_element)
ext[second_item.isna()]

786       [8 hours ago]
884       [6 hours ago]
885       [6 hours ago]
1628     [22 hours ago]
1668     [24 hours ago]
              ...      
52785    [17 hours ago]
52786    [17 hours ago]
52787    [17 hours ago]
52788    [17 hours ago]
54174    [18 hours ago]
Name: extensions, Length: 102, dtype: object

In [15]:
# Number of rows with no salary value.
data["salary"].isna().sum()

np.int64(44962)

In [16]:
# Column overview again, now that the columns listed in drop_cols have been removed.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 54261 entries, 0 to 54260
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   title                54261 non-null  object 
 1   company_name         54261 non-null  object 
 2   location             54224 non-null  object 
 3   via                  54252 non-null  object 
 4   description          54261 non-null  object 
 5   extensions           54261 non-null  object 
 6   schedule_type        54031 non-null  object 
 7   work_from_home       25651 non-null  object 
 8   salary               9299 non-null   object 
 9   date_time            54261 non-null  object 
 10  salary_pay           9299 non-null   object 
 11  salary_rate          9299 non-null   object 
 12  salary_avg           9299 non-null   float64
 13  salary_min           8744 non-null   float64
 14  salary_max           8744 non-null   float64
 15  salary_hourly        5655 non-null   floa

In [17]:
# Build description_tokens by applying get_word_tokens to each description
# (the column of the same name that came with the CSV is listed in drop_cols).
data["description_tokens"] = data["description"].apply(get_word_tokens)

In [18]:
# Derive one token column per category from the description tokens.
token_extractors = {
    "technical_tokens": get_technical_tokens,
    "softskills_tokens": get_softskills_tokens,
    "education_tokens": get_educational_tokens,
}
for column_name, extractor in token_extractors.items():
    data[column_name] = data["description_tokens"].apply(extractor)

In [19]:
# NOTE(review): `series` is not referenced by any later cell shown here; it
# looks like a scratch sample (the head of the column) — confirm before relying on it.
series = data["technical_tokens"].head()

In [20]:
def count_tokens(series) -> pd.DataFrame:
    """Count how often each token occurs across a series of token sequences.

    Parameters
    ----------
    series : pd.Series
        Each entry is a sequence of tokens; missing entries are dropped.

    Returns
    -------
    pd.DataFrame
        One row per distinct token, in the sorted order produced by
        ``np.unique``, with columns ``"tokens"`` and ``"count"`` (int64).
        An empty frame with the same columns is returned when there is
        nothing to count.
    """
    empty_result = pd.DataFrame(
        {"tokens": pd.Series(dtype="object"), "count": pd.Series(dtype="int64")}
    )

    series = series.dropna()

    # np.concatenate raises on an empty sequence of arrays.
    if series.empty:
        return empty_result

    arr = np.concatenate(series.values)

    # Every entry was an empty sequence: nothing to count.
    if arr.size == 0:
        return empty_result

    unique_tokens, counts = np.unique(arr, return_counts=True)

    # Build the frame column by column: stacking tokens and counts into a
    # single array would coerce the counts to strings.
    return pd.DataFrame({"tokens": unique_tokens, "count": counts.astype("int64")})
    

In [21]:
# Frequency table of the technical tokens.
tech = count_tokens(data["technical_tokens"])

In [22]:
# Capitalize the labels for display: first character upper-cased, the rest lower-cased.
tech["tokens"] = tech["tokens"].str.capitalize()

In [23]:
# Make sure the counts are integers before doing arithmetic on them.
tech["count"] = tech["count"].astype("int64")

In [24]:
# Number of rows in data; used as the denominator for the percentage column.
total = len(data)

In [25]:
# Token count as a percentage of the number of rows, rounded to 2 decimals.
# NOTE(review): this reads as "share of rows mentioning the token" only if each
# token is counted at most once per row — confirm in get_technical_tokens.
tech["percent"] = (tech["count"] / total * 100).round(2)

In [33]:
# The 20 technical tokens with the highest percentage.
tech.sort_values("percent", ascending=False).head(20)

Unnamed: 0,tokens,count,percent
201,Sql,28076,51.74
93,Excel,18057,33.28
177,Python,17288,31.86
213,Tableau,15445,28.46
207,Statistics,13439,24.77
50,Dashboards,13210,24.35
180,R,11373,20.96
167,Power bi,10953,20.19
75,Data visualization,10809,19.92
133,Machine learning,5808,10.7


In [27]:
# Frequency table of the soft-skill tokens.
soft = count_tokens(data["softskills_tokens"])

In [28]:
# Make sure the counts are integers so that sorting by count is numeric.
soft["count"] = soft["count"].astype("int64")

In [31]:
# The 15 soft-skill tokens with the highest count.
soft.sort_values("count", ascending=False).head(15)

Unnamed: 0,tokens,count
14,communication skills,15358
49,problem solving,12065
40,leadership,11991
29,decision making,9241
8,business intelligence,9190
31,documentation,8580
5,attention to detail,7878
13,collaboration,7860
24,data management,7564
48,presentation,6742


In [30]:
# Scratch check: the type of pandas' missing-value marker.
type(pd.NA)

pandas._libs.missing.NAType

In [38]:
# Frequency table of the education tokens.
ed = count_tokens(data["education_tokens"])
# Cast to integer, matching the tech and soft tables, so the column sorts and
# sums numerically.
ed["count"] = ed["count"].astype("int64")

In [40]:
# Display the education token counts.
ed

Unnamed: 0,tokens,count
0,accredited university,274
1,advanced degree,1930
2,b a degree,16
3,b s degree,68
4,ba degree,312
5,bachelor,16516
6,bachelor degree,349
7,bachelors,740
8,bachelors degree,1256
9,bs degree,927


In [45]:
# Scratch: build one "arr_<position>" label per element of a.
a = [0, 1, 3]
[f"arr_{position}" for position, _ in enumerate(a)]

['arr_0', 'arr_1', 'arr_2']