In [14]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.util import ngrams
from sklearn.ensemble import RandomForestClassifier  # Example model
from sklearn.model_selection import train_test_split  # Import train_test_split




# Load data
data = pd.read_csv("function_assignment.csv")

In [18]:
data.head(10)

Unnamed: 0,function,job_title,jds
0,Sales/ Business Development/ Account Management,Head - Digital Portfolio,No description available
1,IT/ Information Technology,Software Head,10-15 years of experience in a VFX facility Ad...
2,Software Architecting,iPhone Developers,Candidate Should have Strong OO design and pro...
3,Software Architecting,Html/css Developer Job,Job Description Must have at -least . Years ex...
4,Software Architecting,Asp.net Developer Job,Develop ASP.netweb applicationsPerform unit te...
5,Software Architecting,Android Developers - Json/mobile/xml/oops Job,No description available
6,Project/ Program Management IT,AS\400 Professional,Job Description Job holder may need to work o...
7,Production/ Manufacturing/ Engineering,Process Engineer,No description available
8,Sales/ Business Development/ Account Management,Inside Sales Account Manager - MNC,Job Description Primary roe of Inside sales A...
9,Design,VFX Artist/ Designer,Reporting to: Design Leader PURPOSE: To conve...


In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10145 entries, 0 to 10144
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   function   10145 non-null  object
 1   job_title  10145 non-null  object
 2   jds        10145 non-null  object
dtypes: object(3)
memory usage: 237.9+ KB


In [20]:
# Replace missing values with "No description available"
data["jds"] = data["jds"].fillna("No description available")

In [21]:
data.head(5)

Unnamed: 0,function,job_title,jds
0,Sales/ Business Development/ Account Management,Head - Digital Portfolio,No description available
1,IT/ Information Technology,Software Head,10-15 years of experience in a VFX facility Ad...
2,Software Architecting,iPhone Developers,Candidate Should have Strong OO design and pro...
3,Software Architecting,Html/css Developer Job,Job Description Must have at -least . Years ex...
4,Software Architecting,Asp.net Developer Job,Develop ASP.netweb applicationsPerform unit te...


In [22]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10145 entries, 0 to 10144
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   function   10145 non-null  object
 1   job_title  10145 non-null  object
 2   jds        10145 non-null  object
dtypes: object(3)
memory usage: 237.9+ KB


In [23]:
# Encode categorical variables (example for job function)
data["function"] = pd.Categorical(data["function"])
data["function_encoded"] = data["function"].cat.codes


In [24]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10145 entries, 0 to 10144
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   function          10145 non-null  category
 1   job_title         10145 non-null  object  
 2   jds               10145 non-null  object  
 3   function_encoded  10145 non-null  int16   
dtypes: category(1), int16(1), object(2)
memory usage: 209.1+ KB


In [25]:
# Text cleaning functions
def clean_text(text):
  # Lowercase
  text = text.lower()
  # Remove punctuation
  text = "".join([char for char in text if char.isalnum() or char in " "])
  # Remove stop words (consider creating a custom stop word list if needed)
  stop_words = stopwords.words("english")
  text = " ".join([word for word in text.split() if word not in stop_words])
  # Stemming (optional, replace with lemmatization if preferred)
  stemmer = PorterStemmer()
  text = " ".join([stemmer.stem(word) for word in text.split()])
  return text

# Apply cleaning to jds
data["jds_cleaned"] = data["jds"].apply(clean_text)


In [26]:
data.head(5)

Unnamed: 0,function,job_title,jds,function_encoded,jds_cleaned
0,Sales/ Business Development/ Account Management,Head - Digital Portfolio,No description available,312,descript avail
1,IT/ Information Technology,Software Head,10-15 years of experience in a VFX facility Ad...,158,1015 year experi vfx facil advanc knowledg c p...
2,Software Architecting,iPhone Developers,Candidate Should have Strong OO design and pro...,338,candid strong oo design program skill objectiv...
3,Software Architecting,Html/css Developer Job,Job Description Must have at -least . Years ex...,338,job descript must least year experi depth know...
4,Software Architecting,Asp.net Developer Job,Develop ASP.netweb applicationsPerform unit te...,338,develop aspnetweb applicationsperform unit tes...


In [27]:
# Feature engineering
def prepare_features(data):
    """Prepares features for job title prediction.

    Args:
        data: A DataFrame containing a column named 'jds_cleaned' with cleaned job descriptions.

    Returns:
        A DataFrame containing TF-IDF features, n-gram features, description length, and other features.
    """

    vectorizer = TfidfVectorizer(max_features=5000)
    tfidf_features = vectorizer.fit_transform(data["jds_cleaned"])
    tfidf_df = pd.DataFrame(tfidf_features.toarray())

    n_gram_features = []
    for text in data["jds_cleaned"]:
        n_grams = []
        for n in (2, 3):  # Example for bigrams and trigrams
            n_grams.extend(extract_ngrams(text, n))
        n_gram_features.append(n_grams)
    n_gram_df = pd.DataFrame(n_gram_features)

    all_features = pd.concat([tfidf_df, n_gram_df], axis=1)
    all_features["description_length"] = data["jds_cleaned"].str.len()

    # Convert all column names to strings (recommended)
    all_features.columns = all_features.columns.astype(str)

    return all_features  # The return statement should be at this level, not indented further

def extract_ngrams(text, n):
    """Extracts n-grams from a text string.

    Args:
        text: The text string to extract n-grams from.
        n: The number of words in each n-gram.

    Returns:
        A list of n-gram phrases.
    """
    return [ngram for ngram in ngrams(text.split(), n)]

# Generate features
all_features = prepare_features(data)


In [None]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.util import ngrams
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import numpy as np  # For potential class label extraction
from sklearn.preprocessing import LabelEncoder  # For handling label encoding# ... (your code for data loading, cleaning, feature engineering)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(all_features, data["job_title"], test_size=0.2)

# Check data types (informative message)
print(f"Type of X_train features: {X_train.dtypes}")
if len(y_train) > 0:
  # Print the type of the first element if y_train is not empty
  print(f"Type of the first element in y_train labels: {type(y_train[0])}")
else:
  print("y_train is empty")



# Handle missing values (if any)
# ... (consider removing rows, imputing missing values, etc.)

# Encode labels (assuming y_train has string labels)
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
# Check for object columns
if any(X_train.dtypes == object):
    for col in X_train.select_dtypes(include=['object']):
        # Option 1: Remove object columns
        X_train = X_train.drop(col, axis=1)
        X_test = X_test.drop(col, axis=1)
  
# Import StandardScaler
from sklearn.preprocessing import StandardScaler      

# Scale features (optional, assuming X_train_scaled is already created)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model
model = RandomForestClassifier()
model.fit(X_train_scaled, y_train)

# ... (rest of your code for prediction and evaluation)



Type of X_train features: 0                     float64
1                     float64
2                     float64
3                     float64
4                     float64
                       ...   
4109                   object
4110                   object
4111                   object
4112                   object
description_length      int64
Length: 9114, dtype: object
Type of the first element in y_train labels: <class 'str'>


In [None]:
# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate the model performance
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score (weighted): {f1:.4f}")

# You can add further evaluation metrics or analysis here (e.g., confusion matrix)
