Data Preprocessing

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
import joblib

# Load the job listings used to train the salary model.
# NOTE(review): absolute, machine-specific path; this cell only runs where that
# file exists. Consider a configurable data directory.
data = pd.read_csv(r"C:\Users\Niti\NEXT HIKES\PROJECT 8\jobs.csv")

# Cast the text columns to str so TfidfVectorizer always receives strings.
# astype(str) renders missing values as the literal text "nan", which is then
# tokenized like any other word.
text_columns = ['job_title', 'company', 'location']
for col in text_columns:
    data[col] = data[col].astype(str)

# One TF-IDF vectorizer per text column, each capped at 100 vocabulary terms.
# They are kept as separate objects because each one is saved to its own file below.
tfidf_job_title = TfidfVectorizer(max_features=100)
tfidf_company = TfidfVectorizer(max_features=100)
tfidf_location = TfidfVectorizer(max_features=100)

X_job_title = tfidf_job_title.fit_transform(data['job_title'])
X_company = tfidf_company.fit_transform(data['company'])
X_location = tfidf_location.fit_transform(data['location'])

# Place the three TF-IDF blocks side by side to form the feature matrix.
# Salary is NOT part of X; it is the prediction target (y) below.
X = pd.concat([
    pd.DataFrame(X_job_title.toarray(), index=data.index),
    pd.DataFrame(X_company.toarray(), index=data.index),
    pd.DataFrame(X_location.toarray(), index=data.index)
], axis=1)
# Each block is labelled 0..k-1, so after concatenation the labels repeat across
# blocks; columns are therefore identified by position, not by a unique name.
X.columns = X.columns.astype(str)  # Convert column names to strings

y = data['salary']  # Target variable: the salary column

# Define the model pipeline (a single regressor step).
# A fixed random_state makes retraining reproduce the same forest; 42 matches the
# seed used for the budget model later in this notebook.
model = Pipeline([
    ('regressor', RandomForestRegressor(random_state=42))
])

# Fit the model
model.fit(X, y)

# Save the model and the TF-IDF vectorizers (one file each, in the working directory).
# NOTE(review): the "Saving the Model" cell at the end of this notebook dumps a
# different model to the same 'budget_predictor_model.pkl' path, so after a
# top-to-bottom run that file no longer holds the pipeline fitted here.
joblib.dump(model, 'budget_predictor_model.pkl')
joblib.dump(tfidf_job_title, 'tfidf_job_title.pkl')
joblib.dump(tfidf_company, 'tfidf_company.pkl')
joblib.dump(tfidf_location, 'tfidf_location.pkl')


['tfidf_location.pkl']

In [1]:
import pandas as pd

# Load the raw Upwork job postings.
# NOTE(review): absolute, machine-specific path; this cell only runs where that
# file exists. Consider a configurable data directory.
df = pd.read_csv(r"C:\Users\Niti\NEXT HIKES\PROJECT 8\all_upwork_jobs_next.csv")

# Parse 'published_date' into datetimes so the .dt accessor can be used below
df['published_date'] = pd.to_datetime(df['published_date'])

# Extract month and year as separate numeric columns
df['month'] = df['published_date'].dt.month
df['year'] = df['published_date'].dt.year

# Fill missing budgets with the column median.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df['budget']: that form operates on a selected
# column, which pandas warns about and which does not update df when
# Copy-on-Write is active, leaving the missing values in place.
# NOTE(review): 'budget' is the prediction target in the next cell, so the
# median-filled rows end up in both the training and the test data.
df['budget'] = df['budget'].fillna(df['budget'].median())

# One-hot encode the categorical columns; drop_first=True omits the first level
# of each so the indicator columns are not redundant
df = pd.get_dummies(df, columns=['country', 'category'], drop_first=True)

# Row-wise mean of the two hourly-rate bounds. mean() skips missing values by
# default, so a row with only one bound takes that bound and a row with
# neither stays NaN.
df['avg_salary'] = df[['hourly_low', 'hourly_high']].mean(axis=1)


Model Building

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Everything in df except these four columns is used as a model input;
# 'budget' is excluded because it is the value being predicted.
non_feature_columns = ['title', 'link', 'published_date', 'budget']
X = df.drop(non_feature_columns, axis=1)
y = df['budget']

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build a 100-tree random forest with a fixed seed and fit it on the training
# rows (fit() returns the estimator itself, so `model` is the fitted forest)
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows and report the mean squared error
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error: {mse}')


Mean Squared Error: 144283947.7489486


Saving the Model

In [3]:
import joblib

# Persist the `model` currently in scope to the working directory (when the
# notebook is run in order, this is the forest fitted in "Model Building").
# NOTE(review): the first cell writes its TF-IDF salary pipeline to this same
# filename, so this call replaces that file.
joblib.dump(value=model, filename='budget_predictor_model.pkl')


['budget_predictor_model.pkl']