In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the salary dataset into a DataFrame (path is relative to the working directory).
csv_path = "dataset/Salary Data.csv"
df = pd.read_csv(csv_path)

In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0


In [4]:
# List the column labels of the frame.
df.columns

Index(['Age', 'Gender', 'Education Level', 'Job Title', 'Years of Experience',
       'Salary'],
      dtype='object')

In [5]:
# (number of rows, number of columns) of the frame.
df.shape

(375, 6)

In [6]:
# Print the distinct values of every column, one column per line of output.
for _name, series in df.items():
    print(series.unique())

[32. 28. 45. 36. 52. 29. 42. 31. 26. 38. 48. 35. 40. 27. 44. 33. 39. 25.
 51. 34. 47. 30. 41. 37. 24. 43. 50. 46. 49. 23. 53. nan]
['Male' 'Female' nan]
["Bachelor's" "Master's" 'PhD' nan]
['Software Engineer' 'Data Analyst' 'Senior Manager' 'Sales Associate'
 'Director' 'Marketing Analyst' 'Product Manager' 'Sales Manager'
 'Marketing Coordinator' 'Senior Scientist' 'Software Developer'
 'HR Manager' 'Financial Analyst' 'Project Manager' 'Customer Service Rep'
 'Operations Manager' 'Marketing Manager' 'Senior Engineer'
 'Data Entry Clerk' 'Sales Director' 'Business Analyst' 'VP of Operations'
 'IT Support' 'Recruiter' 'Financial Manager' 'Social Media Specialist'
 'Software Manager' 'Junior Developer' 'Senior Consultant'
 'Product Designer' 'CEO' 'Accountant' 'Data Scientist'
 'Marketing Specialist' 'Technical Writer' 'HR Generalist'
 'Project Engineer' 'Customer Success Rep' 'Sales Executive' 'UX Designer'
 'Operations Director' 'Network Engineer' 'Administrative Assistant'
 'Strateg

In [7]:
# Count missing values per column.
df.isna().sum()

Age                    2
Gender                 2
Education Level        2
Job Title              2
Years of Experience    2
Salary                 2
dtype: int64

In [8]:
# Show every row that contains at least one missing value.
print(df[df.isna().any(axis=1)])

     Age Gender Education Level Job Title  Years of Experience  Salary
172  NaN    NaN             NaN       NaN                  NaN     NaN
260  NaN    NaN             NaN       NaN                  NaN     NaN


In [9]:
# Remove, in place, every row that has any missing value.
df.dropna(axis=0, how="any", inplace=True)

In [10]:
# Re-count missing values per column after dropping the incomplete rows.
df.isna().sum()

Age                    0
Gender                 0
Education Level        0
Job Title              0
Years of Experience    0
Salary                 0
dtype: int64

In [11]:
# Drop the 'Job Title' column in place. `columns=` already selects the axis,
# so no separate `axis` argument is needed.
# Re-running this cell raises KeyError because the column is already gone.
df.drop(columns="Job Title", inplace=True)

In [12]:
# Preview the first two rows after removing 'Job Title'.
df.head(2)

Unnamed: 0,Age,Gender,Education Level,Years of Experience,Salary
0,32.0,Male,Bachelor's,5.0,90000.0
1,28.0,Female,Master's,3.0,65000.0


In [13]:
# Split the frame into features and target.
# Select the features by name (everything except 'Salary') rather than by
# position, so X and y stay consistent even if the column order of df changes.
X = df.drop(columns="Salary")
y = df["Salary"]

In [14]:
# Display the feature frame.
X

Unnamed: 0,Age,Gender,Education Level,Years of Experience
0,32.0,Male,Bachelor's,5.0
1,28.0,Female,Master's,3.0
2,45.0,Male,PhD,15.0
3,36.0,Female,Bachelor's,7.0
4,52.0,Male,Master's,20.0
...,...,...,...,...
370,35.0,Female,Bachelor's,8.0
371,43.0,Male,Master's,19.0
372,29.0,Female,Bachelor's,2.0
373,34.0,Male,Bachelor's,7.0


In [15]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [16]:
from sklearn.preprocessing import LabelEncoder

In [17]:
# Create the encoder used below to map the 'Gender' strings to integer codes.
# NOTE(review): scikit-learn documents LabelEncoder as intended for target
# labels; OrdinalEncoder is the feature-side equivalent. Confirm before
# changing, since this object is pickled later in the notebook.
label_encoder = LabelEncoder()

In [18]:
# Learn the gender classes from the training rows only, then replace the
# column with the resulting integer codes.
label_encoder.fit(X_train["Gender"])
X_train["Gender"] = label_encoder.transform(X_train["Gender"])

In [19]:
# Check the first two training rows after encoding 'Gender'.
X_train.head(2)

Unnamed: 0,Age,Gender,Education Level,Years of Experience
36,35.0,0,Bachelor's,7.0
158,51.0,0,PhD,23.0


In [20]:
# Encode the test-set genders with the encoder that was fitted on the
# training set (transform only, no refit).
gender_codes_test = label_encoder.transform(X_test["Gender"])
X_test["Gender"] = gender_codes_test

In [21]:
from sklearn.preprocessing import OneHotEncoder

In [22]:
# Create the one-hot encoder that is applied to the 'Education Level' column below.
onehot_encoder = OneHotEncoder()

In [26]:
# Learn the education categories from the training rows only and expand them
# into one indicator column per category, as a dense array.
education_train = X_train[['Education Level']]
X_train_encoded = onehot_encoder.fit(education_train).transform(education_train).toarray()

In [27]:
# Put the one-hot columns first, followed by the remaining training columns
# with 'Education Level' removed.
remaining_train = X_train.drop(columns=['Education Level']).values
X_train_final = np.concatenate((X_train_encoded, remaining_train), axis=1)

In [28]:
# Label the stacked training matrix: one-hot column names first, then the
# remaining columns in their original order.
encoded_names = onehot_encoder.get_feature_names_out(['Education Level'])
passthrough_names = X_train.drop(columns=['Education Level']).columns
column_names = [*encoded_names, *passthrough_names]
X_train_final = pd.DataFrame(data=X_train_final, columns=column_names)

In [29]:
# Print the first two rows of the assembled training feature frame.
print(X_train_final.head(2))

   Education Level_Bachelor's  Education Level_Master's  Education Level_PhD  \
0                         1.0                       0.0                  0.0   
1                         0.0                       0.0                  1.0   

    Age  Gender  Years of Experience  
0  35.0     0.0                  7.0  
1  51.0     0.0                 23.0  


In [30]:
# Count missing values per column of the training features.
X_train.isna().sum()

Age                    0
Gender                 0
Education Level        0
Years of Experience    0
dtype: int64

In [31]:
# Encode the test set with the encoder already fitted on the training data.
# Use transform, not fit_transform: refitting here would replace the
# training-set categories, could change the column layout if the test split
# lacks a category, and would leave the encoder that is pickled later fitted
# on test data.
X_test_encoded = onehot_encoder.transform(X_test[['Education Level']]).toarray()

In [32]:
# Assemble the test feature matrix in the same column layout as the training
# frame and label it with the training column names, so that a model fitted
# on a named DataFrame is not given an unnamed ndarray at predict time.
X_test_final = pd.DataFrame(
    np.hstack([X_test_encoded, X_test.drop(columns=['Education Level']).values]),
    columns=X_train_final.columns,
)

In [33]:
from sklearn.neighbors import KNeighborsRegressor

In [34]:
# Baseline model: k-nearest-neighbours regression with k=6.
# NOTE(review): the features are passed unscaled, so distances mix one-hot
# indicators with Age and Years of Experience. Confirm that this is intended.
regressor = KNeighborsRegressor(algorithm='auto', n_neighbors=6)
regressor.fit(X=X_train_final, y=y_train)

In [36]:
# Predict salaries for the held-out test features with the baseline model.
y_pred=regressor.predict(X_test_final)



In [37]:
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [38]:
# Report R^2, mean absolute error and mean squared error (in that order)
# for the baseline predictions.
for metric in (r2_score, mean_absolute_error, mean_squared_error):
    print(metric(y_test, y_pred))

0.8872330378555893
10736.424731182797
254207081.09318995


In [39]:
from sklearn.model_selection import GridSearchCV

In [40]:
# Candidate neighbour counts for the grid search: every integer from 1 to 10.
param_grid = {'n_neighbors': list(range(1, 11))}

In [41]:
# 5-fold cross-validated search over param_grid. refit=True retrains the best
# configuration on the full data passed to fit; verbose=3 logs every fold.
grid = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=param_grid,
    cv=5,
    refit=True,
    verbose=3,
)

In [42]:
# Run the cross-validated search over param_grid on the training data.
grid.fit(X_train_final, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END .....................n_neighbors=1;, score=0.835 total time=   0.0s
[CV 2/5] END .....................n_neighbors=1;, score=0.843 total time=   0.0s
[CV 3/5] END .....................n_neighbors=1;, score=0.789 total time=   0.0s
[CV 4/5] END .....................n_neighbors=1;, score=0.748 total time=   0.0s
[CV 5/5] END .....................n_neighbors=1;, score=0.754 total time=   0.0s
[CV 1/5] END .....................n_neighbors=2;, score=0.824 total time=   0.0s
[CV 2/5] END .....................n_neighbors=2;, score=0.847 total time=   0.0s
[CV 3/5] END .....................n_neighbors=2;, score=0.859 total time=   0.0s
[CV 4/5] END .....................n_neighbors=2;, score=0.806 total time=   0.0s
[CV 5/5] END .....................n_neighbors=2;, score=0.815 total time=   0.0s
[CV 1/5] END .....................n_neighbors=3;, score=0.859 total time=   0.0s
[CV 2/5] END .....................n_neighbors=3;

  _data = np.array(data, dtype=dtype, copy=copy,


In [43]:
# Parameter setting selected by the grid search.
grid.best_params_

{'n_neighbors': 7}

In [46]:
# Predict with the fitted grid-search object, then report R^2, mean absolute
# error and mean squared error (in that order) on the test set.
y_pred2 = grid.predict(X_test_final)
for metric in (r2_score, mean_absolute_error, mean_squared_error):
    print(metric(y_test, y_pred2))

0.8885202029630282
11034.043778801846
251305464.53258723




In [47]:
import pickle

In [48]:
# Persist the fitted grid-search object to 'reg_model.pkl'.
with open('reg_model.pkl', mode='wb') as model_file:
    pickle.dump(grid, model_file)

In [49]:
# Persist the 'Gender' label encoder to 'label_encoder.pkl'.
with open('label_encoder.pkl', mode='wb') as encoder_file:
    pickle.dump(label_encoder, encoder_file)

In [50]:
# Persist the 'Education Level' one-hot encoder to 'onehot_encoder.pkl'.
with open('onehot_encoder.pkl', mode='wb') as encoder_file:
    pickle.dump(onehot_encoder, encoder_file)