In [1554]:
import pandas as pd

train_data = pd.read_csv('salary_train.csv', header=0, sep=',')

train_data.head(3)

Unnamed: 0,Id,algebra,programming,data science,robotics,economics,job,salary
0,0,87,62,86,61,90,junior developer,140000
1,1,76,84,76,80,79,data scientist,780000
2,2,56,55,99,82,98,developer,210000


In [1555]:
predict_data = pd.read_csv('salary_predict.csv', header=0, sep=',')

predict_data.head(3)

Unnamed: 0,Id,algebra,programming,data science,robotics,economics,job,salary
0,9000,73,59,57,54,61,robotics engineer,0
1,9001,77,80,53,93,80,senior developer,0
2,9002,95,72,88,63,84,developer,0


In [1556]:
# проверяю наличие nan
assert train_data.isna().sum().sum() == 0
assert predict_data.isna().sum().sum() == 0

In [1557]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Id            9000 non-null   int64 
 1   algebra       9000 non-null   int64 
 2   programming   9000 non-null   int64 
 3   data science  9000 non-null   int64 
 4   robotics      9000 non-null   int64 
 5   economics     9000 non-null   int64 
 6   job           9000 non-null   object
 7   salary        9000 non-null   int64 
dtypes: int64(7), object(1)
memory usage: 562.6+ KB


In [1558]:
predict_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Id            1000 non-null   int64 
 1   algebra       1000 non-null   int64 
 2   programming   1000 non-null   int64 
 3   data science  1000 non-null   int64 
 4   robotics      1000 non-null   int64 
 5   economics     1000 non-null   int64 
 6   job           1000 non-null   object
 7   salary        1000 non-null   int64 
dtypes: int64(7), object(1)
memory usage: 62.6+ KB


In [1559]:
# заменяю наименования должностей на категорию
train_jobs = train_data.job.unique()
predict_jobs = predict_data.job.unique()

assert len(train_jobs) == len(predict_jobs)

for job in train_jobs:
    assert job in predict_jobs

jobs_dict = dict(zip(train_jobs, range(1, len(train_jobs) + 1)))

for job in jobs_dict.keys():
    train_data[job] = train_data.job.map({job: 1})
    predict_data[job] = predict_data.job.map({job: 1})

In [1560]:
train_data.drop(train_data.job.name, axis=1, inplace=True)
train_data.drop(train_data.Id.name, axis=1, inplace=True)

for job in jobs_dict.keys():
    train_data[job] = train_data[job].fillna(0)
    train_data[job] = train_data[job].astype(int)

train_data.head(10)

Unnamed: 0,algebra,programming,data science,robotics,economics,salary,junior developer,data scientist,developer,economist,robotics engineer,senior developer
0,87,62,86,61,90,140000,1,0,0,0,0,0
1,76,84,76,80,79,780000,0,1,0,0,0,0
2,56,55,99,82,98,210000,0,0,1,0,0,0
3,99,66,65,84,58,420000,0,0,0,1,0,0
4,73,87,56,84,73,760000,0,1,0,0,0,0
5,59,91,52,63,54,790000,0,0,0,0,1,0
6,73,56,82,60,88,510000,0,0,0,0,0,1
7,79,52,95,83,94,450000,0,0,0,1,0,0
8,91,95,99,62,96,380000,0,0,1,0,0,0
9,98,93,66,94,64,350000,0,0,1,0,0,0


In [1561]:
predict_data.drop(predict_data.job.name, axis=1, inplace=True)
ids = predict_data.Id
predict_data.drop(predict_data.Id.name, axis=1, inplace=True)

for job in jobs_dict.keys():
    predict_data[job] = predict_data[job].fillna(0)
    predict_data[job] = predict_data[job].astype(int)

predict_data.head()

Unnamed: 0,algebra,programming,data science,robotics,economics,salary,junior developer,data scientist,developer,economist,robotics engineer,senior developer
0,73,59,57,54,61,0,0,0,0,0,1,0
1,77,80,53,93,80,0,0,0,0,0,0,1
2,95,72,88,63,84,0,0,0,1,0,0,0
3,83,88,97,75,50,0,0,0,0,0,1,0
4,52,85,92,87,62,0,1,0,0,0,0,0


In [1562]:
# разбиваю данные на X & y

X_train = train_data.drop(train_data.salary.name, axis=1)
y_train = train_data.salary

X_predict = predict_data.drop(predict_data.salary.name, axis=1)

In [1563]:
from sklearn import preprocessing

scaler = preprocessing.StandardScaler().fit(X_train)

X_train_scaled = pd.DataFrame(scaler.transform(X_train))

X_train_scaled.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.864853,-0.862748,0.812448,-0.936176,1.067866,2.266047,-0.443272,-0.452213,-0.446677,-0.454884,-0.444886
1,0.104661,0.655675,0.115074,0.386751,0.304085,-0.441297,2.25595,-0.452213,-0.446677,-0.454884,-0.444886
2,-1.277506,-1.345883,1.719034,0.526007,1.623343,-0.441297,-0.443272,2.211346,-0.446677,-0.454884,-0.444886


In [1564]:
scaler = preprocessing.StandardScaler().fit(X_predict)

X_predict_scaled = pd.DataFrame(scaler.transform(X_predict))

X_predict_scaled.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,-0.099726,-1.164134,-1.302828,-1.413091,-0.929077,-0.45897,-0.463753,-0.431555,-0.444528,2.266047,-0.442913
1,0.180207,0.323532,-1.589637,1.32573,0.357827,-0.45897,-0.463753,-0.431555,-0.444528,-0.441297,2.257778
2,1.439909,-0.243198,0.919939,-0.781056,0.628755,-0.45897,-0.463753,2.317202,-0.444528,-0.441297,-0.442913


In [1565]:
y_train.head(3)

0    140000
1    780000
2    210000
Name: salary, dtype: int64

In [1566]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
model = RandomForestRegressor(n_jobs=-1)
model.fit(X_train_scaled, y_train)

RandomForestRegressor(n_jobs=-1)

In [1567]:
y_predict = pd.DataFrame(ids)
y_predict['salary'] = model.predict(X_predict_scaled)
y_predict.loc[y_predict.salary > 1000000, y_predict.salary.name] = 1000000

y_predict.to_csv('salary_submition.csv', sep=',', header=True, index=False)

y_predict

Unnamed: 0,Id,salary
0,9000,690000.0
1,9001,553600.0
2,9002,328800.0
3,9003,1000000.0
4,9004,145600.0
...,...,...
995,9995,181700.0
996,9996,368300.0
997,9997,329500.0
998,9998,134100.0
