In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## Загружаем незаполненные данные

In [2]:
df = pd.read_csv('training_df_unfilled_2.09.csv')

## Заполняем

In [3]:
df.contract_penalty_percentage.fillna(0.001, inplace=True)

In [180]:
# training_df.delay.fillna(training_df.delay.median(), inplace=True)

Решили заполнять просрочку распределением Парето и переводить в диапозон от 7 до 1007.

In [4]:
for i, val in enumerate(df.delay):
    if np.isnan(val):
        new_value = round(np.random.pareto(10) * 1000) + 7
        df['delay'].iloc[i] = new_value

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['delay'].iloc[i] = new_value


### Для заполнения цены контракта попробуем обучать модель

In [5]:
df['undefined_contract_price'] = df.contract_price.isnull()

In [6]:
df.shape

(125, 12)

In [7]:
training = df[df.undefined_contract_price == False]

In [8]:
training.shape

(103, 12)

Признаки:
- claimed_penalty
- debt_amount
- type_of_contract

In [9]:
training['debt_amount'].fillna(training.debt_amount.median(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training['debt_amount'].fillna(training.debt_amount.median(), inplace=True)


In [10]:
training = training[(training.contract_price < 10 ** 7)]

#### Переводим категориальные данные в one-hot вектора

In [11]:
def transform_into_onehot(column_name, df):
    encoding_dict = {
        c: i for (i, c) in enumerate(df[column_name].unique())
    }
    encoded = np.zeros((len(df), len(encoding_dict)))
    for i, val in enumerate(df[column_name]):
        one_place = encoding_dict[val]
        encoded[i][one_place] = 1
    return encoded

In [12]:
encoded_type_of_contract = transform_into_onehot('type_of_contract', training)

In [13]:
features = np.hstack([training[['claimed_penalty', 'debt_amount']].values])

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn import metrics

In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    features, training.contract_price, random_state=42)

In [17]:
model = KNeighborsRegressor()

In [18]:
model.fit(X_train, y_train)

In [19]:
pred = model.predict(X_test)

In [20]:
metrics.mean_squared_error(y_test, pred)

6615024895311.943

In [21]:
metrics.mean_absolute_error(y_test, pred)

1882265.7789473683

In [22]:
results = pd.DataFrame({'real': y_test, 'predicted': pred})
results

Unnamed: 0,real,predicted
10,400281.0,955747.0
107,148180.0,153660.0
26,6500587.0,2300909.2
2,5802301.0,2553831.4
70,1122210.0,5758836.2
104,4727543.0,2394273.6
54,107800.0,4513978.2
29,5093668.0,3609017.4
116,165745.0,274288.0
108,837978.0,1143560.8


#### Заполняем пропуски по цене конракта

In [23]:
debt_default_value = training.debt_amount.median()

In [24]:
debt_default_value

851388.5

In [25]:
for i, (val, debt, claimed_penalty) in enumerate(df[['contract_price', 'debt_amount', 'claimed_penalty']].values):
    if np.isnan(val):
        debt = debt if not np.isnan(debt) else debt_default_value
        features = [[claimed_penalty, debt]]
        new_value = model.predict(features)[0]
        df['contract_price'].iloc[i] = new_value

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['contract_price'].iloc[i] = new_value
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['contract_price'].iloc[i] = new_value
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['contract_price'].iloc[i] = new_value
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['contract_price'].iloc[i] = new_value
A va

### Для заполнения debt_amount попробуем обучать модель

In [6]:
df.shape

(125, 12)

In [26]:
training = df[df.debt_amount.notnull()]
training.shape

(99, 12)

Признаки:
- claimed_penalty
- contract_price
- delay (?)

In [27]:
training = training[(training.contract_price < 10 ** 7)]

In [29]:
training.shape

(81, 12)

In [56]:
features = np.hstack([training[['claimed_penalty', 'contract_price']].values])

In [57]:
X_train, X_test, y_train, y_test = train_test_split(
    features, training.debt_amount, random_state=42)

In [58]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn import metrics

In [59]:
model = KNeighborsRegressor()

In [60]:
model.fit(X_train, y_train)

In [61]:
pred = model.predict(X_test)

In [62]:
metrics.mean_squared_error(y_test, pred)

2467534390479.176

In [63]:
metrics.mean_absolute_error(y_test, pred)

948988.9238095239

In [64]:
results = pd.DataFrame({'real': y_test, 'predicted': pred})
results

Unnamed: 0,real,predicted
52,228864.0,144949.4
5,21500.0,790816.4
39,8012460.0,3687719.6
55,224926.0,257878.6
33,4919631.0,4735957.4
47,300000.0,4722573.2
24,13645.0,244267.6
110,14840.0,689418.6
10,370281.0,240845.6
27,2820068.0,1459154.6


#### Заполняем пропуски по сумме долга

In [66]:
for i, (price, debt, claimed_penalty) in enumerate(df[['contract_price', 'debt_amount', 'claimed_penalty']].values):
    if np.isnan(debt):
        features = [[claimed_penalty, price]]
        new_value = model.predict(features)[0]
        df['debt_amount'].iloc[i] = new_value

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['debt_amount'].iloc[i] = new_value
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['debt_amount'].iloc[i] = new_value
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['debt_amount'].iloc[i] = new_value
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['debt_amount'].iloc[i] = new_value
A value is tryin

In [68]:
df.to_csv('training_data_filled_11.09.csv', index=False)