## Предобработка данных

In [115]:
import pandas as pd
import numpy as np

In [2]:
groups = pd.read_table('groups.txt')
groups.head()

Unnamed: 0,doc_id,variant,group_id,chain_id,link,shift,length,content,tk_shifts,attributes,head,hd_shifts
0,1,1,407840,1070,0,9,5,своих,9,ref:def|str:refl|type:coref,,
1,1,1,407839,1070,407840,47,1,я,47,ref:def|str:pron|type:coref,,
2,1,1,407842,1069,0,69,13,одинокую дачу,6978,ref:def|str:noun|type:coref,дачу,78.0
3,1,1,407841,1069,407842,118,9,этой даче,118123,ref:def|str:noun|type:coref,даче,123.0
4,1,1,407843,1069,407841,166,3,она,166,ref:def|str:pron|type:coref,,


In [3]:
new_groups = groups[groups.chain_id > 0].sort_values(by='chain_id') #отсортировали данные по chain_id

In [4]:
new_groups.loc[:, 'first'] = 0

In [5]:
group_ids = new_groups.groupby("chain_id").aggregate(np.min)['shift'].get_values() # группируем по chain_id и выбираем минимум по shift

for i in range(len(group_ids)):
    new_groups.loc[new_groups[(new_groups['chain_id'] == i+1)&(new_groups['shift'] == group_ids[i])].index, 'first'] = 1

### Получаем списки прилагательных

In [30]:
A = new_groups[(new_groups['first'] == 1)&(new_groups['head'] != None)]['content'].to_string().split()
A = [x.lower() for x in A if x.isalpha()]

In [6]:
from pymorphy2 import MorphAnalyzer

In [7]:
morph = MorphAnalyzer()

In [10]:
adj = []
for word in A:
    p = morph.parse(word)[0]
    if p.tag.POS == 'ADJF':
        if p.normal_form not in adj:
            adj.append(p.normal_form)

In [11]:
f = open('first_adjectives.txt', 'w', encoding = 'utf-8')
for item in adj:
    f.write("%s\n" % item)
f.close()

In [12]:
B = new_groups[(new_groups['first'] == 0)&(new_groups['head'] != None)]['content'].to_string().split()
B = [x.lower() for x in B if x.isalpha()]

In [13]:
adj_nf = []
for word in B:
    p = morph.parse(word)[0]
    if p.tag.POS == 'ADJF':
        if p.normal_form not in adj_nf:
            adj_nf.append(p.normal_form)

In [14]:
f = open('not_first_adjectives.txt', 'w', encoding = 'utf-8')
for item in adj_nf:
    f.write("%s\n" % item)
f.close()

### Подготовка к baseline

In [8]:
documents = pd.read_table('Documents.txt')

#### Признак "совпадение именной группы"

In [9]:
new_groups['content coincidence'] = 0

In [10]:
for i in range(181): # идём по документам
    f = open(documents['path'][i], 'r', encoding = 'utf-8' )
    text = f.read()
    f.close()
    indexes = new_groups[new_groups['doc_id'] == documents['doc_id'][i]]['content'].index 
    for index in indexes: # идём по content'ам
        if text.find(new_groups['content'][index]) == new_groups['shift'][index]: #находим первое упоминание content в тексте и сверяем с настоящим местом content (shift) 
            new_groups.loc[index, 'content coincidence'] = 1 # если первое упоминание совпало с шифтом, то признак 1

#### Признак "совпадение вершины"

In [11]:
new_groups['head coincidence'] = 0

In [12]:
for i in range(181):
    f = open(documents['path'][i], 'r', encoding = 'utf-8')
    text = f.read()
    f.close()
    indexes = new_groups[new_groups['doc_id'] == documents['doc_id'][i]]['head'].index
    for index in indexes:
        if str(text.find(str(new_groups['head'][index]))) == new_groups['hd_shifts'][index]:
            new_groups.loc[index, 'head coincidence'] = 1

#### Признак "является ли местоимением"

In [13]:
new_groups['is_pron'] = 1

In [14]:
indexes = new_groups['content'].index 
for index in indexes: # идём по contentам
    p = morph.parse(new_groups['content'][index])[0]
    if p.tag.POS != 'NOUN': #если ИГ -  не существительное,  
        new_groups.loc[index, 'is_pron'] = 0 # то признак = 0 (тогда вероятно, что это НЕ первое упоминание)

### Baseline (без балансировки данных)

In [15]:
X = new_groups[['content coincidence', 'head coincidence', 'is_pron']]
Y = new_groups['first']

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=12345)

#### Логистическая Регресссия

In [20]:
from sklearn.linear_model import LogisticRegression
lr_clf = LogisticRegression()

In [98]:
lr_clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [99]:
y_pred = lr_clf.predict(X_test)

In [100]:
from sklearn.metrics import classification_report
target_names = ['class 0', 'class 1']
print(classification_report(y_test, y_pred, target_names=target_names))

             precision    recall  f1-score   support

    class 0       0.80      0.96      0.87      3828
    class 1       0.58      0.21      0.31      1122

avg / total       0.75      0.79      0.74      4950



#### SVC

In [103]:
from sklearn.svm import SVC

lin_svm = SVC(kernel='linear', C=10)

In [104]:
lin_svm.fit(X_train, y_train)

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [106]:
y_pred1 = lin_svm.predict(X_test)

In [107]:
print(classification_report(y_test, y_pred1, target_names=target_names))

             precision    recall  f1-score   support

    class 0       0.82      0.94      0.88      3828
    class 1       0.59      0.28      0.38      1122

avg / total       0.77      0.79      0.76      4950



#### Random Forest

In [108]:
from sklearn.ensemble import RandomForestClassifier

In [109]:
rf_clf = RandomForestClassifier(random_state=12345)

In [110]:
rf_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=12345, verbose=0,
            warm_start=False)

In [113]:
y_pred2 = rf_clf.predict(X_test)

In [114]:
print(classification_report(y_test, y_pred2, target_names=target_names))

             precision    recall  f1-score   support

    class 0       0.82      0.94      0.88      3828
    class 1       0.59      0.28      0.38      1122

avg / total       0.77      0.79      0.76      4950



#### KNN

In [121]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)

In [122]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [123]:
y_pred3 = knn.predict(X_test)

In [125]:
print(classification_report(y_test, y_pred3, target_names=target_names))

             precision    recall  f1-score   support

    class 0       0.86      0.83      0.85      3828
    class 1       0.49      0.55      0.52      1122

avg / total       0.78      0.77      0.77      4950



### Балансируем данные с помощью SMOTE

In [116]:
from imblearn.over_sampling import SMOTE

In [117]:
sm = SMOTE(ratio = 0.3, kind='borderline1', random_state=12345)
X_res, y_res = sm.fit_sample(X_train, y_train)



In [118]:
#cтало - было
[[len(X_train.values), sum(y_train.values==0), sum(y_train.values==1)],[len(y_res), sum(y_res==0), sum(y_res==1)]]

[[11548, 9038, 2510], [11749, 9038, 2711]]

#### Логистическая регрессия

In [127]:
lr_clf.fit(X_res, y_res)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [128]:
y_pred = lr_clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=target_names))

             precision    recall  f1-score   support

    class 0       0.80      0.96      0.87      3828
    class 1       0.58      0.21      0.31      1122

avg / total       0.75      0.79      0.74      4950



#### KNN

In [129]:
knn.fit(X_res, y_res)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [130]:
y_pred = knn.predict(X_test)
print(classification_report(y_test, y_pred, target_names=target_names))

             precision    recall  f1-score   support

    class 0       0.78      0.99      0.87      3828
    class 1       0.63      0.04      0.07      1122

avg / total       0.75      0.78      0.69      4950



#### SVC

In [131]:
lin_svm.fit(X_res, y_res)

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [132]:
y_pred = lin_svm.predict(X_test)
print(classification_report(y_test, y_pred, target_names=target_names))

             precision    recall  f1-score   support

    class 0       0.82      0.94      0.88      3828
    class 1       0.59      0.28      0.38      1122

avg / total       0.77      0.79      0.76      4950



#### Random forest

In [133]:
rf_clf.fit(X_res, y_res)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=12345, verbose=0,
            warm_start=False)

In [134]:
y_pred = rf_clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=target_names))

             precision    recall  f1-score   support

    class 0       0.82      0.94      0.88      3828
    class 1       0.59      0.28      0.38      1122

avg / total       0.77      0.79      0.76      4950



#### Выводы по балансировке

Для 3/4 рассмотренных классификаторов результаты после балансировки никак не изменились, для 1 (а именно, KNN), результаты резко ухудшились

## Другие эксперименты

### Длина

In [158]:
new_groups['len'] = 0

In [159]:
new_groups.length.mean()

10.011031640198812

In [160]:
new_groups.loc[new_groups.length >= 10, 'len'] = 1 #4911

In [161]:
X1 = new_groups[['content coincidence', 'head coincidence', 'is_pron', 'len']]

In [162]:
X_train, X_test, y_train, y_test = train_test_split(
    X1, Y, test_size=0.3, random_state=12345)

#### KNN

In [163]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [164]:
y_pred = knn.predict(X_test)
print(classification_report(y_test, y_pred, target_names=target_names))

             precision    recall  f1-score   support

    class 0       0.84      0.89      0.86      3828
    class 1       0.53      0.44      0.48      1122

avg / total       0.77      0.78      0.78      4950



#### Логистическая регрессия

In [166]:
lr_clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [168]:
y_pred = lr_clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=target_names))

             precision    recall  f1-score   support

    class 0       0.83      0.90      0.86      3828
    class 1       0.52      0.37      0.43      1122

avg / total       0.76      0.78      0.77      4950



#### SVC

In [169]:
lin_svm.fit(X_train, y_train)

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [170]:
y_pred = lin_svm.predict(X_test)
print(classification_report(y_test, y_pred, target_names=target_names))

             precision    recall  f1-score   support

    class 0       0.82      0.94      0.88      3828
    class 1       0.59      0.28      0.38      1122

avg / total       0.77      0.79      0.76      4950

