In [2]:
import eli5.sklearn
import pandas as pd
import json

In [3]:
df = pd.read_csv('/Users/m.e.zubkova/Documents/diploma_final.csv')

In [4]:
male_names = pd.read_json('/Users/m.e.zubkova/Downloads/nlp-gender/male_names.json')
female_names = pd.read_json('/Users/m.e.zubkova/Downloads/nlp-gender/female_names.json')

In [5]:
def sex_classify(name: str):
    if name in male_names.values:
        return 'male'
    if name in female_names.values:
        return 'female'
    return 'not in list of names'

In [6]:
df['freelancer_first_name'] = df.freelancer_name.apply(lambda name: name.split()[0])

In [7]:
df['freelancer_gender'] = df.freelancer_first_name.apply(sex_classify)

Часть имен мы вручную вписали в словарь для определения пола большего числа фрилансеров.

In [8]:
# определим гендер для случаев, когда пользователи поменяли местами имя и фамилию

df.loc[
    (df['freelancer_gender'] == 'not in list of names'), 'freelancer_gender'
] = df.freelancer_name.apply(
    lambda name: sex_classify(name.split()[1]) if len(name.split()) > 1 else 'not in list of names')

In [9]:
df[df['freelancer_gender'] == 'not in list of names'].shape

# без гендера осталось 604 отзыва, их мы не будем учитывать в последующем анализе

(604, 7)

In [10]:
gendered = df[df['freelancer_gender'] != 'not in list of names']
gendered.freelancer_gender.value_counts()

female    6444
male      3781
Name: freelancer_gender, dtype: int64

In [11]:
gendered[gendered.texts == 'Нет отзыва'].freelancer_gender.value_counts()

female    911
male      334
Name: freelancer_gender, dtype: int64

In [12]:
def no_text(texts: pd.Series):
    """
    Функция подсчитывает долю отзывов без текста от всех отзывов (необходимо наличие оценки)
    """
    return len(texts[texts == 'Нет отзыва']) / len(texts)

In [13]:
gendered.groupby('freelancer_gender').agg({'texts': no_text})

# доля отзывов без текста у мужчин ниже, воспользуемся критерием хи-квадрат для проверки

Unnamed: 0_level_0,texts
freelancer_gender,Unnamed: 1_level_1
female,0.141372
male,0.088336


In [14]:
gendered['texts_existence'] = gendered.texts.apply(lambda text: text != 'Нет отзыва')
ct1 = pd.crosstab(gendered['freelancer_gender'], gendered['texts_existence'])
ct1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gendered['texts_existence'] = gendered.texts.apply(lambda text: text != 'Нет отзыва')


texts_existence,False,True
freelancer_gender,Unnamed: 1_level_1,Unnamed: 2_level_1
female,911,5533
male,334,3447


In [15]:
from scipy.stats import chi2_contingency

In [16]:
chi2_contingency(ct1)

# тест хи-квадрат показывает, что с вероятностью 99% наличие текста отзыва в отзывах на мужчин-фрилансеров и на женщин-фрилансерок значимо различается

(62.182501680642176,
 3.130573149307869e-15,
 1,
 array([[ 784.62396088, 5659.37603912],
        [ 460.37603912, 3320.62396088]]))

In [17]:
# почистим от текстов без отзывов

with_reviews = gendered[gendered.texts != 'Нет отзыва']

In [18]:
from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation

In [19]:
mystem = Mystem()
russian_stopwords = stopwords.words("russian")

In [20]:
# базово предобработаем тексты

def preprocess_text(text):
    tokens = mystem.lemmatize(text.lower())
    tokens = [token for token in tokens if token not in russian_stopwords
              and token != " "
              and token.strip() not in punctuation]

    return tokens

In [21]:
with_reviews['texts_preprocessed'] = with_reviews.texts.apply(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  with_reviews['texts_preprocessed'] = with_reviews.texts.apply(preprocess_text)


In [22]:
set_of_names = set(map(lambda x: x.lower(), female_names.T.values.tolist()[0])) | set(map(lambda x: x.lower(), male_names.T.values.tolist()[0]))

In [23]:
# проверим наличие имен в текстах отзывов
with_reviews['name_in_review'] = with_reviews.texts_preprocessed.apply(lambda text: len(set(text) & set_of_names) > 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  with_reviews['name_in_review'] = with_reviews.texts_preprocessed.apply(lambda text: len(set(text) & set_of_names) > 0)


In [24]:
ct2 = pd.crosstab(with_reviews['freelancer_gender'], with_reviews['name_in_review'])
ct2

name_in_review,False,True
freelancer_gender,Unnamed: 1_level_1,Unnamed: 2_level_1
female,3037,2496
male,2121,1326


In [25]:
chi2_contingency(ct2)

# тест хи-квадрат показывает, что с вероятностью 99% наличие имени в отзывах на мужчин-фрилансеров и на женщин-фрилансерок значимо различается

(38.06627389047619,
 6.838196863551224e-10,
 1,
 array([[3178.08619154, 2354.91380846],
        [1979.91380846, 1467.08619154]]))

In [26]:
ct3 = pd.crosstab(with_reviews['marks'], with_reviews['name_in_review'])
ct3

name_in_review,False,True
marks,Unnamed: 1_level_1,Unnamed: 2_level_1
1,151,47
2,66,26
3,33,11
4,88,20
5,1690,1047
Пять с плюсом,3130,2671


In [27]:
chi2_contingency(ct3)

# имена в отзывах чаще встречаются в положительных отзывах

(117.0366358159197,
 1.3310762579059365e-23,
 5,
 array([[ 113.72873051,   84.27126949],
        [  52.84365256,   39.15634744],
        [  25.27305122,   18.72694878],
        [  62.03385301,   45.96614699],
        [1572.0986637 , 1164.9013363 ],
        [3332.022049  , 2468.977951  ]]))

In [29]:
# посмотрим на наличие слов, указывающих на гендер фрилансера в отзывах
gendered_words = ['девушка', 'женщина', 'мужчина', 'человек']

In [34]:
with_reviews['gender_in_text'] = with_reviews.texts_preprocessed.apply(lambda tokens: len(set(tokens) & set(gendered_words)) != 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  with_reviews['gender_in_text'] = with_reviews.texts_preprocessed.apply(lambda tokens: len(set(tokens) & set(gendered_words)) != 0)


In [41]:
ct4 = pd.crosstab(with_reviews['freelancer_gender'], with_reviews['gender_in_text'])
ct4

gender_in_text,False,True
freelancer_gender,Unnamed: 1_level_1,Unnamed: 2_level_1
female,5124,409
male,3247,200


In [42]:
chi2_contingency(ct4)

# с вероятностью 99% встречаемость слов, указывающих на гендер фрилансеров в текстах отзывов, различается

(8.24225694242785,
 0.004092621120725573,
 1,
 array([[5157.76648107,  375.23351893],
        [3213.23351893,  233.76648107]]))

In [85]:
# удалим имена из текстов отзывов

def del_name(tokens):
    return [token for token in tokens if token not in set_of_names]

In [93]:
with_reviews['texts_preprocessed'] = with_reviews['texts_preprocessed'].apply(del_name)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  with_reviews['texts_preprocessed'] = with_reviews['texts_preprocessed'].apply(del_name)


Построим модель на наших данных. Зависимая переменная - оценка, независимая - эмбеддинги текстов отзывов. В качестве эмбеддера мы возьмем предобученные SOTA-эмбеддинги navec для русского языка из библиотеки Natasha. https://github.com/natasha/navec

In [47]:
from navec import Navec

In [54]:
path = 'navec_hudlit_v1_12B_500K_300d_100q.tar'

In [55]:
navec = Navec.load(path)

In [66]:
np.mean(list(map(lambda word: navec[word], with_reviews.texts_preprocessed.loc[0])), axis=0)

dtype('float32')

In [61]:
import numpy as np

In [72]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import f1_score, mean_squared_error, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler

In [75]:
with_reviews.loc[with_reviews.marks == 'Пять с плюсом', 'marks'] = 6

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [77]:
with_reviews['marks'] = with_reviews.marks.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  with_reviews['marks'] = with_reviews.marks.astype(int)


In [94]:
with_reviews['texts_joined'] = with_reviews.texts_preprocessed.apply(lambda tokens: " ".join(tokens))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  with_reviews['texts_joined'] = with_reviews.texts_preprocessed.apply(lambda tokens: " ".join(tokens))


In [96]:
women = with_reviews[with_reviews.freelancer_gender == 'female'][['texts_joined', 'marks']]
men = with_reviews[with_reviews.freelancer_gender == 'male'][['texts_joined', 'marks']]

In [97]:
x_train_m, x_test_m, y_train_m, y_test_m = train_test_split(men.texts_joined, men.marks, random_state=42)
x_train_f, x_test_f, y_train_f, y_test_f = train_test_split(women.texts_joined, women.marks, random_state=42)

In [144]:
vec_m = TfidfVectorizer(ngram_range=(1, 3))
vec_train_m = vec_m.fit_transform(x_train_m)
vec_test_m = vec_m.transform(x_test_m)


scaler_m = MaxAbsScaler()
vec_train_m = scaler_m.fit_transform(vec_train_m)
vec_test_m = scaler_m.transform(vec_test_m)

In [145]:
linear_m = LinearRegression()
linear_m.fit(vec_train_m, y_train_m)
preds_m = linear_m.predict(vec_test_m)

In [146]:
mean_squared_error(preds_m, y_test_m)

0.3667394686425137

In [103]:
import eli5

In [147]:
eli5.sklearn.explain_weights_sklearn(linear_m, vec=vec_m)

Weight?,Feature
+5.211,<BIAS>
+1.202,хороший рекомендовать
+1.050,спасибо рекомендовать
+0.983,быстро качественно выполнять
+0.937,отлично рекомендовать
… 29481 more positive …,… 29481 more positive …
… 17563 more negative …,… 17563 more negative …
-0.827,четко срок
-0.831,относиться работа
-0.957,предоплата


In [148]:
vec_f = TfidfVectorizer(ngram_range=(1, 3))
vec_train_f = vec_f.fit_transform(x_train_f)
vec_test_f = vec_f.transform(x_test_f)


scaler_f = MaxAbsScaler()
vec_train_f = scaler_f.fit_transform(vec_train_f)
vec_test_f = scaler_f.transform(vec_test_f)

In [149]:
linear_f = LinearRegression()
linear_f.fit(vec_train_f, y_train_f)
preds_f = linear_f.predict(vec_test_f)

In [150]:
mean_squared_error(preds_f, y_test_f)

0.6104155488984659

In [151]:
eli5.sklearn.explain_weights_sklearn(linear_f, vec=vec_f)

Weight?,Feature
+4.824,<BIAS>
+1.176,молодец
+1.176,шикарно
+1.176,волшебница
+1.176,бал
+1.176,четко
+1.176,талант
+1.176,проходить
+1.176,очень
… 47661 more positive …,… 47661 more positive …


In [None]:
негатив: безответственный, безответственный человек, недобросовестный специалист, относиться к работе
поозитив: быстро качественно выполнять,