In [39]:
import pandas as pd
import numpy as np

In [40]:
data_r = pd.read_csv("../data/stg/sentiment/rureviews.csv")
data_s = pd.read_csv("../data/stg/sentiment/senteval2016.csv")

In [94]:
data_s.sentiment.value_counts()

 0    49943
-1     5702
 1     2334
Name: sentiment, dtype: int64

In [41]:
X_r, y_r = data_r["review"], data_r["sentiment"]
X_s, y_s = data_s["review"], data_s["sentiment"]

In [42]:
from sklearn.model_selection import train_test_split

X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_r, y_r, test_size=.3, shuffle=True, random_state=17,
    stratify=y_r
)

X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_s, y_s, test_size=.3, shuffle=True, random_state=17, stratify=y_s
)

In [56]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.base import clone

model_r = make_pipeline(
    TfidfVectorizer(max_df=.3, min_df=5),
    TruncatedSVD(n_components=200,),
    StandardScaler(),
    LogisticRegression(random_state=17, n_jobs=-1, C=1, max_iter=1000)
)
model_s = clone(model_r)
model_r

In [57]:
model_r.fit(X_train_r, y_train_r)


In [67]:
preds_r = model_r.predict(X_test_r)
probs_r = model_r.predict_proba(X_test_r)

preds_rs = model_r.predict(X_train_s)

In [59]:
model_s.fit(X_train_s, y_train_s)

In [66]:
preds_s = model_s.predict(X_test_s)
probs_s = model_s.predict_proba(X_test_s)
preds_sr = model_s.predict(X_train_r)

In [61]:
from sklearn.metrics import classification_report, f1_score

In [62]:
print(classification_report(y_test_r, preds_r))

              precision    recall  f1-score   support

          -1       0.69      0.66      0.68      8993
           0       0.57      0.63      0.60      8974
           1       0.82      0.77      0.79      8991

    accuracy                           0.69     26958
   macro avg       0.69      0.69      0.69     26958
weighted avg       0.69      0.69      0.69     26958



In [63]:
print(classification_report(y_test_s, preds_s))

              precision    recall  f1-score   support

          -1       0.58      0.23      0.33      1711
           0       0.88      0.98      0.93     14983
           1       0.39      0.03      0.05       700

    accuracy                           0.87     17394
   macro avg       0.62      0.41      0.44     17394
weighted avg       0.83      0.87      0.83     17394



In [68]:
print(classification_report(y_train_s, preds_rs))

              precision    recall  f1-score   support

          -1       0.23      0.31      0.27      3991
           0       0.89      0.79      0.84     34960
           1       0.08      0.20      0.12      1634

    accuracy                           0.72     40585
   macro avg       0.40      0.44      0.41     40585
weighted avg       0.79      0.72      0.75     40585



In [69]:
print(classification_report(y_train_r, preds_sr))

              precision    recall  f1-score   support

          -1       0.57      0.24      0.34     20984
           0       0.34      0.87      0.49     20937
           1       0.95      0.01      0.01     20979

    accuracy                           0.37     62900
   macro avg       0.62      0.37      0.28     62900
weighted avg       0.62      0.37      0.28     62900



In [70]:
reviews = pd.read_csv("../data/stg/reviews/moscow_restoraunts.csv")

In [71]:
from src_rest.transformers.utils import clear_texts

In [72]:
reviews_sentence = (
    reviews.assign(review=lambda x: x.review.str.split('.'))
    .explode("review"))
reviews_sentence["review_norm"] = clear_texts(reviews_sentence.review)

  return texts.str.lower().str.replace(PATTERN, "")


In [73]:
scoring_r = pd.DataFrame(
    model_r.predict_proba(reviews_sentence.review_norm),
    columns=['negative', 'neutral', 'positive'],
)

scoring_r["sentiment"] = (
    model_r.predict(reviews_sentence.review_norm)
)

scoring_s = pd.DataFrame(
    model_s.predict_proba(reviews_sentence.review_norm),
    columns=['negative', 'neutral', 'positive'],
)

scoring_s["sentiment"] = (
    model_s.predict(reviews_sentence.review_norm)
)

In [91]:
scoring_r.sentiment.value_counts()

 0    24438
-1     5276
 1     2785
Name: sentiment, dtype: int64

In [92]:
scoring_s.sentiment.value_counts()

 0    30744
-1     1733
 1       22
Name: sentiment, dtype: int64

In [79]:
result = reviews_sentence.join(scoring_r.add_suffix("_rureviews")).join(scoring_s.add_suffix("_senteval2016"))

In [82]:
result.sample(20, random_state=17).to_excel("reviews_sample.xlsx")

In [None]:
# r 0.33 error
# s 0.25 error

In [85]:
# 60% по негативным отзывам!
# по позитивным ничего не можем сделать

result.loc[
    result.sentiment_senteval2016.isin([1, -1])
].sample(20, random_state=17).to_excel("reviews_senteval2016.xlsx")

In [87]:
# 45% по негативным отзывам
# 44% по позитивным отзывам

result.loc[
    result.sentiment_rureviews.isin([1, -1])
].sample(20, random_state=17).to_excel("reviews_rureviews.xlsx")

In [89]:
4 / 9

0.4444444444444444

In [90]:
12 / 20

0.6