In [None]:
import joblib
import lightgbm as lgb
import matplotlib as mpl
import matplotlib.pyplot as plt
import missingno as msno
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler

%matplotlib inline
pd.options.mode.chained_assignment = None
nltk.download('stopwords')
np.random.seed(42)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def build_new_model(df, rating, review, output_model, language = 'russian'):
  """Train a three-class sentiment model on review texts and save it with joblib.

  Ratings are mapped to labels: 1-2 -> -1, 3 -> 0, 4-6 -> 1 (anything else -> 0).
  Classes are balanced with random oversampling, texts are lower-cased and
  stripped of stop words, and a bigram bag-of-words + MaxAbsScaler +
  LogisticRegression pipeline is fitted on a 70/30 train/test split.

  The whole fitted pipeline (vectorizer, scaler and classifier) is written to
  ``output_model + ".pkl"``. Saving only the classifier would make the model
  unusable later, because the training vocabulary would be lost. The saved
  pipeline expects texts that were already lower-cased and stop-word filtered
  the same way as here.

  Parameters:
    df: DataFrame containing the rating and review columns.
    rating: name of the rating column.
    review: name of the review text column.
    output_model: path of the model file, without the ".pkl" suffix.
    language: NLTK stop-word language name.

  Returns:
    The cleaned, label-mapped, oversampled DataFrame with processed review texts.

  Raises:
    ValueError: if no usable rows remain after cleaning.
  """
  sw = set(stopwords.words(language))

  def process_string(string):
    """Lower-case the text and drop stop words (whitespace tokenisation)."""
    string = str(string).lower()
    return ' '.join(word for word in string.split() if word not in sw)

  df = df.dropna()

  # The original cast discarded its result, so string ratings were never
  # converted. Coerce to numbers and drop rows whose rating cannot be parsed.
  df[rating] = pd.to_numeric(df[rating], errors="coerce")
  df = df.dropna(subset=[rating])
  if df.empty:
    raise ValueError("No rows with a usable rating and review remain after cleaning")

  criteria = [df[rating].between(1, 2), df[rating] == 3, df[rating].between(4, 6)]
  values = [-1, 0, 1]
  df[rating] = np.select(criteria, values, 0)

  # NOTE(review): oversampling before the train/test split puts duplicated
  # rows into both sets, so the printed metrics are optimistic. Kept as is
  # because the oversampled frame is also this function's return value.
  oversample = RandomOverSampler()
  df, _ = oversample.fit_resample(df, df[rating])

  df[review] = df[review].map(process_string)

  trainX, testX, trainY, testY = train_test_split(
      df[review], df[rating], test_size=0.3, random_state=42)

  # One pipeline keeps the fitted vocabulary and scaling together with the
  # classifier, so the saved artifact can be applied to new texts directly.
  model = Pipeline([
      ("vectorizer", CountVectorizer(ngram_range=(2, 2))),
      ("scaler", MaxAbsScaler()),
      ("classifier", LogisticRegression()),
  ])
  model.fit(trainX, trainY)

  pred = model.predict(testX)
  print(classification_report(testY, pred))

  joblib.dump(model, output_model + ".pkl")

  return df

In [None]:
data = '/content/drive/MyDrive/Colab Notebooks/emlife.xlsx'
rating = "Рейтинг"
review = "Комментарий"
new_model_name = "new_model"

# Read the workbook from the single `data` path defined above and train with
# build_new_model, the training function defined in this notebook.
df = pd.read_excel(data)
df_res = build_new_model(df, rating, review, new_model_name)

              precision    recall  f1-score   support

          -1       0.97      0.39      0.55       967
           0       0.99      0.46      0.63       941
           1       0.45      0.99      0.62       926

    accuracy                           0.61      2834
   macro avg       0.80      0.61      0.60      2834
weighted avg       0.81      0.61      0.60      2834



In [None]:
def predict(new_model, df, review, output_file=None, language='russian'):
  """Apply a saved sentiment model to the review texts in a DataFrame.

  The texts are lower-cased and stripped of stop words, then passed to the
  loaded model. The model must be a fitted scikit-learn Pipeline that accepts
  raw text (vectorizer through classifier). Re-fitting a vectorizer here
  would build a different vocabulary and produce features the classifier was
  never trained on.

  Parameters:
    new_model: path to the joblib model file, with or without ".pkl".
    df: DataFrame holding the texts; it is not modified.
    review: name of the review text column.
    output_file: optional path; when given, the result is written as CSV
      (".csv" is appended if missing).
    language: NLTK stop-word language name; must match the one used in training.

  Returns:
    A copy of ``df`` with an added "prediction" column.

  Raises:
    ValueError: if the loaded object is not a text-accepting Pipeline.
  """
  sw = set(stopwords.words(language))

  def process_string(string):
    """Lower-case the text and drop stop words (whitespace tokenisation)."""
    string = str(string).lower()
    return ' '.join(word for word in string.split() if word not in sw)

  # The original `new_model[-4:-1] == ".pkl"` compared 3 characters with 4 and
  # was never true, so a name ending in ".pkl" got the suffix appended twice.
  model_path = new_model if new_model.endswith(".pkl") else new_model + ".pkl"

  # NOTE: joblib.load unpickles arbitrary objects; only load trusted files.
  # Passing the path lets joblib open and close the file itself.
  classify = joblib.load(model_path)

  if not hasattr(classify, "named_steps"):
    raise ValueError(
        "Model '" + model_path + "' is not a fitted Pipeline that accepts raw text; "
        "a bare classifier cannot be used because its training vocabulary is unknown.")

  texts = df[review].map(process_string)

  result = df.copy()
  # Estimators reject zero-sample input, so skip the call for an empty frame.
  result["prediction"] = classify.predict(texts) if len(texts) else []

  if output_file is not None:
    out_path = output_file if output_file.endswith(".csv") else output_file + ".csv"
    result.to_csv(out_path, index=False)

  return result

In [None]:
data = '/content/drive/MyDrive/Colab Notebooks/emlife.xlsx'
review = "Комментарий"
new_model_name = "new_model"
output_file = 'output'

# output_file was defined above but never passed, so the call did not match
# predict's four-parameter signature.
df = pd.read_excel(data)
df = predict(new_model_name, df, review, output_file)

ValueError: ignored