In [1]:
# References:
# https://github.com/speakleash/speakleash
# https://github.com/speakleash/speakleash-postprocesor/blob/main/postprocessor/analyzer.py
#https://github.com/mglabska/speakleash_filters/blob/master/quality.py

import numpy as np
import pandas as pd
import pickle
import plotly.express as px

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, cohen_kappa_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.svm import SVC


In [2]:
with open("to_train.pkl","rb") as f:
    df0 = pickle.load(f)

In [3]:
df0['lang_code'] = LabelEncoder().fit_transform(df0['lang'])
df0['quality_code'] = LabelEncoder().fit_transform(df0['quality'])
df = df0.drop_duplicates(subset=['text'], ignore_index=True)

##### Wybór cech do modelu

In [11]:
df = df.select_dtypes(include=['int64','float64', 'int32','float32'])
for col in df.columns:
    print(col, df[col].var())

fig = px.bar(np.log(df.drop('quality_code', axis=1).var()))
fig.write_html(f"imgs/var_bar.html")

sentences 32372302.623521484
words 8551123140.282231
verbs 225290738.56488192
nouns 377346153.0335532
punctuations 268655708.1473618
symbols 2031830.2583450757
stopwords 1926608237.6859195
oovs 35954693.12028914
characters 312368892141.2147
avg_sentence_length 203.9210788118909
adverbs 41522394.905414954
adjectives 41167184.26653016
avg_word_length 2437.653466437213
noun_ratio 0.004411648991665757
verb_ratio 0.0022266927574530805
adj_ratio 0.001875892500723232
lexical_density 0.03776475911294878
gunning_fog 22.83365034460358
pos_x 1384252.749069247
pos_num 984827.8313246252
camel_case 37018.86700152373
capitalized_words 1901337.8186562927
lang_code 0.02264209092904807
quality_code 0.34048139999926536


In [12]:
fig = px.imshow(df.corr())
fig.write_html(f"imgs/corr_matrix.html")

##### SVC

In [6]:
X=df[['sentences', 'words', 'verbs', 'nouns', 'punctuations', 'characters',
       'symbols', 'stopwords', 'pos_x']].values

y = df['quality_code'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

clf = make_pipeline(MinMaxScaler(), SVC(gamma='scale', kernel='linear'))
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=True))
print(f'cohen_kappa: {cohen_kappa_score(y_test, y_pred)}')

              precision    recall  f1-score   support

           0       1.00      0.00      0.00      2070
           1       0.58      1.00      0.74      3505
           2       1.00      0.00      0.00       424

    accuracy                           0.58      5999
   macro avg       0.86      0.33      0.25      5999
weighted avg       0.76      0.58      0.43      5999

cohen_kappa: 0.0


##### Random Forest

In [7]:
X=df[['sentences', 'words', 'verbs', 'nouns', 'punctuations', 'characters',
       'symbols', 'stopwords', 'pos_x']].values

y = df['quality_code'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

clf2 = make_pipeline(MinMaxScaler(), RandomForestClassifier())
clf2.fit(X_train, y_train)

y_pred = clf2.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=True))
print(f'cohen_kappa: {cohen_kappa_score(y_test, y_pred)}')

              precision    recall  f1-score   support

           0       0.86      0.85      0.85      2070
           1       0.92      0.95      0.93      3505
           2       0.76      0.59      0.66       424

    accuracy                           0.89      5999
   macro avg       0.84      0.80      0.82      5999
weighted avg       0.89      0.89      0.89      5999

cohen_kappa: 0.7889740193539279
