In [16]:
import pandas as pd
import numpy as np
import json
import re

from zipfile import ZipFile
from tqdm import tqdm_notebook, tqdm
from nltk import word_tokenize, sent_tokenize
from ast import literal_eval
from collections import defaultdict
from gensim.models import KeyedVectors
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.metrics import classification_report

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB

In [17]:
zfile = ZipFile("processed_texts.zip")

In [18]:
processed_texts = []

for file in tqdm_notebook(zfile.filelist, total=len(zfile.filelist)):
    if file.filename.endswith(".txt"):
        with zfile.open(
            file.filename
        ) as inp:
            text = inp.read()
        with zfile.open(
            file.filename.replace(".txt",".json")
        ) as inp:
            meta = json.load(inp)
        processed_texts.append((file.filename, text, meta))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for file in tqdm_notebook(zfile.filelist, total=len(zfile.filelist)):


  0%|          | 0/38159 [00:00<?, ?it/s]

Посчитаем среднюю длину (с 95% доверительным интервалом) предложения в processed_texts и среднее количество ошибок на предложение:

In [19]:
processed_sents = [
    sent for file, text, meta in tqdm_notebook(processed_texts)
    for sent in sent_tokenize(text.decode('utf8'))
]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  sent for file, text, meta in tqdm_notebook(processed_texts)


  0%|          | 0/19079 [00:00<?, ?it/s]

In [20]:
processed_lens = [len(sent) for sent in processed_sents]
processed_err_counts = [sent.count("<<") for sent in processed_sents]

In [21]:
pd.DataFrame(processed_lens).describe()

Unnamed: 0,0
count,231029.0
mean,165.681066
std,92.759507
min,1.0
25%,100.0
50%,148.0
75%,211.0
max,1119.0


In [22]:
meta = pd.DataFrame(
    [meta for name, text, meta in processed_texts]
)
meta

Unnamed: 0,text_type,ielts,CEFR_level,task_id,ann_checked,work_type,filename,folder
0,,TRUE,,O18,0,exam,2020_MLa_2297_2,downloaded_2022_05_03_13_27_13286380/exam/Exam...
1,,TRUE,,O18,0,exam,2020_MLa_2744_2,downloaded_2022_05_03_13_27_13286380/exam/Exam...
2,,TRUE,,G15,0,exam,2020_MLa_3271_1,downloaded_2022_05_03_13_27_13286380/exam/Exam...
3,,TRUE,C1,O19,0,exam,2017_EGe_12_2,downloaded_2022_05_03_13_27_13286380/exam/Old_...
4,,TRUE,,O30,0,exam,2020_MLa_5066_2,downloaded_2022_05_03_13_27_13286380/exam/Exam...
...,...,...,...,...,...,...,...,...
19074,,TRUE,,G27,0,exam,2020_MLa_5087_1,downloaded_2022_05_03_13_27_13286380/exam/Exam...
19075,,TRUE,C1,G27,0,exam,2019_ABu_147_1,downloaded_2022_05_03_13_27_13286380/exam/Exam...
19076,,TRUE,,G20,0,exam,2020_MLa_2224_1,downloaded_2022_05_03_13_27_13286380/exam/Exam...
19077,,TRUE,B1+,G14,0,exam,2016_EKu_166_1,downloaded_2022_05_03_13_27_13286380/exam/Old_...


In [23]:
meta["CEFR_level"].value_counts()

       8443
B1+    1875
B2+    1010
B1      652
C1      501
B2      465
C1+     156
B1-     138
A2       11
Name: CEFR_level, dtype: int64

In [24]:
meta["task_id"].value_counts()

G20    1186
O18    1133
G13    1056
G24    1031
O16     988
       ... 
G01      21
O08      20
G09      15
G23      13
0         2
Name: task_id, Length: 62, dtype: int64

In [25]:
meta["ann_checked"].value_counts()

0    12678
0     5993
1      196
Name: ann_checked, dtype: int64

In [26]:
list(re.finditer(
    "#DELETE#([0-9]+)#",
    processed_texts[0][1].decode('utf-8')
))[0].span()[0]

630

In [27]:
help(re)

Help on module re:

NAME
    re - Support for regular expressions (RE).

MODULE REFERENCE
    https://docs.python.org/3.8/library/re
    
    The following documentation is automatically generated from the Python
    source files.  It may be incomplete, incorrect or include features that
    are considered implementation detail and may vary between Python
    implementations.  When in doubt, consult the module reference at the
    location listed above.

DESCRIPTION
    This module provides regular expression matching operations similar to
    those found in Perl.  It supports both 8-bit and Unicode strings; both
    the pattern and the strings being processed can contain null bytes and
    characters outside the US ASCII range.
    
    Regular expressions can contain both special and ordinary characters.
    Most ordinary characters, like "A", "a", or "0", are the simplest
    regular expressions; they simply match themselves.  You can
    concatenate ordinary characters, so last mat

In [28]:
def remove_deletes(s: str):
    deletes = re.finditer("#DELETE#([0-9]+)#", s)
    if deletes:
        s1 =  ''
        prev_idx = 0
        for match in sorted(list(deletes), key=lambda x: x.span()[0]):
            s1 += s[prev_idx:match.span()[0]]
            prev_idx = match.span()[1] + int(match.group(1))
        s1 += s[prev_idx:len(s)]
        return s1
    else:
        return s

def sent_tokenize_function(s: str):
    s = remove_deletes(s)
    sents = []
    sent = ''
    escaped = False
    capital = False
    prev_sym = ''
    for sym in s:
        sent += sym
        if sym == '<' and prev_sym == '<':
            escaped = True
        elif sym == '>' and prev_sym == '>':
            escaped = False
        elif sym in '?!.':
            if not (escaped or capital):
                sents.append(sent.strip())
                sent = ''
            else:
                pass
        elif capital:
            capital = False
        elif sym.isupper():
            capital = True
        prev_sym = sym
    sents.append(sent.strip())
    return sents

In [29]:
processed_texts = [
    (
        name, sent_tokenize_function(text.decode("utf-8")), meta
    ) for name, text, meta in processed_texts
]

In [30]:
processed_texts[0]

('processed_texts/15753.txt',
 ['<<Nowadays,**T1**punct**None**8**Nowadays>> we can not imagine any competitive product without <<PR-compain**T2**Articles**None**14**the PR-campain>>.',
  'But some businesses are trying not to show dangerous features of their products which can <<have a**T4**vocab**None**5**cause>> dramatic effect on customers and lead to health problems.',
  'In my point of view, it depends on the kind of <<product**T5**Articles**None**11**the product>> and it does not have to be <<immediately**T6**Spelling**None**10**immediatly>> banned and not advertised.',
  'There are a great number of different goods that can lead to diseases and it is impossible  to live without <<them nowadays**T8**vocab**None**4**them>>.',
  'To start with, different <<kinds**T9**Noun_number**None**4**kind>> of vehicles produce <<a variety**T10**Articles**None**11**the variety>> of gases that lead to death but we can not travel without them.',
  'For example, it is evident that cars produce ga

Датасет:

Слово - Папка - Filename - Предложение - Предложение с маской на месте слова - CEFR Level - Употреблено ли вместо этого слово какое-то другое ошибочно (ошибки типа lex_item_choice)

In [32]:
dataset_variants = pd.read_csv(
    "dataset_lexics_final3.csv",
    sep=';',
    index_col="Unnamed: 0"
)
dataset_variantas = dataset_variants.loc[dataset_variants["Delete"]!=1.0]

In [33]:
dataset_variants

Unnamed: 0_level_0,sent_id,target,variant,correction,masked_sent,variant_count,correction_count,error_type,target_true,Delete,File,Folder,Revisited1,Filename
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1167,88.0,1.0,understandings,perceptions,"To start with, happiness is a feeling of comf...",1,1,lex_item_choice,1.0,0.0,2014_EZa_13_2,exam/Exam2014,1.0,exam/Exam2014/2014_EZa_13_2
1173,93.0,1.0,strictly,absolutely,I know that she was healthy and that her birt...,2,5,lex_item_choice,1.0,0.0,2014_EZa_13_2,exam/Exam2014,1.0,exam/Exam2014/2014_EZa_13_2
1174,93.0,0.0,completely,absolutely,I know that she was healthy and that her birt...,2,5,lex_item_choice,0.0,0.0,2014_EZa_13_2,exam/Exam2014,1.0,exam/Exam2014/2014_EZa_13_2
1187,93.0,0.0,definitely,absolutely,I know that she was healthy and that her birt...,1,5,lex_item_choice,1.0,0.0,2014_EZa_13_2,exam/Exam2014,1.0,exam/Exam2014/2014_EZa_13_2
2051,146.0,1.0,needs,requires,It helps to use all resources more effectivel...,2,3,lex_item_choice,1.0,1.0,2017_NMya_85_2,exam/Exam2017/NMya_1-108,1.0,exam/Exam2017/NMya_1-108/2017_NMya_85_2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3107289,167646.0,0.0,appear,result,"Comparing the EU and Latin America, the EU ha...",1,3,lex_item_choice,,0.0,2017_ABl_20_1,exam/Exam2017/ABl,,exam/Exam2017/ABl/2017_ABl_20_1
3107290,167646.0,0.0,time,result,"Comparing the EU and Latin America, the EU ha...",1,3,lex_item_choice,,0.0,2017_ABl_20_1,exam/Exam2017/ABl,,exam/Exam2017/ABl/2017_ABl_20_1
3107597,167664.0,0.0,grow,raise,"Secondly, knowing that they have a huge socia...",2,35,lex_item_choice,,0.0,2020_MLa_2063_2,exam/Exam2020/Task_3_Essays_1897_2839,,exam/Exam2020/Task_3_Essays_1897_2839/2020_MLa...
3107600,167664.0,0.0,improve,raise,"Secondly, knowing that they have a huge socia...",1,35,lex_item_choice,,0.0,2020_MLa_2063_2,exam/Exam2020/Task_3_Essays_1897_2839,,exam/Exam2020/Task_3_Essays_1897_2839/2020_MLa...


In [34]:
words = dataset_variants["correction"].unique()

In [35]:
dataset_out = []
processed_texts = [
    (text_id, list(enumerate(text)), meta) for text_id, text, meta in processed_texts
]

In [36]:
for word in tqdm_notebook(words, total=len(words)):
    for text_id, text, meta in processed_texts:
        for sent_id, sent in text:
            pattern = re.compile(
                f"<<{word}\*\*(T[0-9]+)\*\*lex_item_choice.*?>>",
                re.DOTALL
            )
            match = re.search(pattern, sent)
            if match:
                dataset_out.append({
                    "word": word,
                    "index": match.group(1),
                    "folder": meta.get("folder"),
                    "fielname": meta.get("filename"),
                    "sent": sent,
                    "sent_id": sent_id,
                    "CEFR_level": meta.get("CEFR_level"),
                    "target": 1
                })
            else:
                sent_clear = re.sub(
                    "<<.*?>>",
                    '',
                    sent
                )
                match = re.search(f"(?<!\w){word}(?!\w)", sent_clear)
                if match:
                    dataset_out.append({
                    "word": word,
                    "span_clear": match.span(),
                    "folder": meta.get("folder"),
                    "fielname": meta.get("filename"),
                    "sent": sent,
                    "sent_id": sent_id,
                    "CEFR_level": meta.get("CEFR_level"),
                    "target": 0
                })

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for word in tqdm_notebook(words, total=len(words)):


  0%|          | 0/1032 [00:00<?, ?it/s]

In [37]:
df_out = pd.DataFrame(dataset_out)

In [38]:
df_out["target"].value_counts()

0    1034829
1       6120
Name: target, dtype: int64

In [39]:
df_out["target"].value_counts()/len(df_out)

0    0.994121
1    0.005879
Name: target, dtype: float64

In [40]:
df_out

Unnamed: 0,word,index,folder,fielname,sent,sent_id,CEFR_level,target,span_clear
0,perceptions,T4,downloaded_2022_05_03_13_27_13286380/exam/Old_...,2014_EZa_13_2,"<<However,**T3**Punctuation**None**7**However>...",3,B2+,1,
1,absolutely,,downloaded_2022_05_03_13_27_13286380/exam/Exam...,2020_MLa_5542_2,"First and foremost, the ideas of <<capitalism,...",4,,0,"(139, 149)"
2,absolutely,,downloaded_2022_05_03_13_27_13286380/exam/Old_...,2014_DZu_31_2,"Moreover, I can say with confidence that durin...",9,B1+,0,"(78, 88)"
3,absolutely,,downloaded_2022_05_03_13_27_13286380/exam/Old_...,2014_DZu_31_2,I am absolutely sure that people we meet in ou...,13,B1+,0,"(5, 15)"
4,absolutely,,downloaded_2022_05_03_13_27_13286380/exam/Exam...,2020_MLa_5234_2,"On the other hand, pharmaceutical companies, d...",6,,0,"(139, 149)"
...,...,...,...,...,...,...,...,...,...
1040944,cruel,,downloaded_2022_05_03_13_27_13286380/exam/Old_...,2016_EKu_125_2,"When people hear about a cruel crime, everybod...",0,B1+,0,"(25, 30)"
1040945,cruel,,downloaded_2022_05_03_13_27_13286380/exam/Old_...,2016_EKu_125_2,"As for me, I find long prison sentences too cr...",10,B1+,0,"(44, 49)"
1040946,cruel,,downloaded_2022_05_03_13_27_13286380/exam/Exam...,2020_MLa_5833_2,"That is why, <<nowadays,**T28**punct**None**7*...",8,,0,"(50, 55)"
1040947,cruel,,downloaded_2022_05_03_13_27_13286380/exam/Old_...,2017_EGe_83_2,"Moreover, children <<play**T22**lex_item_choic...",8,,0,"(55, 60)"


In [41]:
df_out = df_out[~df_out["CEFR_level"].isna()]

In [42]:
df_out["CEFR_level"].value_counts()

       460341
B1+     87963
B2+     59654
B1      33785
C1      29259
B2      25837
C1+     10997
B1-      5878
A2        335
Name: CEFR_level, dtype: int64

In [43]:
df_out = df_out.loc[df_out["CEFR_level"]!='']

In [44]:
df_out["target"].value_counts()/len(df_out)

0    0.984384
1    0.015616
Name: target, dtype: float64

In [45]:
df_out["CEFR_level"].value_counts()/len(df_out)

B1+    0.346710
B2+    0.235129
B1     0.133165
C1     0.115325
B2     0.101838
C1+    0.043345
B1-    0.023168
A2     0.001320
Name: CEFR_level, dtype: float64

In [46]:
df_out = df_out.loc[
    df_out["CEFR_level"].apply(
        lambda x: x in ["B1","B1+","B2","B2+","C1","C1+"]
    )
]

In [47]:
df_out

Unnamed: 0,word,index,folder,fielname,sent,sent_id,CEFR_level,target,span_clear
0,perceptions,T4,downloaded_2022_05_03_13_27_13286380/exam/Old_...,2014_EZa_13_2,"<<However,**T3**Punctuation**None**7**However>...",3,B2+,1,
2,absolutely,,downloaded_2022_05_03_13_27_13286380/exam/Old_...,2014_DZu_31_2,"Moreover, I can say with confidence that durin...",9,B1+,0,"(78, 88)"
3,absolutely,,downloaded_2022_05_03_13_27_13286380/exam/Old_...,2014_DZu_31_2,I am absolutely sure that people we meet in ou...,13,B1+,0,"(5, 15)"
11,absolutely,,downloaded_2022_05_03_13_27_13286380/exam/Old_...,2017_MPa_62_2,"To conclude, I firmly believe that there are c...",11,C1,0,"(76, 86)"
12,absolutely,,downloaded_2022_05_03_13_27_13286380/exam/Old_...,2014_ASt_35_2,"Summing up, I absolutely do not agree that <<...",14,B2+,0,"(15, 25)"
...,...,...,...,...,...,...,...,...,...
1040939,cruel,,downloaded_2022_05_03_13_27_13286380/exam/Old_...,2017_KTR_1_2,The second opinion is that <<cheating**T29**Pa...,6,B2,0,"(54, 59)"
1040940,cruel,,downloaded_2022_05_03_13_27_13286380/exam/Old_...,2017_EGe_25_2,The world is a cruel place and it have never h...,3,B1,0,"(15, 20)"
1040942,cruel,,downloaded_2022_05_03_13_27_13286380/exam/Old_...,2016_JSl_193_2,"In conclusion, I'd like to say that law breake...",15,B1,0,"(78, 83)"
1040944,cruel,,downloaded_2022_05_03_13_27_13286380/exam/Old_...,2016_EKu_125_2,"When people hear about a cruel crime, everybod...",0,B1+,0,"(25, 30)"


In [48]:
df_out["sent_clear"] = df_out["sent"].apply(
    lambda x: re.sub(
        "<<(.*?)\*\*.*?\*\*.*?\*\*.*?\*\*.*?\*\*.*?>>",
        r"\1",
        x
    )
)

In [49]:
word2vec = KeyedVectors.load_word2vec_format(
    "gensim_models/skipgram_wikipedia_no_lemma/model.txt"
)

In [50]:
tqdm.pandas()

In [51]:
df_out["word_vector"] = df_out["word"].apply(
    lambda x: word2vec[x]
)

In [52]:
df_out

Unnamed: 0,word,index,folder,fielname,sent,sent_id,CEFR_level,target,span_clear,sent_clear,word_vector
0,perceptions,T4,downloaded_2022_05_03_13_27_13286380/exam/Old_...,2014_EZa_13_2,"<<However,**T3**Punctuation**None**7**However>...",3,B2+,1,,"However, all people are different and they hav...","[0.3426203, 0.22680311, 0.14678335, 0.01468511..."
2,absolutely,,downloaded_2022_05_03_13_27_13286380/exam/Old_...,2014_DZu_31_2,"Moreover, I can say with confidence that durin...",9,B1+,0,"(78, 88)","Moreover, I can say with confidence that durin...","[-0.19043966, 0.28531945, -0.024855996, -0.191..."
3,absolutely,,downloaded_2022_05_03_13_27_13286380/exam/Old_...,2014_DZu_31_2,I am absolutely sure that people we meet in ou...,13,B1+,0,"(5, 15)",I am absolutely sure that people we meet in ou...,"[-0.19043966, 0.28531945, -0.024855996, -0.191..."
11,absolutely,,downloaded_2022_05_03_13_27_13286380/exam/Old_...,2017_MPa_62_2,"To conclude, I firmly believe that there are c...",11,C1,0,"(76, 86)","To conclude, I firmly believe that there are c...","[-0.19043966, 0.28531945, -0.024855996, -0.191..."
12,absolutely,,downloaded_2022_05_03_13_27_13286380/exam/Old_...,2014_ASt_35_2,"Summing up, I absolutely do not agree that <<...",14,B2+,0,"(15, 25)","Summing up, I absolutely do not agree that ou...","[-0.19043966, 0.28531945, -0.024855996, -0.191..."
...,...,...,...,...,...,...,...,...,...,...,...
1040939,cruel,,downloaded_2022_05_03_13_27_13286380/exam/Old_...,2017_KTR_1_2,The second opinion is that <<cheating**T29**Pa...,6,B2,0,"(54, 59)",The second opinion is that cheating in profess...,"[0.19856142, 0.5089491, -0.1001941, 0.40547368..."
1040940,cruel,,downloaded_2022_05_03_13_27_13286380/exam/Old_...,2017_EGe_25_2,The world is a cruel place and it have never h...,3,B1,0,"(15, 20)",The world is a cruel place and it have never h...,"[0.19856142, 0.5089491, -0.1001941, 0.40547368..."
1040942,cruel,,downloaded_2022_05_03_13_27_13286380/exam/Old_...,2016_JSl_193_2,"In conclusion, I'd like to say that law breake...",15,B1,0,"(78, 83)","In conclusion, I'd like to say that law breake...","[0.19856142, 0.5089491, -0.1001941, 0.40547368..."
1040944,cruel,,downloaded_2022_05_03_13_27_13286380/exam/Old_...,2016_EKu_125_2,"When people hear about a cruel crime, everybod...",0,B1+,0,"(25, 30)","When people hear about a cruel crime, everybod...","[0.19856142, 0.5089491, -0.1001941, 0.40547368..."


In [53]:
df_stats = []

for word in tqdm_notebook(df_out["word"].unique(), total=len(df_out["word"].unique())):
    df_stats.append({
        "word": word,
        "0": df_out.loc[df_out["word"]==word]["target"].value_counts().get(0),
        "1": df_out.loc[df_out["word"]==word]["target"].value_counts().get(1),
        "count": len(df_out.loc[df_out["word"]==word]["target"])
    })

df_stats = pd.DataFrame(df_stats)

df_stats["ratio"] = df_stats["1"]/df_stats["count"]

good_words = df_stats[df_stats["ratio"] > 0.05]["word"].tolist()

df_out = df_out.loc[
    df_out["word"].apply(lambda x: x in good_words)
]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for word in tqdm_notebook(df_out["word"].unique(), total=len(df_out["word"].unique())):


  0%|          | 0/1025 [00:00<?, ?it/s]

In [101]:
df_out.columns

Index(['word', 'index', 'folder', 'fielname', 'sent', 'sent_id', 'CEFR_level',
       'target', 'span_clear', 'sent_clear', 'word_vector'],
      dtype='object')

In [111]:
df_out["sent_uid"] = df_out.apply(
    lambda x: f"{x['folder']}/{x['fielname']}/{sent_id}",
    axis=1
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out["sent_uid"] = df_out.apply(


In [112]:
sent_id_train, sent_id_test = train_test_split(
    df_out.sent_uid.unique(),
    random_state=1138,
    test_size=0.2
)

In [113]:
index_train = df_out.loc[df_out["sent_uid"].isin(sent_id_train)].index
index_test = df_out.loc[df_out["sent_uid"].isin(sent_id_test)].index

In [114]:
len(index_train)

13365

In [115]:
len(index_test)

3329

In [121]:
len(sent_id_test)

843

In [122]:
tfidf = TfidfVectorizer().fit(df_out.loc[index_train]["sent_clear"])

In [123]:
len(df_out)

16694

In [124]:
df_out["target"].value_counts()/len(df_out)

0    0.913142
1    0.086858
Name: target, dtype: float64

In [125]:
tfidf_train = tfidf.transform(df_out.loc[index_train]["sent_clear"]).toarray()
tfidf_test = tfidf.transform(df_out.loc[index_test]["sent_clear"]).toarray()

In [126]:
df_stats.to_csv("df_out_error_prediction_value_counts.csv", sep=';')

In [127]:
w2v_train = np.array(df_out.loc[index_train]["word_vector"].tolist())
w2v_test = np.array(df_out.loc[index_test]["word_vector"].tolist())

In [128]:
enc = OneHotEncoder().fit(df_out[["CEFR_level"]])
cefr_train  = enc.transform(df_out.loc[index_train][["CEFR_level"]]).toarray()
cefr_test = enc.transform(df_out.loc[index_test][["CEFR_level"]]).toarray()

In [129]:
w2v_train.shape, tfidf_train.shape, cefr_train.shape

((13365, 300), (13365, 6865), (13365, 6))

In [130]:
X_train = np.hstack([w2v_train, tfidf_train, cefr_train])
X_test = np.hstack([w2v_test, tfidf_test, cefr_test])
y_train = df_out.loc[index_train]["target"]
y_test = df_out.loc[index_test]["target"]

In [132]:
X_train.shape, X_test.shape

((13365, 7171), (3329, 7171))

In [133]:
pd.Series(y_train).value_counts()

0    12206
1     1159
Name: target, dtype: int64

In [134]:
pd.Series(y_test).value_counts()

0    3038
1     291
Name: target, dtype: int64

In [135]:
from telegram.ext import Updater

In [138]:
classifiers = [
    KNeighborsClassifier(n_neighbors=1),
    KNeighborsClassifier(n_neighbors=2),
    KNeighborsClassifier(n_neighbors=3),
    KNeighborsClassifier(n_neighbors=4),
    KNeighborsClassifier(n_neighbors=5),
    LinearSVC(random_state=42),
    DecisionTreeClassifier(random_state=42),
    LogisticRegression(random_state=42),
    RidgeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    GaussianNB()
]

names = [type(clf).__name__ for clf in classifiers]

result_df = []

for name, clf in tqdm_notebook(zip(names, classifiers), total=len(names)):
    clf.fit(X_train, y_train)

    y_train_pred = np.round(clf.predict(X_train))
    y_test_pred = np.round(clf.predict(X_test))

    train_f1 = f1_score(y_train, y_train_pred)
    test_f1 = f1_score(y_test, y_test_pred)

    train_acc = accuracy_score(y_train, y_train_pred)
    test_acc = accuracy_score(y_test, y_test_pred)

    train_precision = precision_score(y_train, y_train_pred)
    test_precision = precision_score(y_test, y_test_pred)

    train_recall = recall_score(y_train, y_train_pred)
    test_recall = recall_score(y_test, y_test_pred)

    result_df.append({
        "name": name,
        "test_f1": test_f1,
        "train_f1": train_f1,
        "test_acc": test_acc,
        "train_acc": train_acc,
        "test_precision": test_precision,
        "train_precision": train_precision,
        "test_recall": test_recall,
        "train_recall": train_recall
    })

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for name, clf in tqdm_notebook(zip(names, classifiers), total=len(names)):


  0%|          | 0/13 [00:00<?, ?it/s]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [139]:
result_df = pd.DataFrame(result_df)
result_df

Unnamed: 0,name,test_f1,train_f1,test_acc,train_acc,test_precision,train_precision,test_recall,train_recall
0,KNeighborsClassifier,0.199081,1.0,0.842896,1.0,0.179558,1.0,0.223368,1.0
1,KNeighborsClassifier,0.130081,0.420981,0.903575,0.936401,0.307692,1.0,0.082474,0.266609
2,KNeighborsClassifier,0.164384,0.491633,0.890057,0.934082,0.244898,0.74216,0.123711,0.367558
3,KNeighborsClassifier,0.104348,0.226048,0.907179,0.922634,0.333333,0.853107,0.061856,0.130285
4,KNeighborsClassifier,0.115702,0.280193,0.903575,0.92196,0.291667,0.7,0.072165,0.175151
5,LinearSVC,0.089552,0.361851,0.908381,0.931912,0.340909,0.966292,0.051546,0.222606
6,DecisionTreeClassifier,0.192496,1.0,0.851307,1.0,0.18323,1.0,0.202749,1.0
7,LogisticRegression,0.032895,0.07597,0.911685,0.916274,0.384615,0.884615,0.017182,0.039689
8,RidgeClassifier,0.02623,0.160757,0.910784,0.920314,0.285714,0.927273,0.013746,0.088007
9,RandomForestClassifier,0.152439,0.999136,0.916491,0.99985,0.675676,1.0,0.085911,0.998274


In [140]:
pd.Series(y_test).value_counts() / len(y_test)

0    0.912586
1    0.087414
Name: target, dtype: float64

In [141]:
pd.Series(y_train).value_counts() / len(y_train)

0    0.913281
1    0.086719
Name: target, dtype: float64

In [142]:
len(df_out) == len(y_train) + len(y_test)

True

In [143]:
best_clf = DecisionTreeClassifier(random_state=42)
best_clf.fit(X_train, y_train)

DecisionTreeClassifier(random_state=42)

In [144]:
weights = pd.Series({
    i: best_clf.feature_importances_[i] for i in range(X_train.shape[1])
})

In [145]:
weights[weights!=0.0].sort_values()

1489    0.000156
3838    0.000157
39      0.000157
4266    0.000187
3363    0.000187
          ...   
969     0.006403
5614    0.007431
156     0.010923
3486    0.011788
6462    0.012896
Length: 851, dtype: float64

In [146]:
vocab = {val:key for key,val in tfidf.vocabulary_.items()}

In [147]:
vocab[6430]

'understandably'

In [148]:
processed_lens = pd.Series([len(word_tokenize(sent)) for sent in processed_sents])
processed_counts = pd.Series([sent.count("<<") for sent in processed_sents])

In [149]:
processed_lens.describe()

count    231029.000000
mean         48.529825
std          34.628109
min           1.000000
25%          22.000000
50%          40.000000
75%          65.000000
max         461.000000
dtype: float64

In [150]:
processed_counts.describe()

count    231029.000000
mean          1.354644
std           1.520121
min           0.000000
25%           0.000000
50%           1.000000
75%           2.000000
max          19.000000
dtype: float64

In [151]:
result_df.to_excel(
    "TableI.xlsx",
    float_format="%.4f"
)