# Загружаем данные

In [1]:
from pathlib import Path

data = []
for file in Path('.').glob('acllmdb/train/neg/*.txt'):
    if len(data) < 1500:
        with open(file, encoding='utf-8') as fp:
            data.append([fp.read(), 0])
            
for file in Path('.').glob('acllmdb/train/pos/*.txt'):
    if len(data) < 3000:
        with open(file, encoding='utf-8') as fp:
            data.append([fp.read(), 1])

In [2]:
len(data)

3000

In [3]:
import pandas as pd

df = pd.DataFrame(data, columns=['text', 'class'])

In [4]:
df['class'].value_counts()

0    1500
1    1500
Name: class, dtype: int64

# Предобработка

In [5]:
import nltk

In [6]:
#pip install contractions

In [7]:
import contractions
df['no_contract'] = df['text'].apply(lambda x: [contractions.fix(word) for word in x.split()])
df.head()

Unnamed: 0,text,class,no_contract
0,Story of a man who has unnatural feelings for ...,0,"[Story, of, a, man, who, has, unnatural, feeli..."
1,Airport '77 starts as a brand new luxury 747 p...,0,"[Airport, '77, starts, as, a, brand, new, luxu..."
2,This film lacked something I couldn't put my f...,0,"[This, film, lacked, something, I, could not, ..."
3,"Sorry everyone,,, I know this is supposed to b...",0,"[Sorry, everyone,,,, I, know, this, is, suppos..."
4,When I was little my parents took me along to ...,0,"[When, I, was, little, my, parents, took, me, ..."


In [8]:
import re
df['no_num'] = df['no_contract'].apply(lambda x: [re.sub(r'\d+', '', word) for word in x])
df.head()

Unnamed: 0,text,class,no_contract,no_num
0,Story of a man who has unnatural feelings for ...,0,"[Story, of, a, man, who, has, unnatural, feeli...","[Story, of, a, man, who, has, unnatural, feeli..."
1,Airport '77 starts as a brand new luxury 747 p...,0,"[Airport, '77, starts, as, a, brand, new, luxu...","[Airport, ', starts, as, a, brand, new, luxury..."
2,This film lacked something I couldn't put my f...,0,"[This, film, lacked, something, I, could not, ...","[This, film, lacked, something, I, could not, ..."
3,"Sorry everyone,,, I know this is supposed to b...",0,"[Sorry, everyone,,,, I, know, this, is, suppos...","[Sorry, everyone,,,, I, know, this, is, suppos..."
4,When I was little my parents took me along to ...,0,"[When, I, was, little, my, parents, took, me, ...","[When, I, was, little, my, parents, took, me, ..."


In [9]:
df['text_1'] = [' '.join(map(str, l)) for l in df['no_num']]
df.head()

Unnamed: 0,text,class,no_contract,no_num,text_1
0,Story of a man who has unnatural feelings for ...,0,"[Story, of, a, man, who, has, unnatural, feeli...","[Story, of, a, man, who, has, unnatural, feeli...",Story of a man who has unnatural feelings for ...
1,Airport '77 starts as a brand new luxury 747 p...,0,"[Airport, '77, starts, as, a, brand, new, luxu...","[Airport, ', starts, as, a, brand, new, luxury...",Airport ' starts as a brand new luxury plane ...
2,This film lacked something I couldn't put my f...,0,"[This, film, lacked, something, I, could not, ...","[This, film, lacked, something, I, could not, ...",This film lacked something I could not put my ...
3,"Sorry everyone,,, I know this is supposed to b...",0,"[Sorry, everyone,,,, I, know, this, is, suppos...","[Sorry, everyone,,,, I, know, this, is, suppos...","Sorry everyone,,, I know this is supposed to b..."
4,When I was little my parents took me along to ...,0,"[When, I, was, little, my, parents, took, me, ...","[When, I, was, little, my, parents, took, me, ...",When I was little my parents took me along to ...


In [10]:
from nltk.tokenize import word_tokenize
df['tokenized'] = df['text_1'].apply(word_tokenize)
df.head()

Unnamed: 0,text,class,no_contract,no_num,text_1,tokenized
0,Story of a man who has unnatural feelings for ...,0,"[Story, of, a, man, who, has, unnatural, feeli...","[Story, of, a, man, who, has, unnatural, feeli...",Story of a man who has unnatural feelings for ...,"[Story, of, a, man, who, has, unnatural, feeli..."
1,Airport '77 starts as a brand new luxury 747 p...,0,"[Airport, '77, starts, as, a, brand, new, luxu...","[Airport, ', starts, as, a, brand, new, luxury...",Airport ' starts as a brand new luxury plane ...,"[Airport, ', starts, as, a, brand, new, luxury..."
2,This film lacked something I couldn't put my f...,0,"[This, film, lacked, something, I, could not, ...","[This, film, lacked, something, I, could not, ...",This film lacked something I could not put my ...,"[This, film, lacked, something, I, could, not,..."
3,"Sorry everyone,,, I know this is supposed to b...",0,"[Sorry, everyone,,,, I, know, this, is, suppos...","[Sorry, everyone,,,, I, know, this, is, suppos...","Sorry everyone,,, I know this is supposed to b...","[Sorry, everyone, ,, ,, ,, I, know, this, is, ..."
4,When I was little my parents took me along to ...,0,"[When, I, was, little, my, parents, took, me, ...","[When, I, was, little, my, parents, took, me, ...",When I was little my parents took me along to ...,"[When, I, was, little, my, parents, took, me, ..."


In [11]:
df['lower'] = df['tokenized'].apply(lambda x: [word.lower() for word in x])
df.head()

Unnamed: 0,text,class,no_contract,no_num,text_1,tokenized,lower
0,Story of a man who has unnatural feelings for ...,0,"[Story, of, a, man, who, has, unnatural, feeli...","[Story, of, a, man, who, has, unnatural, feeli...",Story of a man who has unnatural feelings for ...,"[Story, of, a, man, who, has, unnatural, feeli...","[story, of, a, man, who, has, unnatural, feeli..."
1,Airport '77 starts as a brand new luxury 747 p...,0,"[Airport, '77, starts, as, a, brand, new, luxu...","[Airport, ', starts, as, a, brand, new, luxury...",Airport ' starts as a brand new luxury plane ...,"[Airport, ', starts, as, a, brand, new, luxury...","[airport, ', starts, as, a, brand, new, luxury..."
2,This film lacked something I couldn't put my f...,0,"[This, film, lacked, something, I, could not, ...","[This, film, lacked, something, I, could not, ...",This film lacked something I could not put my ...,"[This, film, lacked, something, I, could, not,...","[this, film, lacked, something, i, could, not,..."
3,"Sorry everyone,,, I know this is supposed to b...",0,"[Sorry, everyone,,,, I, know, this, is, suppos...","[Sorry, everyone,,,, I, know, this, is, suppos...","Sorry everyone,,, I know this is supposed to b...","[Sorry, everyone, ,, ,, ,, I, know, this, is, ...","[sorry, everyone, ,, ,, ,, i, know, this, is, ..."
4,When I was little my parents took me along to ...,0,"[When, I, was, little, my, parents, took, me, ...","[When, I, was, little, my, parents, took, me, ...",When I was little my parents took me along to ...,"[When, I, was, little, my, parents, took, me, ...","[when, i, was, little, my, parents, took, me, ..."


In [12]:
import string

punc = string.punctuation
df['no_punc'] = df['lower'].apply(lambda x: [word for word in x if word not in punc])
df.head()

Unnamed: 0,text,class,no_contract,no_num,text_1,tokenized,lower,no_punc
0,Story of a man who has unnatural feelings for ...,0,"[Story, of, a, man, who, has, unnatural, feeli...","[Story, of, a, man, who, has, unnatural, feeli...",Story of a man who has unnatural feelings for ...,"[Story, of, a, man, who, has, unnatural, feeli...","[story, of, a, man, who, has, unnatural, feeli...","[story, of, a, man, who, has, unnatural, feeli..."
1,Airport '77 starts as a brand new luxury 747 p...,0,"[Airport, '77, starts, as, a, brand, new, luxu...","[Airport, ', starts, as, a, brand, new, luxury...",Airport ' starts as a brand new luxury plane ...,"[Airport, ', starts, as, a, brand, new, luxury...","[airport, ', starts, as, a, brand, new, luxury...","[airport, starts, as, a, brand, new, luxury, p..."
2,This film lacked something I couldn't put my f...,0,"[This, film, lacked, something, I, could not, ...","[This, film, lacked, something, I, could not, ...",This film lacked something I could not put my ...,"[This, film, lacked, something, I, could, not,...","[this, film, lacked, something, i, could, not,...","[this, film, lacked, something, i, could, not,..."
3,"Sorry everyone,,, I know this is supposed to b...",0,"[Sorry, everyone,,,, I, know, this, is, suppos...","[Sorry, everyone,,,, I, know, this, is, suppos...","Sorry everyone,,, I know this is supposed to b...","[Sorry, everyone, ,, ,, ,, I, know, this, is, ...","[sorry, everyone, ,, ,, ,, i, know, this, is, ...","[sorry, everyone, i, know, this, is, supposed,..."
4,When I was little my parents took me along to ...,0,"[When, I, was, little, my, parents, took, me, ...","[When, I, was, little, my, parents, took, me, ...",When I was little my parents took me along to ...,"[When, I, was, little, my, parents, took, me, ...","[when, i, was, little, my, parents, took, me, ...","[when, i, was, little, my, parents, took, me, ..."


In [13]:
from nltk.corpus import stopwords, wordnet
stop_words = set(stopwords.words('english'))
df['stopwords_removed'] = df['no_punc'].apply(lambda x: [word for word in x if word not in stop_words])
df.head()


Unnamed: 0,text,class,no_contract,no_num,text_1,tokenized,lower,no_punc,stopwords_removed
0,Story of a man who has unnatural feelings for ...,0,"[Story, of, a, man, who, has, unnatural, feeli...","[Story, of, a, man, who, has, unnatural, feeli...",Story of a man who has unnatural feelings for ...,"[Story, of, a, man, who, has, unnatural, feeli...","[story, of, a, man, who, has, unnatural, feeli...","[story, of, a, man, who, has, unnatural, feeli...","[story, man, unnatural, feelings, pig, starts,..."
1,Airport '77 starts as a brand new luxury 747 p...,0,"[Airport, '77, starts, as, a, brand, new, luxu...","[Airport, ', starts, as, a, brand, new, luxury...",Airport ' starts as a brand new luxury plane ...,"[Airport, ', starts, as, a, brand, new, luxury...","[airport, ', starts, as, a, brand, new, luxury...","[airport, starts, as, a, brand, new, luxury, p...","[airport, starts, brand, new, luxury, plane, l..."
2,This film lacked something I couldn't put my f...,0,"[This, film, lacked, something, I, could not, ...","[This, film, lacked, something, I, could not, ...",This film lacked something I could not put my ...,"[This, film, lacked, something, I, could, not,...","[this, film, lacked, something, i, could, not,...","[this, film, lacked, something, i, could, not,...","[film, lacked, something, could, put, finger, ..."
3,"Sorry everyone,,, I know this is supposed to b...",0,"[Sorry, everyone,,,, I, know, this, is, suppos...","[Sorry, everyone,,,, I, know, this, is, suppos...","Sorry everyone,,, I know this is supposed to b...","[Sorry, everyone, ,, ,, ,, I, know, this, is, ...","[sorry, everyone, ,, ,, ,, i, know, this, is, ...","[sorry, everyone, i, know, this, is, supposed,...","[sorry, everyone, know, supposed, ``, art, '',..."
4,When I was little my parents took me along to ...,0,"[When, I, was, little, my, parents, took, me, ...","[When, I, was, little, my, parents, took, me, ...",When I was little my parents took me along to ...,"[When, I, was, little, my, parents, took, me, ...","[when, i, was, little, my, parents, took, me, ...","[when, i, was, little, my, parents, took, me, ...","[little, parents, took, along, theater, see, i..."


In [14]:
df['pos_tags'] = df['stopwords_removed'].apply(nltk.tag.pos_tag)
df.head()

Unnamed: 0,text,class,no_contract,no_num,text_1,tokenized,lower,no_punc,stopwords_removed,pos_tags
0,Story of a man who has unnatural feelings for ...,0,"[Story, of, a, man, who, has, unnatural, feeli...","[Story, of, a, man, who, has, unnatural, feeli...",Story of a man who has unnatural feelings for ...,"[Story, of, a, man, who, has, unnatural, feeli...","[story, of, a, man, who, has, unnatural, feeli...","[story, of, a, man, who, has, unnatural, feeli...","[story, man, unnatural, feelings, pig, starts,...","[(story, NN), (man, NN), (unnatural, JJ), (fee..."
1,Airport '77 starts as a brand new luxury 747 p...,0,"[Airport, '77, starts, as, a, brand, new, luxu...","[Airport, ', starts, as, a, brand, new, luxury...",Airport ' starts as a brand new luxury plane ...,"[Airport, ', starts, as, a, brand, new, luxury...","[airport, ', starts, as, a, brand, new, luxury...","[airport, starts, as, a, brand, new, luxury, p...","[airport, starts, brand, new, luxury, plane, l...","[(airport, NN), (starts, NNS), (brand, NN), (n..."
2,This film lacked something I couldn't put my f...,0,"[This, film, lacked, something, I, could not, ...","[This, film, lacked, something, I, could not, ...",This film lacked something I could not put my ...,"[This, film, lacked, something, I, could, not,...","[this, film, lacked, something, i, could, not,...","[this, film, lacked, something, i, could, not,...","[film, lacked, something, could, put, finger, ...","[(film, NN), (lacked, VBD), (something, NN), (..."
3,"Sorry everyone,,, I know this is supposed to b...",0,"[Sorry, everyone,,,, I, know, this, is, suppos...","[Sorry, everyone,,,, I, know, this, is, suppos...","Sorry everyone,,, I know this is supposed to b...","[Sorry, everyone, ,, ,, ,, I, know, this, is, ...","[sorry, everyone, ,, ,, ,, i, know, this, is, ...","[sorry, everyone, i, know, this, is, supposed,...","[sorry, everyone, know, supposed, ``, art, '',...","[(sorry, JJ), (everyone, NN), (know, VBP), (su..."
4,When I was little my parents took me along to ...,0,"[When, I, was, little, my, parents, took, me, ...","[When, I, was, little, my, parents, took, me, ...",When I was little my parents took me along to ...,"[When, I, was, little, my, parents, took, me, ...","[when, i, was, little, my, parents, took, me, ...","[when, i, was, little, my, parents, took, me, ...","[little, parents, took, along, theater, see, i...","[(little, JJ), (parents, NNS), (took, VBD), (a..."


In [15]:
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
df['wordnet_pos'] = df['pos_tags'].apply(lambda x: [(word, get_wordnet_pos(pos_tag)) for (word, pos_tag) in x])
df.head()

Unnamed: 0,text,class,no_contract,no_num,text_1,tokenized,lower,no_punc,stopwords_removed,pos_tags,wordnet_pos
0,Story of a man who has unnatural feelings for ...,0,"[Story, of, a, man, who, has, unnatural, feeli...","[Story, of, a, man, who, has, unnatural, feeli...",Story of a man who has unnatural feelings for ...,"[Story, of, a, man, who, has, unnatural, feeli...","[story, of, a, man, who, has, unnatural, feeli...","[story, of, a, man, who, has, unnatural, feeli...","[story, man, unnatural, feelings, pig, starts,...","[(story, NN), (man, NN), (unnatural, JJ), (fee...","[(story, n), (man, n), (unnatural, a), (feelin..."
1,Airport '77 starts as a brand new luxury 747 p...,0,"[Airport, '77, starts, as, a, brand, new, luxu...","[Airport, ', starts, as, a, brand, new, luxury...",Airport ' starts as a brand new luxury plane ...,"[Airport, ', starts, as, a, brand, new, luxury...","[airport, ', starts, as, a, brand, new, luxury...","[airport, starts, as, a, brand, new, luxury, p...","[airport, starts, brand, new, luxury, plane, l...","[(airport, NN), (starts, NNS), (brand, NN), (n...","[(airport, n), (starts, n), (brand, n), (new, ..."
2,This film lacked something I couldn't put my f...,0,"[This, film, lacked, something, I, could not, ...","[This, film, lacked, something, I, could not, ...",This film lacked something I could not put my ...,"[This, film, lacked, something, I, could, not,...","[this, film, lacked, something, i, could, not,...","[this, film, lacked, something, i, could, not,...","[film, lacked, something, could, put, finger, ...","[(film, NN), (lacked, VBD), (something, NN), (...","[(film, n), (lacked, v), (something, n), (coul..."
3,"Sorry everyone,,, I know this is supposed to b...",0,"[Sorry, everyone,,,, I, know, this, is, suppos...","[Sorry, everyone,,,, I, know, this, is, suppos...","Sorry everyone,,, I know this is supposed to b...","[Sorry, everyone, ,, ,, ,, I, know, this, is, ...","[sorry, everyone, ,, ,, ,, i, know, this, is, ...","[sorry, everyone, i, know, this, is, supposed,...","[sorry, everyone, know, supposed, ``, art, '',...","[(sorry, JJ), (everyone, NN), (know, VBP), (su...","[(sorry, a), (everyone, n), (know, v), (suppos..."
4,When I was little my parents took me along to ...,0,"[When, I, was, little, my, parents, took, me, ...","[When, I, was, little, my, parents, took, me, ...",When I was little my parents took me along to ...,"[When, I, was, little, my, parents, took, me, ...","[when, i, was, little, my, parents, took, me, ...","[when, i, was, little, my, parents, took, me, ...","[little, parents, took, along, theater, see, i...","[(little, JJ), (parents, NNS), (took, VBD), (a...","[(little, a), (parents, n), (took, v), (along,..."


In [16]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
df['lemmatized'] = df['wordnet_pos'].apply(lambda x: [wnl.lemmatize(word, tag) for word, tag in x])
df.head()

Unnamed: 0,text,class,no_contract,no_num,text_1,tokenized,lower,no_punc,stopwords_removed,pos_tags,wordnet_pos,lemmatized
0,Story of a man who has unnatural feelings for ...,0,"[Story, of, a, man, who, has, unnatural, feeli...","[Story, of, a, man, who, has, unnatural, feeli...",Story of a man who has unnatural feelings for ...,"[Story, of, a, man, who, has, unnatural, feeli...","[story, of, a, man, who, has, unnatural, feeli...","[story, of, a, man, who, has, unnatural, feeli...","[story, man, unnatural, feelings, pig, starts,...","[(story, NN), (man, NN), (unnatural, JJ), (fee...","[(story, n), (man, n), (unnatural, a), (feelin...","[story, man, unnatural, feeling, pig, start, o..."
1,Airport '77 starts as a brand new luxury 747 p...,0,"[Airport, '77, starts, as, a, brand, new, luxu...","[Airport, ', starts, as, a, brand, new, luxury...",Airport ' starts as a brand new luxury plane ...,"[Airport, ', starts, as, a, brand, new, luxury...","[airport, ', starts, as, a, brand, new, luxury...","[airport, starts, as, a, brand, new, luxury, p...","[airport, starts, brand, new, luxury, plane, l...","[(airport, NN), (starts, NNS), (brand, NN), (n...","[(airport, n), (starts, n), (brand, n), (new, ...","[airport, start, brand, new, luxury, plane, lo..."
2,This film lacked something I couldn't put my f...,0,"[This, film, lacked, something, I, could not, ...","[This, film, lacked, something, I, could not, ...",This film lacked something I could not put my ...,"[This, film, lacked, something, I, could, not,...","[this, film, lacked, something, i, could, not,...","[this, film, lacked, something, i, could, not,...","[film, lacked, something, could, put, finger, ...","[(film, NN), (lacked, VBD), (something, NN), (...","[(film, n), (lacked, v), (something, n), (coul...","[film, lack, something, could, put, finger, fi..."
3,"Sorry everyone,,, I know this is supposed to b...",0,"[Sorry, everyone,,,, I, know, this, is, suppos...","[Sorry, everyone,,,, I, know, this, is, suppos...","Sorry everyone,,, I know this is supposed to b...","[Sorry, everyone, ,, ,, ,, I, know, this, is, ...","[sorry, everyone, ,, ,, ,, i, know, this, is, ...","[sorry, everyone, i, know, this, is, supposed,...","[sorry, everyone, know, supposed, ``, art, '',...","[(sorry, JJ), (everyone, NN), (know, VBP), (su...","[(sorry, a), (everyone, n), (know, v), (suppos...","[sorry, everyone, know, suppose, ``, art, '', ..."
4,When I was little my parents took me along to ...,0,"[When, I, was, little, my, parents, took, me, ...","[When, I, was, little, my, parents, took, me, ...",When I was little my parents took me along to ...,"[When, I, was, little, my, parents, took, me, ...","[when, i, was, little, my, parents, took, me, ...","[when, i, was, little, my, parents, took, me, ...","[little, parents, took, along, theater, see, i...","[(little, JJ), (parents, NNS), (took, VBD), (a...","[(little, a), (parents, n), (took, v), (along,...","[little, parent, take, along, theater, see, in..."


In [17]:
df_1 = df[['lemmatized', 'class']]
df_1.rename(columns = {'lemmatized' : 'result'}, inplace = True)
df_1.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1.rename(columns = {'lemmatized' : 'result'}, inplace = True)


Unnamed: 0,result,class
0,"[story, man, unnatural, feeling, pig, start, o...",0
1,"[airport, start, brand, new, luxury, plane, lo...",0
2,"[film, lack, something, could, put, finger, fi...",0
3,"[sorry, everyone, know, suppose, ``, art, '', ...",0
4,"[little, parent, take, along, theater, see, in...",0


# Векторизация

In [18]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

vector_size = 30
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(df_1['result'])]
model = Doc2Vec(documents, vector_size=vector_size, window=2, min_count=1, workers=4)

In [19]:
model.infer_vector(df_1.loc[0]['result'])

array([-0.01113792, -0.08168204,  0.14825669, -0.19907323,  0.22033738,
        0.1897854 , -0.29619887, -0.37313688, -0.60614175,  0.30058157,
        0.234031  ,  0.3681515 ,  0.11756665, -0.27440417,  0.00369337,
        0.12249249,  0.06872474,  0.02202839, -0.38790444,  0.4311176 ,
       -0.3439105 , -0.3779389 , -0.11271041,  0.47643355,  0.03491683,
        0.367841  , -0.06587069, -0.01946771,  0.09618762, -0.76097053],
      dtype=float32)

In [20]:
df_1.head()

Unnamed: 0,result,class
0,"[story, man, unnatural, feeling, pig, start, o...",0
1,"[airport, start, brand, new, luxury, plane, lo...",0
2,"[film, lack, something, could, put, finger, fi...",0
3,"[sorry, everyone, know, suppose, ``, art, '', ...",0
4,"[little, parent, take, along, theater, see, in...",0


In [21]:
%%capture --no-display
for i in range(1, vector_size + 1):
    df_1[f'emb_{i}'] = df_1['result'].apply(lambda x: model.infer_vector(x)[i-1])
df_1.head()

Unnamed: 0,result,class,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,...,emb_21,emb_22,emb_23,emb_24,emb_25,emb_26,emb_27,emb_28,emb_29,emb_30
0,"[story, man, unnatural, feeling, pig, start, o...",0,0.052903,-0.042856,0.136128,-0.169464,0.282268,0.207394,-0.24308,-0.423726,...,-0.491658,-0.301945,-0.172599,0.464729,0.044957,0.386271,-0.0744,-0.041763,0.077998,-0.66741
1,"[airport, start, brand, new, luxury, plane, lo...",0,-0.550377,0.16432,-0.298253,-1.316703,0.256433,0.427372,-0.816421,-0.645806,...,-0.834922,-0.620754,-0.033492,1.157347,-0.56365,1.137599,-0.966359,0.122125,0.16821,-1.343877
2,"[film, lack, something, could, put, finger, fi...",0,-0.157742,-0.027752,0.055044,0.024517,0.08007,0.221612,-0.165902,-0.570111,...,-0.606625,-0.230382,-0.239302,0.321403,0.085499,0.240807,0.065992,-0.267528,0.140975,-0.731558
3,"[sorry, everyone, know, suppose, ``, art, '', ...",0,-0.101227,-0.232176,-0.04365,-0.27082,-0.05385,0.070679,-0.19792,-0.532273,...,-0.204319,-0.324138,0.098377,0.282462,-0.022211,0.253435,0.078741,0.063731,-0.058206,-0.786608
4,"[little, parent, take, along, theater, see, in...",0,0.230548,0.168775,0.040876,0.305132,0.62296,0.408655,-0.470982,-0.941381,...,-0.850418,-0.531263,-0.189062,0.520133,0.190786,1.179225,0.480946,-0.380185,0.082263,-1.842069


# Разделение на Train и Test

In [22]:
from sklearn.model_selection import train_test_split

X = df_1.drop(['result', 'class'], axis=1)
y = df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

# SVM

In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC 

param_grid = { 
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    }

svm_model = GridSearchCV(SVC(), param_grid, cv=5, n_jobs=-1)
svm_model.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=SVC(), n_jobs=-1,
             param_grid={'kernel': ['linear', 'poly', 'rbf', 'sigmoid']})

In [24]:
svm_model.best_estimator_

SVC()

In [25]:
from sklearn.metrics import classification_report, confusion_matrix

pred = svm_model.predict(X_test)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

[[208 108]
 [ 81 203]]
              precision    recall  f1-score   support

           0       0.72      0.66      0.69       316
           1       0.65      0.71      0.68       284

    accuracy                           0.69       600
   macro avg       0.69      0.69      0.68       600
weighted avg       0.69      0.69      0.69       600



# Linear Regression

In [26]:
from sklearn.linear_model import LinearRegression

lrm_model = LinearRegression()
lrm_model.fit(X_train, y_train)

pred = lrm_model.predict(X_test)


In [27]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, pred.round(), normalize=True)

0.69

# SGDRegressor и подбор параметров

In [28]:
from sklearn.linear_model import SGDRegressor
sgd = SGDRegressor(penalty='l1', alpha=0.05, max_iter=10000)

In [29]:
sgd.fit(X_train, y_train)

SGDRegressor(alpha=0.05, max_iter=10000, penalty='l1')

In [42]:
pred = sgd.predict(X_test)

accuracy_score(y_test, pred.round())

0.47333333333333333

In [31]:
from sklearn.model_selection import GridSearchCV
parameters = {'penalty': ['l1', 'l2'], 'alpha': [0.01, 0.1, 0.5, 1], 'max_iter': [1000, 5000, 10000]}
sgd_1 = SGDRegressor(eta0=0.02)
gs_model = GridSearchCV(sgd_1, parameters, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
gs_model.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=SGDRegressor(eta0=0.02), n_jobs=-1,
             param_grid={'alpha': [0.01, 0.1, 0.5, 1],
                         'max_iter': [1000, 5000, 10000],
                         'penalty': ['l1', 'l2']},
             scoring='neg_mean_squared_error')

In [32]:
gs_model.best_estimator_

SGDRegressor(alpha=0.01, eta0=0.02)

In [34]:
gs_model.best_score_

-0.20913161664736082

In [33]:
pred = gs_model.predict(X_test)

accuracy_score(y_test, pred.round())

0.69