In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [2]:
df = pd.read_csv("data/SMS.tsv",sep='\t')
df.sample(5)

Unnamed: 0,class,text
1535,ham,Remember on that day..
1448,ham,Don‘t give a flying monkeys wot they think and...
2969,ham,"Mostly sports type..lyk footbl,crckt.."
2997,ham,They released vday shirts and when u put it on...
2028,ham,No got new job at bar in airport on satsgettin...


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   class   5572 non-null   object
 1   text    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


## Встроенный метод

In [8]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text'])
features = vectorizer.get_feature_names_out()
X = X.toarray()
y =  df['class'].apply(lambda x: int(x =='spam'))
y

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: class, Length: 5572, dtype: int64

In [9]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

svm = SVC(gamma='auto', kernel='linear')
svm.fit(x_train, y_train)
coef = svm.coef_
coef

array([[ 0.59654183,  0.69805269, -0.02214067, ..., -0.05848638,
         0.22478353,  0.        ]])

In [10]:
coef.shape

(1, 8713)

In [11]:
idx = [x for _, x in sorted(zip(coef[0], range(0, coef.shape[1])))]
idx[-5:]

[352, 2067, 8596, 8016, 7986]

In [12]:
words = [features[i] for i in idx[-30:]]
words

['10p',
 'services',
 'co',
 'mob',
 'com',
 'urgent',
 'voicemail',
 'chat',
 'won',
 'dating',
 'prize',
 'order',
 '88066',
 'bid',
 '18',
 'tones',
 'ac',
 'call',
 '50',
 'stop',
 'text',
 'service',
 'mobile',
 'reply',
 'ringtone',
 '150p',
 'claim',
 'www',
 'uk',
 'txt']

## Метод фильтра

In [13]:
df1 = pd.DataFrame(x_train)

df1 = df1.assign(y=y_train)

corr = df1.corr(method='pearson')
print(corr.shape)

(8714, 8714)


In [21]:
corr.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8704,8705,8706,8707,8708,8709,8710,8711,8712,y
0,1.0,-0.002847,-0.000633,-0.000896,-0.000633,-0.000633,-0.000633,-0.000633,,-0.001552,...,-0.000633,-0.000896,-0.000633,-0.000633,-0.000633,-0.000633,-0.000633,-0.000633,,0.015698
1,-0.002847,1.0,-0.001009,-0.001427,-0.001009,-0.001009,-0.001009,-0.001009,,0.457104,...,-0.001009,-0.001426,-0.001009,-0.001009,-0.001009,-0.001009,-0.001009,-0.001009,,0.007098
2,-0.000633,-0.001009,1.0,-0.000317,-0.000224,-0.000224,-0.000224,-0.000224,,-0.00055,...,-0.000224,-0.000317,-0.000224,-0.000224,-0.000224,-0.000224,-0.000224,-0.000224,,-0.006607
3,-0.000896,-0.001427,-0.000317,1.0,-0.000317,-0.000317,-0.000317,-0.000317,,-0.000777,...,-0.000317,-0.000449,-0.000317,-0.000317,-0.000317,-0.000317,-0.000317,-0.000317,,-0.009344
4,-0.000633,-0.001009,-0.000224,-0.000317,1.0,-0.000224,-0.000224,-0.000224,,-0.00055,...,-0.000224,-0.000317,-0.000224,-0.000224,-0.000224,-0.000224,-0.000224,-0.000224,,0.042418


In [38]:
cor_target = np.abs(corr["y"])
relevant_features = corr[cor_target>0.05]
idx_filter = list(relevant_features.index)
words_filter = [features[i] for i in idx_filter[:-1]]
words_filter

['accordingly',
 'brand',
 'everyone',
 'express',
 'face',
 'fails',
 'fish',
 'greet',
 'her',
 'irritates',
 'jerry',
 'lush',
 'rub',
 'sell',
 'shore',
 'site',
 'songs',
 'staff',
 'thank',
 'though',
 'tickets',
 'wee',
 'weight']

## Библиотечный метод

In [39]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
select = SelectKBest(chi2, k=30)
select.fit_transform(x_train, y_train)
idx_select = select.get_support(indices=True)
words_select = [features[i] for i in idx_select]
words_select

['100',
 '1000',
 '150p',
 '16',
 '18',
 '50',
 '500',
 'awarded',
 'call',
 'cash',
 'claim',
 'co',
 'contact',
 'cs',
 'free',
 'guaranteed',
 'mobile',
 'nokia',
 'prize',
 'reply',
 'ringtone',
 'service',
 'stop',
 'tone',
 'txt',
 'uk',
 'urgent',
 'win',
 'won',
 'www']