In [13]:
import numpy as np
import pandas as pd
from scipy.stats import chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, RFE, mutual_info_classif
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [14]:
df = pd.read_csv('SMS.tsv', sep='\t')
df.head(5)

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [15]:
df['type'] = pd.factorize(df['class'])[0]  # ham - 0, spam - 1
df = df.drop('class', axis=1)
df.head(5)

Unnamed: 0,text,type
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [16]:
y = df['type']

In [17]:
counted = CountVectorizer()
X = counted.fit_transform(df['text'])

In [18]:
X.shape

(5572, 8713)

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

In [20]:
counted.get_feature_names_out()

array(['00', '000', '000pes', ..., 'èn', 'ú1', '〨ud'], dtype=object)

In [21]:
feature_names_inline = counted.get_feature_names_out()


def print_best_features(features, values=None):
    if values is not None:
        for feature_num in features[:num_of_features]:
            print(f'{feature_names_inline[feature_num]}, {feature_num}, Value: {values[feature_num]}.')
    else:
        for feature_num in features[:num_of_features]:
            print(f'{feature_names_inline[feature_num]}, {feature_num}.')

In [22]:
num_of_features = 30

In [23]:
def gen_model(model):
    print('Встроенная')
    model.fit(X, y)
    values = model.feature_importances_
    features = np.argsort(values)[::-1][:num_of_features]
    count = 0
    for feature_num in features[:num_of_features]:
        print(f' Word: {feature_names_inline[feature_num]},'
              f' Weight: {values[feature_num]}.'
              f' Count: {count + 1}')
        count += 1

    features_2 = []
    n = X.shape[1]
    while len(features_2) < num_of_features:
        print(f'Count {len(features_2) + 1}/{num_of_features} start')
        best_feature = None
        best_score = -1
        for feature in range(n):
            features_2.append(feature)
            model.fit(X_train[:, features_2], y_train)
            score = model.score(X_test[:, features_2], y_test)
            if score >= best_score:
                best_feature = feature
                best_score = score
            features_2.remove(feature)
        features_2.append(best_feature)
        print(f'Count {len(features_2)}/{num_of_features} finish Feature: {feature_names_inline[best_feature]}')

    for feature_num in features_2[:num_of_features]:
        print(f' Word: {feature_names_inline[feature_num]},'
              f' Weight: {values[feature_num]}.'
              f' Count: {count + 1}')
        count += 1

    X_train_filter = X_train
    corr = X_train_filter.corr(method='pearson')
    corr_class = corr['class']
    coef_filt = corr_class.sort_values()
    features_3 = np.array(coef_filt.index[-30:-1])[::-1]

    for feature_num in features_3[:num_of_features]:
        print(f' Word: {feature_names_inline[feature_num]},'
                f' Weight: {values[feature_num]}.'
                f' Count: {count + 1}')
        count += 1
    return (values, features, features_2, features_3)

In [None]:
values_Dec_My, features_v, features_w, features_f = gen_model(DecisionTreeClassifier())

Встроенная
 Word: call, Weight: 0.2016224717451438. Count: 1
 Word: txt, Weight: 0.1727413859775589. Count: 2
 Word: www, Weight: 0.06789969064320896. Count: 3
 Word: me, Weight: 0.052485333879488796. Count: 4
 Word: ll, Weight: 0.0418055233322104. Count: 5
 Word: free, Weight: 0.040225369819395874. Count: 6
 Word: claim, Weight: 0.027602133812263455. Count: 7
 Word: 150p, Weight: 0.02737972476118965. Count: 8
 Word: uk, Weight: 0.019040462693204644. Count: 9
 Word: win, Weight: 0.013528227197013127. Count: 10
 Word: your, Weight: 0.012948930587105112. Count: 11
 Word: tones, Weight: 0.011824447159562502. Count: 12
 Word: my, Weight: 0.009643374592073959. Count: 13
 Word: stop, Weight: 0.009067872820020088. Count: 14
 Word: ringtone, Weight: 0.008957301899475262. Count: 15
 Word: unsubscribe, Weight: 0.008895156863552958. Count: 16
 Word: service, Weight: 0.00870107778786419. Count: 17
 Word: lt, Weight: 0.008388549237161282. Count: 18
 Word: get, Weight: 0.007773101083425215. Count: 1


KeyboardInterrupt



In [25]:
model = SelectKBest(score_func=chi2, k=num_of_features)

In [26]:
X_train_chi2 = X_train
select_chi2 = SelectKBest(chi2, k = 30)
select_chi2.fit_transform(X_train_chi2, y_train)
features_b_chi = select_chi2.get_support(indices=True)


TypeError: float() argument must be a string or a real number, not 'rv_continuous_frozen'

In [None]:
model = RFE(estimator=Ridge(alpha=1.0), n_features_to_select=num_of_features)
model.fit_transform(X_train, y_train)
features_b_rfe = model.get_support(indices=True)

In [None]:
model = SelectKBest(score_func=mutual_info_classif, k=num_of_features)
model.fit_transform(X_train, y_train)
features_b_mut = model.get_support(indices=True)

In [None]:
classifiers = [
    LogisticRegression(),
    SVC(),
    RandomForestClassifier()
]

def get_score_for_classifier(classifier):
    class_before = classifier
    class_before.fit(X_train, y_train)
    predict_before = class_before.predict(X_test)
    score_before = accuracy_score(y_test, predict_before)
    return score_before

classifiers_values = [
    get_score_for_classifier(i) for i in classifiers
]
features_values = []
features_values.clear()
features_values.append(features_v)
features_values.append(features_w)
features_values.append(features_f)
features_values.append(features_b_chi)
features_values.append(features_b_rfe)
features_values.append(features_b_mut)
def build_diff(feature_num):
    print(f'Difference for {features_methods[feature_num]}')
    for i, classifier in enumerate(classifiers):
        classifier_name = classifier.__class__.__name__

        score_before = classifiers_values[i]

        class_after = classifier
        class_after.fit(X_train[:, features_values[feature_num]], y_train)
        predict_after = class_after.predict(X_test[:, features_values[feature_num]])
        score_after = accuracy_score(y_test, predict_after)

        print(f'{classifier_name} before: {score_before}')
        print(f'{classifier_name} after: {score_after}')
        print(f'{classifier_name} difference: {score_after - score_before} \n')

In [None]:
features_methods = [
    'Встроенный',
    'Оберточный',
    'Фильтрующий',
    'RFE',
    'Фильтр с chi2',
    'Фильтр с mutual info',
]
for i in range(len(features_methods)):
    build_diff(i)