In [193]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [194]:
data = pd.read_csv('train_spam.csv')
x_test = pd.read_csv('test_spam.csv')

In [195]:
data.head()

Unnamed: 0,text_type,text
0,ham,make sure alex knows his birthday is over in f...
1,ham,a resume for john lavorato thanks vince i will...
2,spam,plzz visit my website moviesgodml to get all m...
3,spam,urgent your mobile number has been awarded wit...
4,ham,overview of hr associates analyst project per ...


In [196]:
x, y = data['text'], data['text_type']

In [197]:
y[y == 'ham'] = 1
y[y == 'spam'] = 0

In [198]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.3, random_state=42)
y_train = y_train.astype('int')
y_val = y_val.astype('int')

In [199]:
data.isna().any()

text_type    False
text         False
dtype: bool

Пропусков нет

In [200]:
x_test.isna().sum()

text    0
dtype: int64

Распределение таргета. Видим, что ham намного больше, чем spam

In [201]:
y.value_counts()

text_type
1    11469
0     4809
Name: count, dtype: int64

In [202]:
# -- YOUR CODE HERE -- источник: семинар 11
cnt_vec = CountVectorizer()
X = cnt_vec.fit_transform(x_train)

In [203]:
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [204]:
X.toarray().shape

(11394, 43683)

In [205]:
x.shape

(16278,)

In [206]:
def contains_digit(s: str) -> bool:
    # Проверка, содержит ли слово цифры
    return bool(re.search(r'[0-9]', s))

def contains_punctuation(s: str) -> bool:
    # Проверка, содержит ли слово пунктуацию
    return any(x in punctuation for x in s)

def is_hashtag(s: str) -> bool:
    # Проверка, является ли слово хэштегом
    return s.startswith('#')

def is_mention(s: str) -> bool:
    # Проверка, является ли слово упоминанием
    return s.startswith('@')

def investigate_vocabulary(vocabulary):
    # -- YOUR CODE HERE --
    print('With digit:      ', len({k:v for k, v in vocabulary.items() if contains_digit(k)})
          )
    print('With punctuation:', len({k:v for k, v in vocabulary.items() if contains_punctuation(k)})
          )
    print('Hashtags:        ', len({k:v for k, v in vocabulary.items() if is_hashtag(k)})
          )
    print('Mentions:        ', len({k:v for k, v in vocabulary.items() if is_mention(k)})
          )




In [207]:
import re
from string import punctuation

In [208]:
investigate_vocabulary(cnt_vec.vocabulary_)

With digit:       4723
With punctuation: 0
Hashtags:         0
Mentions:         0


In [209]:
from nltk.tokenize import TweetTokenizer

In [210]:
cnt_tweet = CountVectorizer(tokenizer=TweetTokenizer().tokenize)
X_tweet = cnt_tweet.fit_transform(x_train)
X_tweet.shape



(11394, 44557)

In [211]:
investigate_vocabulary(cnt_tweet.vocabulary_
                       )

With digit:       4150
With punctuation: 406
Hashtags:         0
Mentions:         338


In [212]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [343]:
def contains_only_latin_letters(s: str) -> bool:
    # Проверка, содержит ли слово только латинские буквы
    return not bool(re.search(r'[^a-zA-Z]', s))

def is_emoji(s: str) -> bool:
    # Проверка, является ли слово смайликом
    return (all(x in punctuation for x in s) and any([j in ['(', ')', '[', ']', ] for j in s]))

def is_hashtag(s: str) -> bool:
    # Проверка, является ли слово хэштегом
    return ((s.startswith('#') and contains_only_latin_letters(s[1:])) or (str(s.encode('UTF8'))[2:].startswith("\\")))

def custom_tokenizer(s: str):
    # Кастомный токенайзер
    s = s.lower() #0
    tokenised_s = TweetTokenizer().tokenize(s) #1
    cleaned_s = list(filter(lambda i: (contains_only_latin_letters(i) or is_emoji(i) or is_hashtag(i)), tokenised_s)) #2
    cleaned_s = [x for x in cleaned_s if x not in stopwords.words('english')] #3
    stemmer = SnowballStemmer("english") # из материалов семинара 11
    text_stemmed = [stemmer.stem(w) for w in cleaned_s] #4
    return text_stemmed

In [344]:
custom_tokenizer('She LOVES painting :-) #art \dc')

['love', 'paint', ':-)', '#art', '\\', 'dc']

In [345]:

print(str('\d'.encode('UTF8')))
print(str('🔥'.encode('UTF8'))[2:].startswith('\\'))

b'\\d'
True


In [346]:
for i in x_train[:10]:
    print(custom_tokenizer(i))
    print(i, '\n')

['advertis', 'uk', 'chief', 'come', 'seem', 'peopl', 'use', 'phone', 'text', 'messag', 'e', 'teenag', 'hi', 'beach', 'met', 'awesom', 'guy', 'pic', 'tom']
these are being advertised all over the uk chief come on seems to be to the same people who use phones for text messaging i e teenagers hi we re at the beach and i met this awesome guy here s his pic tom 

['tamil', 'music', 'channel']
tamil music channel 

['dont', 'cancer', 'mom', 'make', 'big', 'deal', 'regular', 'checkup', 'aka', 'pap', 'smear']
no i dont have cancer moms making a big deal out of a regular checkup aka pap smear 

['job', 'descript', 'good', 'afternoon', 'job', 'descript', 'comput', 'financ', 'student', 'prefer', 'interview', 'date', 'short', 'pleas', 'let', 'us', 'know', 'question', 'happi', 'thanksgiv', 'kevin', 'kindal']
job description good afternoon here is the job description for the computational finance students we should have our preferred interview dates shortly please let us know if you have any questio

In [347]:
vectorizer = CountVectorizer(tokenizer=custom_tokenizer)

bow = vectorizer.fit_transform(x_train)
bow_val = vectorizer.transform(x_val)



# log reg


In [348]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression().fit(bow, y_train)

In [349]:
from sklearn.metrics import roc_auc_score
prediction = clf.predict(bow_val)
print('ROC-AUC-score:', roc_auc_score(y_val, prediction))

ROC-AUC-score: 0.9222418394454168


# SVM

In [350]:

from sklearn.svm import SVC
svm_rbf = SVC(kernel="rbf", probability=True)
svm_rbf.fit(bow, y_train)
svm_rbf_y_pred = svm_rbf.predict(bow_val)
print('ROC-AUC-score for svm-rbf:', roc_auc_score(y_val, svm_rbf_y_pred))


ROC-AUC-score for svm-rbf: 0.8981589588369057


In [351]:
svm_s = SVC(kernel="sigmoid", probability=True)
svm_s.fit(bow, y_train)
svm_s_y_pred = svm_s.predict(bow_val)
print('ROC-AUC-score for svm-sigmoid:', roc_auc_score(y_val, svm_s_y_pred))

ROC-AUC-score for svm-sigmoid: 0.8856691751935097


# KNN

In [352]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(bow, y_train)
knn_y_pred = knn.predict(bow_val)
print('ROC-AUC-score for knn:', roc_auc_score(y_val, knn_y_pred))

ROC-AUC-score for knn: 0.7055228166149925


# RF

In [353]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(bow, y_train)
rf_y_pred = rf.predict(bow_val)
print('ROC-AUC-score for rf:', roc_auc_score(y_val, rf_y_pred))

ROC-AUC-score for rf: 0.8878523811910609


In [354]:
bow_test = vectorizer.transform(x_test.squeeze())

In [355]:

preds_test = clf.predict(bow_test)

In [356]:
import numpy as np
np.unique(preds_test, return_counts=True)

(array([0, 1]), array([1107, 2963]))

In [357]:
res = pd.DataFrame({'text': x_test.squeeze(), 'score': preds_test})

In [358]:
res.score[res.score == 1] = 'ham'
res.score[res.score == 0] = 'spam'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.score[res.score == 1] = 'ham'


In [359]:
res.head()

Unnamed: 0,text,score
0,j jim whitehead ejw cse ucsc edu writes j you ...,ham
1,original message from bitbitch magnesium net p...,ham
2,java for managers vince durasoft who just taug...,ham
3,there is a youtuber name saiman says,ham
4,underpriced issue with high return on equity t...,spam


In [360]:
res.to_csv('result.csv')

# updated: metric is much higher with detected emodji in vocab!