In [3]:
import numpy as np
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from string import punctuation
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
df = pd.read_csv('train_spam.csv')
df_test = pd.read_csv('test_spam.csv')

df.head()

Unnamed: 0,text_type,text
0,ham,make sure alex knows his birthday is over in f...
1,ham,a resume for john lavorato thanks vince i will...
2,spam,plzz visit my website moviesgodml to get all m...
3,spam,urgent your mobile number has been awarded wit...
4,ham,overview of hr associates analyst project per ...


In [5]:
df.shape

(16278, 2)

–ó–∞–∫–æ–¥–∏—Ä—É–µ–º —Ü–µ–ª–µ–≤—É—é –ø–µ—Ä–µ–º–µ–Ω–Ω—É—é

In [6]:
df['text_type'].unique()

array(['ham', 'spam'], dtype=object)

In [7]:
df.loc[df['text_type'] == 'ham', 'text_type'] = 1
df.loc[df['text_type'] == 'spam', 'text_type'] = 0

In [8]:
df.head()

Unnamed: 0,text_type,text
0,1,make sure alex knows his birthday is over in f...
1,1,a resume for john lavorato thanks vince i will...
2,0,plzz visit my website moviesgodml to get all m...
3,0,urgent your mobile number has been awarded wit...
4,1,overview of hr associates analyst project per ...


In [9]:
print(f"text positive: {df['text_type'].sum()}")
print(f"text negative: {df.shape[0] - df['text_type'].sum()}")

text positive: 11469
text negative: 4809


–í –¥–∞—Ç–∞—Å–µ—Ç–µ –µ—Å—Ç—å –¥–∏—Å–±–∞–ª–∞–Ω—Å –∫–ª–∞—Å—Å–æ–≤. –î–ª—è —Ç–æ–≥–æ, —á—Ç–æ–±—ã –∏–∑–±–µ–∂–∞—Ç—å –∏—Å–∫–∞–∂–µ–Ω–∏—è —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ–≤, –±—É–¥–µ–º –∏—Å–ø–æ–ª—å–∑–æ–≤–∞—Ç—å stratify –ø—Ä–∏ train_test_split –∏ –º–µ—Ç—Ä–∏–∫—É auc roc

In [10]:
df['text'][2]

'plzz visit my website moviesgodml to get all movies for free and also i provide direct download links no redirect and adsüòäüòäüòäüòäüòÅ'

In [11]:
print(f"text nan: {df['text'].isna().any()}")
print(f"text_type nan: {df['text_type'].isna().any()}")

text nan: False
text_type nan: False


–í —Ç–µ–∫—Å—Ç–µ –Ω–µ—Ç –ø—Ä–æ–ø—É—Å–∫–æ–≤

–ü–æ—Å–º–æ—Ç—Ä–∏–º –Ω–∞ —Å–∞–º—ã–µ —á–∞—Å—Ç—ã–µ —Å–ª–æ–≤–∞

In [12]:
from collections import Counter

text = ''.join(df['text']).lower()
d = Counter(text.split())

sorted_tokens = sorted(d.items(), key=lambda x: x[1], reverse=True)
print(len(d))
sorted_tokens[:10]

69567


[('the', 26800),
 ('to', 24790),
 ('and', 14857),
 ('i', 14522),
 ('a', 14000),
 ('you', 12871),
 ('1635465', 12529),
 ('of', 12174),
 ('in', 10111),
 ('for', 10003)]

–£–±–µ—Ä–µ–º –º—É—Å–æ—Ä –∏ –ø–æ—Å–º–æ—Ç—Ä–∏–º –µ—â–µ —Ä–∞–∑

In [13]:
import nltk
from nltk.corpus import stopwords

# -- YOUR CODE HERE --
nltk.download("stopwords", quiet=True)

filtered_tokens = [word for word in sorted_tokens if word[0] not in stopwords.words("english")]
print(len(filtered_tokens))
filtered_tokens[:10]

69416


[('1635465', 12529),
 ('ect', 5333),
 ('enron', 4753),
 ('vince', 4383),
 ('url', 2827),
 ('hou', 2658),
 ('kaminski', 2336),
 ('please', 2280),
 ('2000', 2233),
 ('com', 2213)]

–í —Ç–µ–∫—Å—Ç–µ –µ—Å—Ç—å —Ü–∏—Ñ—Ä—ã, –≤–æ–∑–º–æ–∂–Ω–æ, —á—Ç–æ —ç—Ç–æ –±—É–¥–µ—Ç –æ–¥–Ω–∏–º –∏–∑ –≤–∞–∂–Ω–µ–π—à–∏—Ö –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ —Å–ø–∞–º–∞ (–Ω–æ–º–µ—Ä –∫–∞—Ä—Ç—ã –¥–ª—è —Å–∫–∞–º-–ø–µ—Ä–µ–≤–æ–¥–∞ –∏ —Ç–ø)

–†–∞–∑–¥–µ–ª–∏–º —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ–≤—á–Ω—É—é –≤—ã–±–æ—Ä–∫—É

In [14]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['text_type'], test_size=0.3, stratify=df['text_type'])

In [15]:
y_train, y_test = y_train.astype('int'), y_test.astype('int')

In [16]:
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import WhitespaceTokenizer

def custom_stem_tokenizer(text):
  stemmer = SnowballStemmer("english")
  text = text.lower()
  stop = set(punctuation).union(set(stopwords.words("english")))
  tokens = [
      stemmer.stem(word)
      for word in WhitespaceTokenizer().tokenize(text)
      #if word not in stop and
      #if (len(word) != 1 or ord(word) < 128)
  ]

  return tokens

In [17]:
vectorizer = CountVectorizer(tokenizer=custom_stem_tokenizer, min_df = 0.0005)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)



In [18]:
X_train_vec.shape

(11394, 5762)

–í–æ—Å–ø–æ–ª—å–∑—É–µ–º—Å—è –ø—Ä–æ—Å—Ç–æ–π –ª–æ–≥–∏—Å—Ç–∏—á–µ—Å–∫–æ–π —Ä–µ–≥—Ä–µ—Å—Å–∏–µ–π

In [19]:
lr = LogisticRegression(max_iter=200)

lr.fit(X_train_vec, y_train)

y_pred = lr.predict(X_test_vec)

print(f"train accuracy: {accuracy_score(lr.predict(X_train_vec), y_train)}")
print(f"test accuracy: {accuracy_score(y_pred, y_test)}")

train accuracy: 0.9862208179743724
test accuracy: 0.95004095004095


In [20]:
from sklearn.metrics import roc_auc_score

print(f"train accuracy: {roc_auc_score(lr.predict(X_train_vec), y_train)}")
print(f"test accuracy: {roc_auc_score(y_pred, y_test)}")

train accuracy: 0.9879598840391645
test accuracy: 0.9489074164299789


–° –ø–µ—Ä–≤–æ–≥–æ —Ä–∞–∑–∞ –∏ —Ç–∞–∫–æ–π —Ä–µ–∑—É–ª—å—Ç–∞—Ç. –í–ø–æ–ª–Ω–µ –Ω–µ–ø–ª–æ—Ö–æ, 94.5% –Ω–∞ —Ç–µ—Å—Ç–µ —ç—Ç–æ —Å–∏–ª—å–Ω—ã–π —Ä–µ–∑—É–ª—å—Ç–∞—Ç –¥–ª—è –ª–æ–≥–∏—Å—Ç–∏—á–µ—Å–∫–æ–π —Ä–µ–≥—Ä–µ—Å—Å–∏–∏. –¢–µ–ø–µ—Ä—å —É –Ω–∞—Å –µ—Å—Ç—å –ø–µ—Ä–≤—ã–π –ø—Ä–µ—Ç–µ–Ω–¥–µ–Ω—Ç –≤ –ø–æ–±–µ–¥–∏—Ç–µ–ª–∏

–ü–æ–ø—Ä–æ–±—É–µ–º –¥—Ä—É–≥–∏–µ –º–æ–¥–µ–ª–∏

In [34]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier


# KNN
knn = KNeighborsClassifier(n_neighbors=15)
knn.fit(X_train_vec, y_train)

y_pred = knn.predict(X_test_vec)

print("knn:")
print(f"train accuracy: {roc_auc_score(knn.predict(X_train_vec), y_train)}")
print(f"test accuracy: {roc_auc_score(y_pred, y_test)}")
print("=============")

# SVM
svm = SVC(kernel='linear')
svm.fit(X_train_vec, y_train)

y_pred = svm.predict(X_test_vec)

print("svm:")
print(f"train accuracy: {roc_auc_score(svm.predict(X_train_vec), y_train)}")
print(f"test accuracy: {roc_auc_score(y_pred, y_test)}")
print("=============")

# Random Forest
rf = RandomForestClassifier(n_estimators=100, max_depth=15)
rf.fit(X_train_vec, y_train)

y_pred = rf.predict(X_test_vec)

print("random forest:")
print(f"train accuracy: {roc_auc_score(rf.predict(X_train_vec), y_train)}")
print(f"test accuracy: {roc_auc_score(y_pred, y_test)}")
print("=============")

knn:
train accuracy: 0.812492062844793
test accuracy: 0.7800861278125453
svm:
train accuracy: 0.9948135245962796
test accuracy: 0.9314072559939961
random forest:
train accuracy: 0.9027230504954896
test accuracy: 0.8912569463269179


In [39]:
from keras.models import Sequential
from keras.layers import Dense

X_train_nn = X_train_vec.toarray()
X_test_nn = X_test_vec.toarray()

model = Sequential()
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train_nn, y_train, epochs=10, batch_size=32)

y_pred_test = model.predict(X_test_nn)
y_pred_train = model.predict(X_train_nn)

y_pred_test_binary = (y_pred_test > 0.5).astype(int)
y_pred_train_binary = (y_pred_train > 0.5).astype(int)

print("neural network:")
print(f"train accuracy: {roc_auc_score(y_pred_train_binary, y_train)}")
print(f"test accuracy: {roc_auc_score(y_pred_test_binary, y_test)}")
print("=============")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
neural network:
train accuracy: 0.9992911217567403
test accuracy: 0.9409653407984871


–•–æ—Ä–æ—à–æ —Å–µ–±—è –ø–æ–∫–∞–∑–∞–ª–∏ SVM –∏ –Ω–µ–π—Ä–æ–Ω–∫–∞, —É –æ—Å—Ç–∞–ª—å–Ω—ã—Ö –ø—Ä–∏ –ø–æ–¥–±–æ—Ä–µ –≥–∏–ø–µ—Ä–ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤ –Ω–∞ —Ç–µ—Å—Ç–µ —Ä–µ–∑—É–ª—å—Ç–∞—Ç —Å–∏–ª—å–Ω–æ –Ω–µ —É–≤–µ–ª–∏—á–∏–≤–∞–µ—Ç—Å—è. –ú–æ–∂–Ω–æ –±—ã–ª–æ –±—ã –µ—â–µ –¥–æ—Ç—è–Ω—É—Ç—å –ª–µ—Å, –Ω–æ –º–æ–∂–µ—Ç –±—ã—Ç—å –≤ –¥—Ä—É–≥–æ–π —Ä–∞–∑ :)

–õ—É—á—à–∞—è —Ç–æ—á–Ω–æ—Å—Ç—å –Ω–∞ —Ç–µ—Å—Ç–µ –ø–æ–ª—É—á–∏–ª–∞—Å—å 94.5% —É –ª–æ–≥–∏—Å—Ç–∏—á–µ—Å–∫–æ–π —Ä–µ–≥—Ä–µ—Å—Å–∏–∏. –ï—ë –∏ –±—É–¥–µ–º –æ–±—É—á–∞—Ç—å –¥–ª—è —Ç–µ—Å—Ç–æ–≤–æ–≥–æ —Ñ–∞–π–ª–∞, –ø–æ—Ç–æ–º –∑–∞–ø–∏—à–µ–º –≤ csv.

In [40]:
test = vectorizer.transform(df_test['text'])

In [41]:
y_test_values = lr.predict(test)

In [42]:
print(y_test_values.sum())
print(y_test_values.shape[0] - y_test_values.sum())

2971
1099


In [24]:
answer = pd.DataFrame(
    {
    'score': y_test_values,
    'text': df_test['text']
    }
)

In [25]:
answer.to_csv('answer', index=False)