# Filtering based on simple rules

In [18]:
import pandas as pd
from ast import literal_eval
# Load the preprocessed posts file.
folder = '../data/'
file_name = 'text_preprocessed.csv'
df = pd.read_csv(
    folder + file_name,
    sep="|",
    encoding='utf-8',
    index_col='Unnamed: 0',
)
# The n-gram columns come back from the CSV as text; literal_eval parses each
# cell back into a Python object (lists, per the preview below).
for ngram_column in ('unigrams', 'bigrams'):
    df[ngram_column] = df[ngram_column].apply(literal_eval)
print ('text posts with at least one bigram ', df.shape[0])
df.head()

text posts with at least one bigram  129437


Unnamed: 0,date,text,link_title,link_description,unigrams,bigrams
0,2014-11-20,Бузык барин булдирген,,,"[бузык, барин, булдирген]","[бузык_барин, барин_булдирген]"
1,2014-10-28,Алтын куз,,,"[алтын, куз]",[алтын_куз]
2,2014-07-14,Биз бакиттымыз,,,"[биз, бакиттымыза]",[биз_бакиттымыза]
3,2016-09-05,Молодцы!!!Красиво!!!,,,"[молодец, красиво]",[молодец_красиво]
4,2019-07-03,Самый Красавец Кот,,,"[самый, красавец, кот]","[самый_красавец, красавец_кот]"


In [19]:
# Add the spam flag column with every post initially unflagged (False);
# the filtering cells that follow switch matching rows to True.
df['Spam-index']=False

In [20]:
# Load the dictionary of bigrams used to filter posts (one entry per line).
filter_folder='../dictionaries&instructions/'
filter_words_file='filter-words.txt'
# Decode explicitly as UTF-8 (the encoding the data CSV is read with) so the
# entries do not depend on the platform's default locale encoding.
with open(filter_folder+filter_words_file, encoding='utf-8') as f:
    filter_list = f.read().splitlines()

In [21]:
def FilterByBigrams(bigrams_lists, boolean_list, filter_list):
    """Flag posts that contain at least one bigram from ``filter_list``.

    Parameters
    ----------
    bigrams_lists : sequence of collections of str
        One collection of bigrams per post.
    boolean_list : list
        Current spam flags, aligned with ``bigrams_lists``. Entries that are
        already truthy are left untouched.
    filter_list : iterable of str
        Bigrams whose presence marks a post as spam.

    Returns
    -------
    list
        The same ``boolean_list`` object, updated in place: position ``i`` is
        set to True when post ``i`` was unflagged and shares a bigram with
        ``filter_list``.
    """
    # Build the lookup set once instead of re-scanning the filter list for
    # every post.
    filter_set = set(filter_list)
    for i, (bigrams, boolean) in enumerate(zip(bigrams_lists, boolean_list)):
        if boolean:
            continue
        if not filter_set.isdisjoint(bigrams):
            boolean_list[i] = True
    return boolean_list

In [23]:
# Run the unconditional bigram filter over every post and write the flags back.
bigrams_lists = df['bigrams'].tolist()
boolean_list_ = df['Spam-index'].tolist()
Boolean_list = FilterByBigrams(
    bigrams_lists,
    boolean_list_,
    filter_list,
)
df['Spam-index'] = Boolean_list
print ('Filtered spam-bigrams: ', Boolean_list.count(True))


Filtered spam-bigrams:  23306


In [24]:
# Load the dictionary of bigrams that mark a post as spam only when the post
# also contains a link (one entry per line).
filter_words_file='filter-words_if_linked.txt'
# Decode explicitly as UTF-8 (the encoding the data CSV is read with) so the
# entries do not depend on the platform's default locale encoding.
with open(filter_folder+filter_words_file, encoding='utf-8') as f:
    filter_list2 = f.read().splitlines()
    

In [25]:
def _has_link(link):
    """Return True when ``link`` is a real value: not None, not NaN, not ""."""
    if link is None:
        return False
    # NaN is the only float that is not equal to itself.
    if isinstance(link, float) and link != link:
        return False
    return link != ""


def FilterByBigrams_links(bigrams_lists, boolean_list, filter_list, links_list):
    """Flag posts that contain a filtered bigram AND carry a link.

    Parameters
    ----------
    bigrams_lists : sequence of collections of str
        One collection of bigrams per post.
    boolean_list : list
        Current spam flags, aligned with ``bigrams_lists``. Entries that are
        already truthy are left untouched.
    filter_list : iterable of str
        Bigrams that mark a post as spam when the post also has a link.
    links_list : sequence
        Per-post link value; ``""``, ``None`` and NaN all mean "no link".

    Returns
    -------
    list
        The same ``boolean_list`` object, updated in place.
    """
    for i, (bigrams, boolean, links) in enumerate(zip(bigrams_lists, boolean_list, links_list)):
        if boolean:
            continue
        # Logical `and` (not bitwise `&`): the cheap link check runs first and
        # short-circuits the bigram scan for posts without a link.
        if _has_link(links) and any(x in bigrams for x in filter_list):
            boolean_list[i] = True
    return boolean_list

In [27]:
import numpy as np

# Missing link titles become empty strings so that "no link" is always "".
df["link_title"] = df["link_title"].replace(np.nan, '')
boolean_list_ = df['Spam-index'].tolist()
links_list = df['link_title'].tolist()
Boolean_list = FilterByBigrams_links(
    bigrams_lists,
    boolean_list_,
    filter_list2,
    links_list,
)
df['Spam-index'] = Boolean_list
print ("Filtered together with spam-posts that contained links", Boolean_list.count(True))

Filtered together with spam-posts that contained links 24606


In [28]:
def FilterVKLinks(text_list, boolean_list):
    """Flag posts whose text mentions a VK domain.

    A post is flagged when its text contains the substring ``vkontakte.ru``
    or ``vk.com`` (case-sensitive).

    Parameters
    ----------
    text_list : sequence
        Post texts. Non-string entries (e.g. NaN for a missing text) are
        skipped instead of raising ``TypeError``.
    boolean_list : list
        Current spam flags, aligned with ``text_list``. Entries that are
        already truthy are left untouched.

    Returns
    -------
    list
        The same ``boolean_list`` object, updated in place.
    """
    str1='vkontakte.ru'
    str2='vk.com'
    for i, (text, boolean) in enumerate(zip(text_list, boolean_list)):
        if boolean or not isinstance(text, str):
            continue
        # Logical `or` (not bitwise `|`) so the second check is skipped once
        # the first one matches.
        if str1 in text or str2 in text:
            boolean_list[i] = True
    return boolean_list

In [30]:
# Flag posts whose text links to VK. The texts are read from `df`, the frame
# every other filter in this notebook works on; the previously used
# `df_with_bigrams` is not defined anywhere in the notebook and raised
# NameError on a fresh kernel.
text_list=df.text.values.tolist()
boolean_list_=df['Spam-index'].values.tolist()
Boolean_list=FilterVKLinks(text_list, boolean_list_)
df['Spam-index']=Boolean_list
print ('Filtered together, with spam-posts with links to vk, vkontakte ', Boolean_list.count(True))


Filtered together, with spam-posts with links to vk, vkontakte  35562


In [33]:
# Look for repeated texts among the posts not yet flagged as spam.
# A text that occurs 5 or more times is treated as a spam candidate.
unflagged_texts = df.loc[df['Spam-index'] == False, 'text']
df_ = (
    unflagged_texts
    .value_counts()
    .rename_axis('spam_text')
    .to_frame('counts')
)
df_spam = df_.loc[df_.counts >= 5].reset_index(level=0)
print ("number of posts that duplicates more (or equal to) than 5:", df_spam.shape[0])
# WARNING: the export below was run once and the resulting file was then edited
# by hand. Do not re-run it, or the manual edits will be overwritten.
# df_spam.to_csv(filter_folder+'spam-posts.csv', sep='|', encoding='utf-8')


number of posts that duplicates more (or equal to) than 5: 306


In [36]:
# The exported candidate file is reviewed by hand and non-spam posts are
# removed from it; here the manually cleaned version is loaded back.
file_spam = 'spam-posts.csv'
df_spam = pd.read_csv(
    filter_folder + file_spam,
    sep="|",
    encoding='utf-8',
    index_col=False,
)
# Number of spam texts left after the manual review.
df_spam.shape[0]

296

In [38]:
def FilterSpam(text_list, spam_list, boolean_list):
    """Flag posts whose full text exactly matches a known spam text.

    Parameters
    ----------
    text_list : sequence
        The entire column of post texts.
    spam_list : iterable
        The entire column of known spam texts (hashable values).
    boolean_list : list
        Current spam flags, aligned with ``text_list``. Entries that are
        already truthy are left untouched.

    Returns
    -------
    list
        The same ``boolean_list`` object, updated in place: True for spam,
        unchanged otherwise.
    """
    # A set makes each membership test constant-time instead of a scan over
    # the whole spam list for every post.
    spam_set = set(spam_list)
    for i, (text, boolean) in enumerate(zip(text_list, boolean_list)):
        if boolean:
            continue
        if text in spam_set:
            boolean_list[i] = True
    return boolean_list

# Flag every remaining post whose text matches a manually confirmed spam text.
text_list = df['text'].tolist()
spam_list = df_spam['spam_text'].tolist()
boolean_list = FilterSpam(text_list, spam_list, df['Spam-index'].tolist())
df['Spam-index'] = boolean_list

total_spam_posts = int((df['Spam-index'] == 1).sum())
print ("! Total number of spam-posts:", total_spam_posts)

! Total number of spam-posts: 38914


In [39]:
# Store the flag as integers (True -> 1, False -> 0) and write the labeled data.
df["Spam-index"] = df["Spam-index"].astype(int)

folder = '../data/'
labeled_path = folder + 'text_preprocessed_Spamlabeled.csv'
df.to_csv(labeled_path, sep='|', encoding='utf-8')