# Twitter Spam Classifier

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

from sklearn import cross_validation, grid_search, metrics, neighbors, tree, naive_bayes, ensemble, svm, calibration, feature_selection, pipeline ,preprocessing

# Let pandas display up to 60 columns when a DataFrame is shown.
pd.set_option("display.max_columns",60)

## Dataset - Social Honeypot (ICWSM 2011)

In [None]:
# Spammers ("content polluters"): one table of profiles and one table of
# tweets, both read from header-less text files with explicit column names.
content_polluters = pd.read_table(
    './content_polluters.txt',
    header=None,
    names=[
        "UserID", "ProfileCreatedAt", "ProfileCollectedAt",
        "NumberOfFollowings", "NumberOfFollowers", "NumberOfTweets",
        "LengthOfScreenName", "LengthOfDesc",
    ],
)
content_polluters_tweets = pd.read_table(
    './content_polluters_tweets.txt',
    header=None,
    names=["UserID", "TweetID", "Tweet", "TweetCreatedAt"],
)

In [None]:
# User 1:1 Tweet (the most recent one by publication date).
# Profiles become indexed by UserID; tweets are sorted newest-first and the
# first entry of every UserID group is kept.
# NOTE(review): TweetCreatedAt is not parsed as a datetime when loaded, so the
# ordering relies on how the raw values sort - confirm.
content_polluters = content_polluters.set_index("UserID")
content_polluters_tweets_distinct = content_polluters_tweets.sort_values('TweetCreatedAt', ascending = False).groupby(['UserID']).first()

In [None]:
# Legitimate users: one table of profiles and one table of tweets, both read
# from header-less text files with explicit column names.
legitimate_users = pd.read_table(
    './legitimate_users.txt',
    header=None,
    names=[
        "UserID", "ProfileCreatedAt", "ProfileCollectedAt",
        "NumberOfFollowings", "NumberOfFollowers", "NumberOfTweets",
        "LengthOfScreenName", "LengthOfDesc",
    ],
)
legitimate_users_tweets = pd.read_table(
    './legitimate_users_tweets.txt',
    header=None,
    names=["UserID", "TweetID", "Tweet", "TweetCreatedAt"],
)

In [None]:
# User 1:1 Tweet (the most recent one by publication date).
# Profiles become indexed by UserID; tweets are sorted newest-first and the
# first entry of every UserID group is kept.
# NOTE(review): TweetCreatedAt is not parsed as a datetime when loaded, so the
# ordering relies on how the raw values sort - confirm.
legitimate_users = legitimate_users.set_index("UserID")
legitimate_users_tweets_distinct = legitimate_users_tweets.sort_values('TweetCreatedAt', ascending = False).groupby(['UserID']).first()

In [None]:
# List of spamwords: every line of blacklist.txt becomes one list entry
# (line terminators are stripped by splitlines()).
with open("blacklist.txt", "r", encoding='utf-8') as f:
    spamwords=f.read().splitlines()

In [None]:
# Attach each spammer profile to its selected tweet; the column-wise inner
# join keeps only index values present in both frames. Label the rows as 1.
spammers = pd.concat([content_polluters, content_polluters_tweets_distinct], axis=1, join='inner')
spammers['IsSpammer'] = 1

# Same for legitimate users, labelled as 0.
nonspammers =  pd.concat([legitimate_users, legitimate_users_tweets_distinct], axis=1, join='inner')
nonspammers['IsSpammer'] = 0

# Stack both classes into one data set (spammers first).
data = pd.concat([spammers, nonspammers])

## Additional features extraction

In [None]:
#User features
# Parse both profile timestamps so they can be subtracted.
data["ProfileCreatedAt"] = pd.to_datetime(data["ProfileCreatedAt"])
data["ProfileCollectedAt"] = pd.to_datetime(data["ProfileCollectedAt"])
# Time between profile creation and profile collection, cast to hour units.
data["AccountAge"] = (data["ProfileCollectedAt"] - data["ProfileCreatedAt"]).astype('timedelta64[h]')

# Followers divided by followings.
# NOTE(review): fill_value=True is presumably treated as 1 for missing operands - confirm.
data["Flwrs/Flwngs"] = data["NumberOfFollowers"].divide(data["NumberOfFollowings"], fill_value=True)
# Infinite ratios (division by zero) are meant to fall back to the numerator.
# NOTE(review): passing a Series as the replacement value, in place on a column
# selection, behaves differently across pandas versions - confirm it takes effect.
data["Flwrs/Flwngs"].replace(np.inf, data["NumberOfFollowers"], inplace = True)
# Replace every NaN in the whole frame (not only the new column) with 0.
data.replace(np.nan, 0, inplace = True)

# Followers divided by (followers + followings).
data["Reputation"] = data["NumberOfFollowers"].divide((data["NumberOfFollowers"] + data["NumberOfFollowings"]), fill_value=True)
data["Reputation"].replace(np.inf, data["NumberOfFollowers"], inplace = True)
data.replace(np.nan, 0, inplace = True)

# Followings divided by the account age (followings per hour of account age).
data["FlwngsIncrease"] = data["NumberOfFollowings"].divide(data["AccountAge"])
data["FlwngsIncrease"].replace(np.inf, data["NumberOfFollowings"], inplace = True)
data.replace(np.nan, 0, inplace = True)

# Tweets divided by the account age in days (hours / 24).
data["TweetsPerDay"]  = data["NumberOfTweets"].divide((data["AccountAge"] / 24), fill_value=True)
data["TweetsPerDay"].replace(np.inf, data["NumberOfTweets"], inplace = True)
data.replace(np.nan, 0, inplace = True)

# Tweets divided by the account age in weeks (hours / (24 * 7)).
data["TweetsPerWeek"]  = data["NumberOfTweets"].divide((data["AccountAge"] / (24 * 7)), fill_value=True)
data["TweetsPerWeek"].replace(np.inf, data["NumberOfTweets"], inplace = True)
data.replace(np.nan, 0, inplace = True)

In [None]:
def clean_tweet(row):
    """Return the tweet text without mentions, hashtags and URLs.

    Args:
        row: mapping-like object whose "TweetWordsList" entry is an iterable
            of non-empty word strings.

    Returns:
        The remaining words joined by single spaces ("" if none remain).
    """
    kept_words = [
        word
        for word in row["TweetWordsList"]
        # Raw string avoids the invalid "\S" escape; the dot of "bit.ly" is
        # escaped so it only matches a literal dot.
        if word[0] not in ('@', '#')
        and not re.match(r"https?://\S+|bit\.ly\S+", word)
    ]
    return ' '.join(kept_words)

In [None]:
def count_urls(row):
    """Count the words of a tweet that start with a URL.

    Args:
        row: mapping-like object whose "TweetWordsList" entry is an iterable
            of word strings.

    Returns:
        Number of words beginning with "http://", "https://" or "bit.ly"
        followed by at least one non-space character.
    """
    # Raw string avoids the invalid "\S" escape; the dot of "bit.ly" is
    # escaped so it only matches a literal dot.
    return sum(
        1
        for word in row["TweetWordsList"]
        if re.match(r"https?://\S+|bit\.ly\S+", word)
    )

In [None]:
def count_hashtags(row):
    """Return how many words in row["TweetWordsList"] begin with '#'."""
    return sum(1 for token in row["TweetWordsList"] if token[0] == '#')

In [None]:
def count_mentions(row):
    """Return how many words in row["TweetWordsList"] begin with '@'."""
    mentioned = [token for token in row["TweetWordsList"] if token[0] == '@']
    return len(mentioned)

In [None]:
def count_uppercases(row):
    """Return how many words in row["CleanTweetWordsList"] start with an uppercase character."""
    return sum(1 for token in row["CleanTweetWordsList"] if token[0].isupper())

In [None]:
def longest_word(row):
    """Return the length of the longest word in row["CleanTweetWordsList"] (0 if there are no words)."""
    return max((len(token) for token in row["CleanTweetWordsList"]), default=0)

In [None]:
def average_word(row):
    """Return the mean word length of a cleaned tweet.

    Args:
        row: mapping-like object whose "CleanTweetWordsList" entry is an
            iterable of word strings.

    Returns:
        The average length as a float, or 0 when there are no words.
    """
    # A separate list keeps the builtin sum() usable (the old local named
    # "sum" shadowed it).
    lengths = [len(word) for word in row["CleanTweetWordsList"]]
    try:
        return sum(lengths) / len(lengths)
    except ZeroDivisionError:
        # No words at all: define the average as 0.
        return 0

In [None]:
def count_spamwords(row):
    """Return how many words in row["CleanTweetWordsList"] occur in the global `spamwords` collection."""
    hits = [token for token in row["CleanTweetWordsList"] if token in spamwords]
    return len(hits)

In [None]:
# Split the raw tweet on whitespace, build the cleaned text with clean_tweet
# (applied row by row), then split the cleaned text again.
data["TweetWordsList"] = data["Tweet"].str.split()
data['CleanTweet'] = data.apply(clean_tweet,axis=1)
data["CleanTweetWordsList"] = data["CleanTweet"].str.split()

In [None]:
#Content features
# Number of words in the cleaned tweet, taken from the token list itself.
# The previous "count spaces + 1, then turn 1 into 0" approach reported zero
# words for every one-word tweet, not just for empty ones.
data["NumberOfWords"] = data["CleanTweetWordsList"].apply(len)

# Length of the raw tweet in characters.
data["NumberOfSymbols"] = data["Tweet"].str.len()

# Number of space characters in the raw tweet.
data["NumberOfSpaces"] = data["Tweet"].str.count(' ')

# Number of cleaned words that start with an uppercase character.
data["NumberOfCl"] = data.apply(count_uppercases,axis=1)

# Capitalised words per word. Infinite ratios (zero words) are meant to fall
# back to the numerator, and NaN values are zeroed across the whole frame.
# NOTE(review): passing a Series as the replacement value, in place on a column
# selection, behaves differently across pandas versions - confirm it takes effect.
data["ClperWord"] = data["NumberOfCl"].divide(data["NumberOfWords"])
data["ClperWord"].replace(np.inf, data["NumberOfCl"], inplace = True)
data.replace(np.nan, 0, inplace = True)

# Longest and average cleaned-word length.
data['MaxWord'] = data.apply(longest_word,axis=1)

data['AvgWord'] = data.apply(average_word,axis=1)

# Exclamation and question marks in the raw tweet. The patterns are raw
# strings so the backslash is not an invalid string escape.
data["NumberOfEm"] = data["Tweet"].str.count(r'\!')

data["NumberOfQm"] = data["Tweet"].str.count(r'\?')

# URLs, mentions, hashtags and blacklisted words, each with a per-word ratio
# built the same way as ClperWord above.
data["NumberOfURLs"] = data.apply(count_urls,axis=1)

data["URLperWord"] = data["NumberOfURLs"].divide(data["NumberOfWords"])
data["URLperWord"].replace(np.inf, data["NumberOfURLs"], inplace = True)
data.replace(np.nan, 0, inplace = True)


data["NumberOfMentions"] = data.apply(count_mentions,axis=1)

data["MentionsperWord"] = data["NumberOfMentions"].divide(data["NumberOfWords"])
data["MentionsperWord"].replace(np.inf, data["NumberOfMentions"], inplace = True)
data.replace(np.nan, 0, inplace = True)

data["NumberOfHashtags"] = data.apply(count_hashtags,axis=1)

data["HashtagsperWord"] = data["NumberOfHashtags"].divide(data["NumberOfWords"])
data["HashtagsperWord"].replace(np.inf, data["NumberOfHashtags"], inplace = True)
data.replace(np.nan, 0, inplace = True)

data["NumberOfSpamWords"] = data.apply(count_spamwords,axis=1)

data["SpamWordsperWord"] = data["NumberOfSpamWords"].divide(data["NumberOfWords"])
data["SpamWordsperWord"].replace(np.inf, data["NumberOfSpamWords"], inplace = True)
data.replace(np.nan, 0, inplace = True)

In [None]:
# Drop tweets longer than 140 characters. Rows are removed by index label,
# using the index of the rows whose NumberOfSymbols exceeds 140.
data = data.drop(data[data.NumberOfSymbols > 140].index)

In [None]:
# Histogram of Reputation per class, restricted to accounts above 0.1.
plt.rcParams['figure.figsize'] = (20, 7)
plt.rcParams['font.size'] = 25

data0 = data[data.Reputation > 0.1]
plt.hist(
    [
        data0[data0.IsSpammer == 1].Reputation.values,
        data0[data0.IsSpammer == 0].Reputation.values,
    ],
    label=['Спамеры', 'Легитимные аккаунты'],
    alpha=0.99,
)
plt.legend()
plt.xlabel("Репутация")
plt.ylabel("Количество пользователей")
plt.show()

In [None]:
# Names of the model input columns, grouped into profile-derived features
# and features derived from the tweet text.
feature_cols = dict(
    UserFeatures=[
        "LengthOfScreenName", "LengthOfDesc",
        "NumberOfFollowings", "NumberOfFollowers", "NumberOfTweets",
        "AccountAge", "Flwrs/Flwngs", "Reputation",
        "FlwngsIncrease", "TweetsPerDay", "TweetsPerWeek",
    ],
    ContentFeatures=[
        "NumberOfWords", "NumberOfSymbols", "NumberOfSpaces",
        "NumberOfCl", "ClperWord",
        "MaxWord", "AvgWord",
        "NumberOfEm", "NumberOfQm",
        "NumberOfURLs", "URLperWord",
        "NumberOfHashtags", "HashtagsperWord",
        "NumberOfMentions", "MentionsperWord",
        "NumberOfSpamWords", "SpamWordsperWord",
    ],
)
# Feature matrix (user features followed by content features) and the
# target labels.
X = data[feature_cols["UserFeatures"] + feature_cols["ContentFeatures"]]
Y = data['IsSpammer']

In [None]:
#Normalize data: centre every column on its mean and divide by half of its
# value range (max - min).
# NOTE(review): a constant column has a zero range and would divide by zero;
# the statistics are also computed before the train/test split - confirm both
# are acceptable.
X = (X - X.mean())/ ((X.max() - X.min())/2)

In [None]:
# Split into training and test parts with a fixed seed; train_size=0.4 asks
# for 40% of the rows in the training part.
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, train_size=0.4, random_state = 0)

In [None]:
def print_report(RealValues, PredictedValues ):
    """Print accuracy, the classification report and the confusion matrix.

    RealValues are the true labels, PredictedValues the model's predictions;
    all three metrics come from sklearn's `metrics` module.
    """
    accuracy = metrics.accuracy_score(RealValues, PredictedValues)
    print('Accuracy:', accuracy)
    print('\nClassification report:')
    print(metrics.classification_report(RealValues, PredictedValues))
    print('\nConfusion matrix: ')
    print(metrics.confusion_matrix(RealValues, PredictedValues))

def print_cross_validation(model, X, Y):
    """Print the accuracy of each of 10 cross-validation folds and their mean."""
    fold_scores = cross_validation.cross_val_score(
        model, X, Y, cv=10, scoring='accuracy'
    )
    print(fold_scores)
    print('Cross Validation Score:', fold_scores.mean())

In [None]:
# Stratified shuffle-split over the training labels (10 iterations, 20% held
# out per iteration, fixed seed); passed as `cv` to every grid search below.
cv = cross_validation.StratifiedShuffleSplit(Y_train, n_iter = 10, test_size = 0.2, random_state = 0)

## Naive Bayes

In [None]:
# Bernoulli Naive Bayes with default settings; list its parameter names.
bern = naive_bayes.BernoulliNB()
bern.get_params().keys()

In [None]:
# Naive Bayes grid: try the classifier with and without fit_prior.
parameters_grid = dict(fit_prior=[True, False])

In [None]:
# Grid search over parameters_grid for the Naive Bayes model, scored by
# accuracy on the shared `cv` splits.
grid_cv = grid_search.GridSearchCV(bern, parameters_grid, scoring = 'accuracy', cv = cv)

In [None]:
%%time
# Run the grid search on the training split.
grid_cv.fit(X_train, Y_train)

In [None]:
# Take the best estimator found by the grid search, report its metrics on
# the test split, then cross-validate it on the full data set.
best_estimator = grid_cv.best_estimator_
Y_pred = best_estimator.predict(X_test)
print_report(Y_test, Y_pred)
print_cross_validation(best_estimator, X, Y)

In [None]:
# Best score reached by the grid search and the parameters that produced it.
print (grid_cv.best_score_)
print (grid_cv.best_params_)

In [None]:
# Show the scores recorded for every parameter combination.
grid_cv.grid_scores_

## KNN

In [None]:
# k-nearest-neighbours classifier with default settings.
knn = neighbors.KNeighborsClassifier()

In [None]:
# KNN grid: odd neighbour counts from 1 to 19, with both weighting schemes.
parameters_grid = dict(
    n_neighbors=list(range(1, 20, 2)),
    weights=['uniform', 'distance'],
)

In [None]:
# Grid search over parameters_grid for the KNN model, scored by accuracy on
# the shared `cv` splits.
grid_cv = grid_search.GridSearchCV(knn, parameters_grid, scoring = 'accuracy', cv = cv)

In [None]:
%%time
# Run the grid search on the training split.
grid_cv.fit(X_train, Y_train)

In [None]:
# Take the best estimator found by the grid search, report its metrics on
# the test split, then cross-validate it on the full data set.
best_estimator = grid_cv.best_estimator_
Y_pred = best_estimator.predict(X_test)
print_report(Y_test, Y_pred)
print_cross_validation(best_estimator, X, Y)

In [None]:
# Best score reached by the grid search and the parameters that produced it.
print (grid_cv.best_score_)
print (grid_cv.best_params_)

In [None]:
# Show the scores recorded for every parameter combination.
grid_cv.grid_scores_

## SVM

In [None]:
# Support-vector classifier with default settings. The class is spelled
# `SVC`; `svm.svc` is not a class in sklearn.svm.
svc = svm.SVC()

In [None]:
 parameters_grid = {
     'kernel' : ['linear', 'poly', 'rbf', 'sigmoid' ]
}

In [None]:
# Grid search over parameters_grid for the SVM model, scored by accuracy on
# the shared `cv` splits.
grid_cv = grid_search.GridSearchCV(svc, parameters_grid, scoring = 'accuracy', cv = cv)

In [None]:
%%time
# Run the grid search on the training split.
grid_cv.fit(X_train, Y_train)

In [None]:
# Take the best estimator found by the grid search, report its metrics on
# the test split, then cross-validate it on the full data set.
best_estimator = grid_cv.best_estimator_
Y_pred = best_estimator.predict(X_test)
print_report(Y_test, Y_pred)
print_cross_validation(best_estimator, X, Y)

In [None]:
# Best score reached by the grid search and the parameters that produced it.
print (grid_cv.best_score_)
print (grid_cv.best_params_)

In [None]:
# Show the scores recorded for every parameter combination.
grid_cv.grid_scores_

## Decision Tree

In [None]:
# Decision tree classifier with default settings; list its parameter names.
dt = tree.DecisionTreeClassifier()
dt.get_params().keys()

In [None]:
# Decision-tree grid: split criterion, splitter strategy and depth limit
# (None leaves max_depth unset).
parameters_grid = dict(
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
    max_depth=[1, 2, 3, None],
)

In [None]:
# Grid search over parameters_grid for the decision tree, scored by accuracy
# on the shared `cv` splits.
grid_cv = grid_search.GridSearchCV(dt, parameters_grid, scoring = 'accuracy', cv = cv)

In [None]:
%%time
# Run the grid search on the training split.
grid_cv.fit(X_train, Y_train)

In [None]:
# Take the best estimator found by the grid search, report its metrics on
# the test split, then cross-validate it on the full data set.
best_estimator = grid_cv.best_estimator_
Y_pred = best_estimator.predict(X_test)
print_report(Y_test, Y_pred)
print_cross_validation(best_estimator, X, Y)

In [None]:
# Best score reached by the grid search and the parameters that produced it.
print (grid_cv.best_score_)
print (grid_cv.best_params_)

In [None]:
# Show the scores recorded for every parameter combination.
grid_cv.grid_scores_

## Random Forest

In [None]:
# Random forest classifier with default settings.
rfc = ensemble.RandomForestClassifier()
#rfc.get_params().keys()

In [None]:
# Random-forest grid: 1 to 19 trees, split criterion, depth limit (None
# leaves max_depth unset) and bootstrap on or off.
parameters_grid = dict(
    n_estimators=list(range(1, 20)),
    criterion=['gini', 'entropy'],
    max_depth=[1, 2, 3, 4, 5, None],
    bootstrap=[True, False],
)

In [None]:
# Grid search over parameters_grid for the random forest, scored by accuracy
# on the shared `cv` splits.
grid_cv = grid_search.GridSearchCV(rfc, parameters_grid, scoring = 'accuracy', cv = cv)

In [None]:
%%time
# Run the grid search on the training split.
grid_cv.fit(X_train, Y_train)

In [None]:
# Take the best estimator found by the grid search, report its metrics on
# the test split, then cross-validate it on the full data set.
best_estimator = grid_cv.best_estimator_
Y_pred = best_estimator.predict(X_test)
print_report(Y_test, Y_pred)
print_cross_validation(best_estimator, X, Y)

In [None]:
# Best score reached by the grid search and the parameters that produced it.
print (grid_cv.best_score_)
print (grid_cv.best_params_)

In [None]:
# Show the last 20 entries of the recorded grid-search scores.
grid_cv.grid_scores_[-20:]

In [None]:
# Feature importances reported by the fitted forest.
importances = best_estimator.feature_importances_
# Spread of every feature's importance across the individual trees. Each
# tree's own importances are used; repeating the forest-level values for
# every tree always gave a standard deviation of zero.
std = np.std([estimator.feature_importances_
              for estimator in best_estimator.estimators_],
             axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]


# Plot the feature importances of the forest

plt.rcParams['figure.figsize'] = (20,7)
plt.rcParams['font.size'] = 15
plt.figure()
plt.title("Полезность признаков")
plt.bar(range(X_test.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
# Tick labels are the column names in importance order; positions are looked
# up on the column index, not used as column labels of the frame.
plt.xticks(range(X_test.shape[1]), list(X_test.columns[indices]), rotation='vertical')
plt.xlim([-1, X_test.shape[1]])
# Leave room below the axes for the vertical tick labels.
plt.gcf().subplots_adjust(bottom=0.5)
#plt.setp(legend.get_title(),fontsize='xx-small')
plt.savefig("featureImportance.png")
plt.show()

In [None]:
# Histogram of FlwngsIncrease per class, bins of width 5 between 10 and 95;
# the figure is also written to flwngsincrease.png.
plt.rcParams['figure.figsize'] = (20, 7)
plt.rcParams['font.size'] = 25

data0 = data
plt.hist(
    [
        data0[data0.IsSpammer == 1].FlwngsIncrease.values,
        data0[data0.IsSpammer == 0].FlwngsIncrease.values,
    ],
    label=['Спамеры', 'Легитимные пользователи'],
    alpha=0.99,
    bins=range(10, 100, 5),
)
plt.legend()
plt.xlabel("Прирост подписок (ПС/ВА)")
plt.ylabel("Количество пользователей")
plt.savefig("flwngsincrease.png")
plt.show()

In [None]:
# Histogram of NumberOfFollowings per class, bins of width 10 between 10 and
# 190; the figure is also written to NumberOfFollowings.png.
plt.rcParams['figure.figsize'] = (20, 7)
plt.rcParams['font.size'] = 25

data0 = data
plt.hist(
    [
        data0[data0.IsSpammer == 1].NumberOfFollowings.values,
        data0[data0.IsSpammer == 0].NumberOfFollowings.values,
    ],
    label=['Спамеры', 'Легитимные пользователи'],
    alpha=0.99,
    bins=range(10, 200, 10),
)
plt.legend()
plt.xlabel("Количество подписок")
plt.ylabel("Количество пользователей")
plt.savefig("NumberOfFollowings.png")
plt.show()

In [None]:
# Histogram of NumberOfTweets per class, bins of width 10 between 10 and
# 1000; this figure is only shown, not saved.
plt.rcParams['figure.figsize'] = (20, 7)
plt.rcParams['font.size'] = 25

data0 = data
plt.hist(
    [
        data0[data0.IsSpammer == 1].NumberOfTweets.values,
        data0[data0.IsSpammer == 0].NumberOfTweets.values,
    ],
    label=['Спамеры', 'Легитимные пользователи'],
    alpha=0.99,
    bins=range(10, 1010, 10),
)
plt.legend()
plt.xlabel("Количество твитов")
plt.ylabel("Количество пользователей")
#plt.savefig("reputation.png")
plt.show()

In [None]:
def result(row):
    """Name the confusion-matrix cell of one prediction.

    `row` must provide 'IsCorrect' (whether the prediction matched) and
    'Actual' (the true label: 1 for spammer, 0 for legitimate). Returns
    "TruePositive", "TrueNegative", "FalsePositive" or "FalseNegative";
    any other 'Actual' value yields None.
    """
    actual = row['Actual']
    if row['IsCorrect']:
        if actual == 1:
            return "TruePositive"
        if actual == 0:
            return "TrueNegative"
    else:
        if actual == 0:
            return "FalsePositive"
        if actual == 1:
            return "FalseNegative"
    return None

In [None]:
# Per-sample comparison of predictions and true labels, with each row
# classified by result(). Y_pred holds the predictions of whichever model
# cell was evaluated last.
Compare = {"Predicted" : Y_pred, 'Actual': Y_test, 'IsCorrect': Y_pred == Y_test}
Tests = pd.DataFrame(data=Compare)
Tests['Result'] = Tests.apply(result,axis=1) 
Tests

In [None]:
# Show only the rows classified as false positives.
Tests.loc[Tests['Result'] == 'FalsePositive']

In [None]:
# Full feature rows of the false positives, looked up by index label, and
# the percentage of false positives for each URL count.
FalsePositives = data.loc[Tests.loc[Tests['Result'] == 'FalsePositive'].index]  # .to_latex()
URLS = 100. * FalsePositives.NumberOfURLs.value_counts() / len(FalsePositives.NumberOfURLs)
URLS

In [None]:
# Tweets of a few hand-picked false-positive users.
# NOTE(review): the index labels are hard-coded; the lookup presumably fails
# or yields empty rows if a label is not among the current false positives -
# confirm.
TweetExamples = FalsePositives.loc[[83076908,58205927,27059567,27094415,674433]]
TweetExamples.Tweet

In [None]:
# Scatter plot of account age against number of followings: all accounts by
# class first, then the misclassified ones drawn on top.
fig = plt.figure()
ax1 = fig.add_subplot(111)

ax1.scatter(x=data['AccountAge'].loc[data['IsSpammer'] == 1],
            y=data['NumberOfFollowings'].loc[data['IsSpammer'] == 1],
            color='Red', label="Спамовые аккаунты")
ax1.scatter(x=data['AccountAge'].loc[data['IsSpammer'] == 0],
            y=data['NumberOfFollowings'].loc[data['IsSpammer'] == 0],
            color='Green', label="Легитимные аккаунты")
ax1.scatter(x=data['AccountAge'].loc[Tests.loc[Tests['Result'] == 'FalseNegative'].index],
            y=data['NumberOfFollowings'].loc[Tests.loc[Tests['Result'] == 'FalseNegative'].index],
            color='Yellow', label="False Negatives")
ax1.scatter(x=data['AccountAge'].loc[Tests.loc[Tests['Result'] == 'FalsePositive'].index],
            y=data['NumberOfFollowings'].loc[Tests.loc[Tests['Result'] == 'FalsePositive'].index],
            color='Black', label="False Positives")
plt.xlabel("Возраст аккаунта")
# Axis label reads "number of followings"; the two words were run together.
plt.ylabel("Количество подписок")
plt.legend(loc='upper right')
plt.savefig("ageincreasewitherrors")
plt.show()

## Feature extraction time

In [None]:
# Rebuild the raw data set from both classes and keep only its first 9000
# rows; the cells below repeat the feature extraction on this sample.
data = pd.concat([spammers, nonspammers])
data = data[:9000]

In [None]:
%%time
#User features

# Columns already present in the loaded profile data:
# LengthOfScreenName
# LengthOfDescriptionInUserProfile
# NumberofFollowings
# NumberofFollowers
# NumberofTweets
# Parse both profile timestamps; their difference, cast to hour units, is the account age.
data["ProfileCreatedAt"] = pd.to_datetime(data["ProfileCreatedAt"])
data["ProfileCollectedAt"] = pd.to_datetime(data["ProfileCollectedAt"])
data["AccountAge"] = (data["ProfileCollectedAt"] - data["ProfileCreatedAt"]).astype('timedelta64[h]')

# Followers divided by followings.
# NOTE(review): fill_value=True is presumably treated as 1 for missing operands - confirm.
data["Flwrs/Flwngs"] = data["NumberOfFollowers"].divide(data["NumberOfFollowings"], fill_value=True)
# Infinite ratios (division by zero) are meant to fall back to the numerator.
# NOTE(review): passing a Series as the replacement value, in place on a column
# selection, behaves differently across pandas versions - confirm it takes effect.
data["Flwrs/Flwngs"].replace(np.inf, data["NumberOfFollowers"], inplace = True)
# Replace every NaN in the whole frame (not only the new column) with 0.
data.replace(np.nan, 0, inplace = True)

# Followers divided by (followers + followings).
data["Reputation"] = data["NumberOfFollowers"].divide((data["NumberOfFollowers"] + data["NumberOfFollowings"]), fill_value=True)
data["Reputation"].replace(np.inf, data["NumberOfFollowers"], inplace = True)
data.replace(np.nan, 0, inplace = True)

# Followings divided by the account age (followings per hour of account age).
data["FlwngsIncrease"] = data["NumberOfFollowings"].divide(data["AccountAge"])
data["FlwngsIncrease"].replace(np.inf, data["NumberOfFollowings"], inplace = True)
data.replace(np.nan, 0, inplace = True)

# Tweets divided by the account age in days (hours / 24).
data["TweetsPerDay"]  = data["NumberOfTweets"].divide((data["AccountAge"] / 24), fill_value=True)
data["TweetsPerDay"].replace(np.inf, data["NumberOfTweets"], inplace = True)
data.replace(np.nan, 0, inplace = True)

# Tweets divided by the account age in weeks (hours / (24 * 7)).
data["TweetsPerWeek"]  = data["NumberOfTweets"].divide((data["AccountAge"] / (24 * 7)), fill_value=True)
data["TweetsPerWeek"].replace(np.inf, data["NumberOfTweets"], inplace = True)
data.replace(np.nan, 0, inplace = True)

In [None]:
# Split the raw tweet on whitespace, build the cleaned text with clean_tweet
# (applied row by row), then split the cleaned text again.
data["TweetWordsList"] = data["Tweet"].str.split()
data['CleanTweet'] = data.apply(clean_tweet,axis=1)
data["CleanTweetWordsList"] = data["CleanTweet"].str.split()

In [None]:
%%time
#Content features

# Number of words in the cleaned tweet, taken from the token list itself.
# The previous "count spaces + 1, then turn 1 into 0" approach reported zero
# words for every one-word tweet, not just for empty ones.
data["NumberOfWords"] = data["CleanTweetWordsList"].apply(len)

# Length of the raw tweet in characters.
data["NumberOfSymbols"] = data["Tweet"].str.len()

# Number of space characters in the raw tweet.
data["NumberOfSpaces"] = data["Tweet"].str.count(' ')

# Number of cleaned words that start with an uppercase character.
data["NumberOfCl"] = data.apply(count_uppercases,axis=1)

# Capitalised words per word. Infinite ratios (zero words) are meant to fall
# back to the numerator, and NaN values are zeroed across the whole frame.
# NOTE(review): passing a Series as the replacement value, in place on a column
# selection, behaves differently across pandas versions - confirm it takes effect.
data["ClperWord"] = data["NumberOfCl"].divide(data["NumberOfWords"])
data["ClperWord"].replace(np.inf, data["NumberOfCl"], inplace = True)
data.replace(np.nan, 0, inplace = True)

# Longest and average cleaned-word length.
data['MaxWord'] = data.apply(longest_word,axis=1)

data['AvgWord'] = data.apply(average_word,axis=1)

# Exclamation and question marks in the raw tweet. The patterns are raw
# strings so the backslash is not an invalid string escape.
data["NumberOfEm"] = data["Tweet"].str.count(r'\!')

data["NumberOfQm"] = data["Tweet"].str.count(r'\?')

# URLs, mentions, hashtags and blacklisted words, each with a per-word ratio
# built the same way as ClperWord above.
data["NumberOfURLs"] = data.apply(count_urls,axis=1)

data["URLperWord"] = data["NumberOfURLs"].divide(data["NumberOfWords"])
data["URLperWord"].replace(np.inf, data["NumberOfURLs"], inplace = True)
data.replace(np.nan, 0, inplace = True)


data["NumberOfMentions"] = data.apply(count_mentions,axis=1)

data["MentionsperWord"] = data["NumberOfMentions"].divide(data["NumberOfWords"])
data["MentionsperWord"].replace(np.inf, data["NumberOfMentions"], inplace = True)
data.replace(np.nan, 0, inplace = True)

data["NumberOfHashtags"] = data.apply(count_hashtags,axis=1)

data["HashtagsperWord"] = data["NumberOfHashtags"].divide(data["NumberOfWords"])
data["HashtagsperWord"].replace(np.inf, data["NumberOfHashtags"], inplace = True)
data.replace(np.nan, 0, inplace = True)

data["NumberOfSpamWords"] = data.apply(count_spamwords,axis=1)

data["SpamWordsperWord"] = data["NumberOfSpamWords"].divide(data["NumberOfWords"])
data["SpamWordsperWord"].replace(np.inf, data["NumberOfSpamWords"], inplace = True)
data.replace(np.nan, 0, inplace = True)