# Twitter Spam Classifier

## Dataset: Social Honeypot (ICWSM 2011)

In [1]:
# Download the Social Honeypot (ICWSM 2011) archive into the working directory
# and unpack it; `unzip` only runs if `curl` exits successfully.
# NOTE(review): later cells read ./content_polluters.txt etc. — confirm the
# archive extracts those files directly into the working directory.
!curl -O 'http://infolab.tamu.edu/static/users/kyumin/social_honeypot_icwsm_2011.zip'&& unzip social_honeypot_icwsm_2011.zip

In [3]:
import copy
import itertools
import os
import re
from itertools import chain

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.calibration import calibration_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

try:
    # scikit-learn >= 0.18 ships these helpers in `model_selection`.
    from sklearn.model_selection import train_test_split, cross_val_score
except ImportError:
    # Fallback for older releases that still provide `cross_validation`.
    from sklearn.cross_validation import train_test_split, cross_val_score

# Let pandas render up to 101 columns before truncating wide frames.
pd.options.display.max_columns = 101

In [None]:
# Spammers ("content polluters"): one table of profile statistics and one
# table of tweets. Both files have no header row, so column names are
# supplied explicitly.
content_polluters = pd.read_table(
    './content_polluters.txt',
    names=[
        "UserID",
        "ProfileCreatedAt",
        "ProfileCollectedAt",
        "NumberOfFollowings",
        "NumberOfFollowers",
        "NumberOfTweets",
        "LengthOfScreenName",
        "LengthOfDescriptionInUserProfile",
    ],
    header=None,
)
content_polluters_tweets = pd.read_table(
    './content_polluters_tweets.txt',
    names=["UserID", "TweetID", "Tweet", "TweetCreatedAt"],
    header=None,
)

In [None]:
# One tweet per user (the latest by publication date): index the profiles by
# UserID, then order tweets newest-first and take the first non-null value of
# every column within each user's group.
# NOTE(review): TweetCreatedAt has not been parsed to datetime at this point,
# so the ordering relies on the raw values sorting chronologically — confirm.
content_polluters = content_polluters.set_index(keys="UserID")
content_polluters_tweets_distinct = (
    content_polluters_tweets
    .sort_values(by='TweetCreatedAt', ascending=False)
    .groupby(['UserID'])
    .first()
)

In [None]:
# Legitimate users: profile statistics and tweets, in the same headerless
# layout as the content-polluter files.
legitimate_users = pd.read_table(
    './legitimate_users.txt',
    names=[
        "UserID",
        "ProfileCreatedAt",
        "ProfileCollectedAt",
        "NumberOfFollowings",
        "NumberOfFollowers",
        "NumberOfTweets",
        "LengthOfScreenName",
        "LengthOfDescriptionInUserProfile",
    ],
    header=None,
)
legitimate_users_tweets = pd.read_table(
    './legitimate_users_tweets.txt',
    names=["UserID", "TweetID", "Tweet", "TweetCreatedAt"],
    header=None,
)

In [None]:
# One tweet per user (the latest by publication date): index the profiles by
# UserID, then order tweets newest-first and take the first non-null value of
# every column within each user's group.
# NOTE(review): TweetCreatedAt has not been parsed to datetime at this point,
# so the ordering relies on the raw values sorting chronologically — confirm.
legitimate_users = legitimate_users.set_index(keys="UserID")
legitimate_users_tweets_distinct = (
    legitimate_users_tweets
    .sort_values(by='TweetCreatedAt', ascending=False)
    .groupby(['UserID'])
    .first()
)

In [None]:
# Join profile and tweet columns on the shared UserID index (users present in
# both tables only) and label every row as a spammer.
spammers = pd.concat(
    [content_polluters, content_polluters_tweets_distinct],
    join='inner',
    axis=1,
).assign(IsSpammer=1)

In [None]:
# Join profile and tweet columns on the shared UserID index (users present in
# both tables only) and label every row as a legitimate user.
nonspammers = pd.concat(
    [legitimate_users, legitimate_users_tweets_distinct],
    join='inner',
    axis=1,
).assign(IsSpammer=0)

In [None]:
# Read the spam-word blacklist, one entry per line.
with open("blacklist.txt") as blacklist_file:
    spamwords = blacklist_file.read().splitlines()


In [None]:
# Stack both classes into one frame; rows stay indexed by UserID.
data = pd.concat([spammers, nonspammers])

# Parse the timestamp columns.
data["ProfileCreatedAt"] = pd.to_datetime(data["ProfileCreatedAt"])
data["TweetCreatedAt"] = pd.to_datetime(data["TweetCreatedAt"])
data["ProfileCollectedAt"] = pd.to_datetime(data["ProfileCollectedAt"])

# --- Profile-level features -------------------------------------------------
# `fill_value=1` substitutes 1 for a missing operand before dividing.
data["Reputation"] = data["NumberOfFollowers"].divide(
    data["NumberOfFollowers"] + data["NumberOfFollowings"], fill_value=1)
# Account age in whole hours. Computed from total seconds instead of
# `.astype('timedelta64[h]')`, which recent pandas releases reject; flooring
# keeps the whole-hour values the old cast produced.
data["AccountAge"] = np.floor(
    (data["ProfileCollectedAt"] - data["ProfileCreatedAt"]).dt.total_seconds() / 3600)
data["Flwrs/Flwngs"] = data["NumberOfFollowers"].divide(data["NumberOfFollowings"], fill_value=1)
data["FlwngsIncrease"] = data["NumberOfFollowings"].divide(data["AccountAge"])
data["TweetsPerDay"] = data["NumberOfTweets"].divide(data["AccountAge"] / 24, fill_value=1)
data["TweetsPerWeek"] = data["NumberOfTweets"].divide(data["AccountAge"] / (24 * 7), fill_value=1)

# --- Tweet-content features -------------------------------------------------
# Patterns are raw strings so backslashes reach the regex engine unchanged.
data["NumberOfSpaces"] = data["Tweet"].str.count(' ')
data["NumberOfWords"] = data["NumberOfSpaces"] + 1  # words approximated as spaces + 1
data["NumberOfSymbols"] = data["Tweet"].str.len()
data["NumberOfQm"] = data["Tweet"].str.count(r'\?')
data["NumberOfEm"] = data["Tweet"].str.count(r'\!')
data["NumberOfURLs"] = data["Tweet"].str.count(r'(https://\S+|http://\S+)')
data["NumberOfMentions"] = data["Tweet"].str.count(r'(@\S+)')
data["NumberOfHashtags"] = data["Tweet"].str.count(r'(#\S+)')
# Blacklist-based feature, currently disabled:
#data["NumberOfSpamWords"] = data["Tweet"].str.count('(' + '|'.join(spamwords) + ')')

# Divisions by zero yield +/-inf or NaN and missing tweets yield NaN counts;
# zero them out. Only numeric columns are touched so that text and datetime
# columns are not overwritten with the number 0.
numeric_cols = data.select_dtypes(include=[np.number]).columns
data[numeric_cols] = data[numeric_cols].replace([np.inf, -np.inf], np.nan).fillna(0)

In [None]:
# Histogram of the Reputation feature, spammers vs. legitimate users.
plt.rcParams['figure.figsize'] = (20,7)
plt.rcParams['font.size'] = 25

# Only accounts with Reputation above 0.1 are plotted.
data0 = data[data.Reputation > .1]
plt.hist([data0[data0.IsSpammer==1].Reputation.values,
          data0[data0.IsSpammer==0].Reputation.values],label=["Спамер","Легитимный пользователь"],
          alpha = .99)
plt.legend()
plt.xlabel("Репутация")
plt.ylabel("Количество пользователей")
# savefig does not create missing directories; make sure ./pics exists first.
os.makedirs("./pics", exist_ok=True)
plt.savefig("./pics/reputation.png")
plt.show()

In [None]:
# Feature columns grouped by origin: profile statistics vs. tweet text.
feature_cols = {
    "UserFeatures": [
        "LengthOfScreenName",
        "LengthOfDescriptionInUserProfile",
        "NumberOfFollowings",
        "NumberOfFollowers",
        "NumberOfTweets",
        "AccountAge",
        "Flwrs/Flwngs",
        "FlwngsIncrease",
        "TweetsPerDay",
        "TweetsPerWeek",
    ],
    "ContentFeatures": [
        "NumberOfWords",
        "NumberOfSymbols",
        "NumberOfSpaces",
        "NumberOfEm",
        "NumberOfQm",
        "NumberOfURLs",
        "NumberOfHashtags",
        "NumberOfMentions",
    ],
}

# Design matrix (user features first, then content features) and target labels.
X = data[[*feature_cols["UserFeatures"], *feature_cols["ContentFeatures"]]]
Y = data['IsSpammer']
X.head()

In [None]:

# Hold out a test split. No test_size is given, so the library default
# applies; the seed is fixed so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=1,
)

In [None]:
def print_report(RealValues, PredictedValues ):
    """Print the accuracy of `PredictedValues` measured against `RealValues`.

    Both arguments are passed straight to `metrics.accuracy_score`.
    A fuller report (`metrics.classification_report`,
    `metrics.confusion_matrix`) is currently not printed.
    """
    accuracy = metrics.accuracy_score(RealValues, PredictedValues)
    print('Accuracy: %s' % accuracy)

def print_cross_validation(model, X, Y):
    """Print the mean accuracy of `model` over 10-fold cross-validation on (X, Y)."""
    fold_scores = cross_val_score(model, X, Y, scoring='accuracy', cv=10)
    print('Cross Validation Score: %s' % fold_scores.mean())

## KNN

In [None]:
# Try odd neighbourhood sizes 1, 3, ..., 19; for each, report hold-out
# accuracy and 10-fold cross-validated accuracy. `knn` keeps the last model.
for k in range(1, 20, 2):
    print("k = %s" % k)
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, Y_train)
    Y_pred = knn.predict(X_test)
    print_report(RealValues=Y_test, PredictedValues=Y_pred)
    print_cross_validation(model=knn, X=X, Y=Y)
    print()

## Decision Tree

In [None]:
# Decision tree with default settings: fit on the training split and report
# accuracy on the held-out split.
dt = DecisionTreeClassifier()
dt.fit(X=X_train, y=Y_train)
Y_pred = dt.predict(X_test)
print_report(RealValues=Y_test, PredictedValues=Y_pred)

In [None]:
# 10-fold cross-validated accuracy of the decision tree on the full data set.
print_cross_validation(model=dt, X=X, Y=Y)

In [None]:
# Keep only the features the fitted tree considers important, then train and
# evaluate a tree on that reduced feature set.
model = SelectFromModel(dt, prefit=True)

# Reduce BOTH splits with the same selector: a tree trained on the reduced
# matrix cannot predict on the full-width X_test.
X_new = model.transform(X_train)
X_test_new = model.transform(X_test)
print(X_train.shape)
print(X_new.shape)

# Fit a separate tree so `dt` (which the prefit selector wraps) stays the
# full-feature model.
dt_selected = DecisionTreeClassifier()
dt_selected.fit(X_new, Y_train)
Y_pred = dt_selected.predict(X_test_new)
print_report(Y_test, Y_pred)

## Naive Bayes

In [None]:
# Bernoulli naive Bayes with default settings: fit on the training split and
# report accuracy on the held-out split.
bern = BernoulliNB()
bern.fit(X=X_train, y=Y_train)
Y_pred = bern.predict(X_test)
print_report(RealValues=Y_test, PredictedValues=Y_pred)

In [None]:
# 10-fold cross-validated accuracy of the naive Bayes model on the full data set.
print_cross_validation(model=bern, X=X, Y=Y)

## Random Forest

In [None]:
# Preview the four columns used for the reduced random-forest model.
X_train.loc[:, ['NumberOfURLs', 'FlwngsIncrease', 'Flwrs/Flwngs', 'NumberOfFollowings']]

In [None]:
def combinations(target, data, X, y, estimator, cv=10, max_size=None, verbose=True):
    """Exhaustively search feature subsets for the best cross-validated score.

    Every non-empty combination of the candidate features in `data` is
    appended to the fixed prefix `target`, scored with `cross_val_score`,
    and the best-scoring subset is returned.

    Parameters
    ----------
    target : sequence
        Features that are part of every evaluated subset (may be empty).
    data : sequence
        Candidate features to choose from: column labels when `X` is a
        DataFrame, column positions when `X` is a 2-D array.
    X, y :
        Feature matrix and target passed on to `cross_val_score`.
    estimator :
        Estimator passed on to `cross_val_score`.
    cv : int, optional
        Number of cross-validation folds (default 10).
    max_size : int or None, optional
        Largest number of candidates added to `target`; None (default)
        means all of them. The number of evaluations grows as 2**len(data),
        so cap this for anything but a small candidate list.
    verbose : bool, optional
        Print each subset before it is scored (default True).

    Returns
    -------
    (best_subset, best_score) : tuple
        The best subset as a list and its mean CV score, or
        (None, -inf) when there are no candidates to evaluate.
    """
    base = list(target)
    candidates = list(data)
    limit = len(candidates) if max_size is None else min(max_size, len(candidates))

    def _columns(subset):
        # DataFrames are sliced by label, plain arrays by column position.
        if hasattr(X, "loc"):
            return X.loc[:, subset]
        return X[:, subset]

    best_subset, best_score = None, -np.inf
    for size in range(1, limit + 1):
        for extra in itertools.combinations(candidates, size):
            subset = base + list(extra)
            if verbose:
                print(subset)
            score = cross_val_score(estimator, _columns(subset), y, cv=cv).mean()
            # Strict comparison: on ties the earlier (smaller) subset wins.
            if score > best_score:
                best_subset, best_score = subset, score
    return best_subset, best_score


rfc = RandomForestClassifier()
rfc.fit(X_train, Y_train)
# WARNING: with no `max_size` this scores every non-empty feature subset.
best_subset, best_score = combinations([], X_train.columns, X_train, Y_train, rfc)
print((best_subset, best_score))


In [None]:
# Repeat the feature-subset search on the full data set.
# The candidate list gets its own name: rebinding `data` here would replace
# the feature DataFrame that earlier cells depend on.
target = []
candidate_features = list(X.columns)

combinations(target, candidate_features, X, Y, RandomForestClassifier())

In [None]:
# Random forest restricted to a hand-picked four-feature subset.
selected_features = ['NumberOfURLs','FlwngsIncrease','Flwrs/Flwngs','NumberOfFollowings']

rfc = RandomForestClassifier()
rfc.fit(X_train[selected_features], Y_train)
Y_pred = rfc.predict(X_test[selected_features])
print_report(Y_test, Y_pred)
# Cross-validate on the same columns the model is trained and tested on, so
# both reported scores describe the same model.
print_cross_validation(rfc, X[selected_features], Y)

# Reliability diagram (top panel) and histogram of predicted scores (bottom
# panel) for every classifier trained above.
plt.figure(figsize=(10, 10))
ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
ax2 = plt.subplot2grid((3, 1), (2, 0))

# Diagonal reference line.
ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")

for clf, name in [
    (knn, 'kNN'),
    (dt, 'Decision Tree'),
    (bern, 'Naive Bayes'),
    (rfc, 'Random Forest'),
]:
    # Every model is refit on the full training feature set before scoring.
    clf.fit(X_train, Y_train)

    if not hasattr(clf, "predict_proba"):
        # No probabilities available: min-max scale the decision function to [0, 1].
        prob_pos = clf.decision_function(X_test)
        prob_pos = (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min())
    else:
        # Second column of predict_proba (positive class).
        prob_pos = clf.predict_proba(X_test)[:, 1]

    fraction_of_positives, mean_predicted_value = calibration_curve(
        Y_test, prob_pos, n_bins=10)

    ax1.plot(mean_predicted_value, fraction_of_positives, "s-", label=name)
    ax2.hist(prob_pos, bins=10, range=(0, 1), histtype="step", lw=2, label=name)

# Top panel cosmetics.
ax1.set_ylabel("Fraction of positives")
ax1.set_ylim([-0.05, 1.05])
ax1.set_title('Calibration plots  (reliability curve)')
ax1.legend(loc="lower right")

# Bottom panel cosmetics.
ax2.set_xlabel("Mean predicted value")
ax2.set_ylabel("Count")
ax2.legend(loc="upper center", ncol=2)

plt.tight_layout()
plt.show()

## SVM

In [None]:
#svm = SVC()
#svm.fit(X_train, Y_train)
#Y_pred = svm.predict(X_test)
#print_report(Y_test, Y_pred)

In [None]:
#scores = cross_val_score(svm, X, Y, cv=10, scoring='accuracy')
#print(scores)
#print(scores.mean()) 