## Detect fake profiles in online social networks using Support Vector Machine

In [15]:
import sys
import csv
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import gender_guesser.detector as gender
from sklearn import metrics
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split,KFold,GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
%matplotlib inline 

###### function for reading dataset from csv files

In [16]:
def read_datasets():
    """ Reads users profile from csv files """
    genuine_users = pd.read_csv("data/users.csv")
    fake_users = pd.read_csv("data/fusers.csv")
    # print genuine_users.columns
    # print genuine_users.describe()
    #print fake_users.describe()
    x=pd.concat([genuine_users,fake_users])   
    y=len(fake_users)*[0] + len(genuine_users)*[1]
    return x,y
    

###### function for predicting sex using name of person

In [17]:
def predict_sex(name):
    sex_predictor = gender.Detector(case_sensitive=False)
    first_name= name.str.split(' ').str.get(0)
    sex= first_name.apply(sex_predictor.get_gender)
    sex_dict={'female':-2,'mostly_female':-1,'unknown':0,'mostly_male':1,'male':2,'andy':3}
    sex_code = sex.map(sex_dict).astype(int)
    return sex_code

###### function for feature engineering

In [18]:
def extract_features(x):
    lang_list = list(enumerate(np.unique(x['lang'])))   
    lang_dict = { name : i for i, name in lang_list }             
    x.loc[:,'lang_code'] = x['lang'].map( lambda x: lang_dict[x]).astype(int)    
    x.loc[:,'sex_code']=predict_sex(x['name'])
    feature_columns_to_use = ['statuses_count','followers_count','friends_count','favourites_count','listed_count','sex_code','lang_code']
    x=x.loc[:,feature_columns_to_use]
    return x

###### function for ploting learning curve

In [5]:
'''def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt'''

'def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,\n                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):\n    plt.figure()\n    plt.title(title)\n    if ylim is not None:\n        plt.ylim(*ylim)\n    plt.xlabel("Training examples")\n    plt.ylabel("Score")\n    train_sizes, train_scores, test_scores = learning_curve(\n        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)\n    train_scores_mean = np.mean(train_scores, axis=1)\n    train_scores_std = np.std(train_scores, axis=1)\n    test_scores_mean = np.mean(test_scores, axis=1)\n    test_scores_std = np.std(test_scores, axis=1)\n    plt.grid()\n\n    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,\n                     train_scores_mean + train_scores_std, alpha=0.1,\n                     color="r")\n    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,\n                     test_scores_mean + test_scores_std, alpha=0.1, color="g")\n    plt.plo

###### function for plotting confusion matrix

In [6]:
'''def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    target_names=['Fake','Genuine']
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(target_names))
    plt.xticks(tick_marks, target_names, rotation=45)
    plt.yticks(tick_marks, target_names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')'''

"def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):\n    target_names=['Fake','Genuine']\n    plt.imshow(cm, interpolation='nearest', cmap=cmap)\n    plt.title(title)\n    plt.colorbar()\n    tick_marks = np.arange(len(target_names))\n    plt.xticks(tick_marks, target_names, rotation=45)\n    plt.yticks(tick_marks, target_names)\n    plt.tight_layout()\n    plt.ylabel('True label')\n    plt.xlabel('Predicted label')"

###### function for plotting ROC curve

In [7]:
'''def plot_roc_curve(y_test, y_pred):
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
    print("False Positive rate: ",false_positive_rate)
    print("True Positive rate: ",true_positive_rate)
    
    roc_auc = auc(false_positive_rate, true_positive_rate)

    plt.title('Receiver Operating Characteristic')
    plt.plot(false_positive_rate, true_positive_rate, 'b',
    label='AUC = %0.2f'% roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0,1],[0,1],'r--')
    plt.xlim([-0.1,1.2])
    plt.ylim([-0.1,1.2])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()'''

'def plot_roc_curve(y_test, y_pred):\n    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)\n    print("False Positive rate: ",false_positive_rate)\n    print("True Positive rate: ",true_positive_rate)\n    \n    roc_auc = auc(false_positive_rate, true_positive_rate)\n\n    plt.title(\'Receiver Operating Characteristic\')\n    plt.plot(false_positive_rate, true_positive_rate, \'b\',\n    label=\'AUC = %0.2f\'% roc_auc)\n    plt.legend(loc=\'lower right\')\n    plt.plot([0,1],[0,1],\'r--\')\n    plt.xlim([-0.1,1.2])\n    plt.ylim([-0.1,1.2])\n    plt.ylabel(\'True Positive Rate\')\n    plt.xlabel(\'False Positive Rate\')\n    plt.show()'

###### Function for training data using Support Vector Machine

In [8]:
'''def train(X_train,y_train,X_test):
    """ Trains and predicts dataset with a SVM classifier """
    # Scaling features
    X_train=preprocessing.scale(X_train)
    X_test=preprocessing.scale(X_test)

    Cs = 10.0 ** np.arange(-2,3,.5)
    gammas = 10.0 ** np.arange(-2,3,.5)
    param = [{'gamma': gammas, 'C': Cs}]
    cvk = KFold(n_splits=5,shuffle=True)
    classifier = SVC()
    clf = GridSearchCV(classifier,param_grid=param,cv=cvk)
    clf.fit(X_train,y_train)
    print("The best classifier is: ",clf.best_estimator_)
    clf.best_estimator_.fit(X_train,y_train)
    # Estimate score
    scores = cross_validation.cross_val_score(clf.best_estimator_, X_train,y_train, cv=5)
    print(scores)
    print('Estimated score: %0.5f (+/- %0.5f)' % (scores.mean(), scores.std() / 2))
    title = 'Learning Curves (SVM, rbf kernel, $\gamma=%.6f$)' %clf.best_estimator_.gamma
    plot_learning_curve(clf.best_estimator_, title, X_train, y_train, cv=5)
    plt.show()
    # Predict class
    y_pred = clf.best_estimator_.predict(X_test)
    return y_test,y_pred'''

'def train(X_train,y_train,X_test):\n    """ Trains and predicts dataset with a SVM classifier """\n    # Scaling features\n    X_train=preprocessing.scale(X_train)\n    X_test=preprocessing.scale(X_test)\n\n    Cs = 10.0 ** np.arange(-2,3,.5)\n    gammas = 10.0 ** np.arange(-2,3,.5)\n    param = [{\'gamma\': gammas, \'C\': Cs}]\n    cvk = KFold(n_splits=5,shuffle=True)\n    classifier = SVC()\n    clf = GridSearchCV(classifier,param_grid=param,cv=cvk)\n    clf.fit(X_train,y_train)\n    print("The best classifier is: ",clf.best_estimator_)\n    clf.best_estimator_.fit(X_train,y_train)\n    # Estimate score\n    scores = cross_validation.cross_val_score(clf.best_estimator_, X_train,y_train, cv=5)\n    print(scores)\n    print(\'Estimated score: %0.5f (+/- %0.5f)\' % (scores.mean(), scores.std() / 2))\n    title = \'Learning Curves (SVM, rbf kernel, $\\gamma=%.6f$)\' %clf.best_estimator_.gamma\n    plot_learning_curve(clf.best_estimator_, title, X_train, y_train, cv=5)\n    plt.show()\n 

In [19]:
print("reading datasets.....\n")
x,y=read_datasets()

reading datasets.....



In [20]:
print("extracting featues.....\n")
x=extract_features(x)
print(x.columns)
print(x.describe())

extracting featues.....

Index(['statuses_count', 'followers_count', 'friends_count',
       'favourites_count', 'listed_count', 'sex_code', 'lang_code'],
      dtype='object')
       statuses_count  followers_count  friends_count  favourites_count  \
count     2818.000000      2818.000000    2818.000000       2818.000000   
mean      1672.198368       371.105039     395.363023        234.541164   
std       4884.669157      8022.631339     465.694322       1445.847248   
min          0.000000         0.000000       0.000000          0.000000   
25%         35.000000        17.000000     168.000000          0.000000   
50%         77.000000        26.000000     306.000000          0.000000   
75%       1087.750000       111.000000     519.000000         37.000000   
max      79876.000000    408372.000000   12773.000000      44349.000000   

       listed_count     sex_code    lang_code  
count   2818.000000  2818.000000  2818.000000  
mean       2.818666    -0.136977     2.851313  
std

In [21]:

X_train,X_test,y_train,y_test = train_test_split(x, y, test_size=0.20, random_state=44)

In [22]:
svc=SVC()
svc.fit(X_train,y_train)
pred=svc.predict(X_test)
accuracy_score(y_test,pred)



0.7801418439716312

In [None]:

y_test,y_pred = train(X_train,y_train,X_test)

In [None]:
print('accuracy score',accuracy_score(y_test, y_pred))

In [None]:
cm=confusion_matrix(y_test, y_pred)
print('Confusion matrix, without normalization')
print(cm)
plot_confusion_matrix(cm)

In [None]:
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print('Normalized confusion matrix')
print(cm_normalized)
plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix')

In [None]:
print(classification_report(y_test, y_pred, target_names=['Fake','Genuine']))

In [None]:
plot_roc_curve(y_test, y_pred)