In [1]:
import numpy as np
import matplotlib.pyplot as plt
import gender_guesser.detector as gender
import pandas as pd
import math
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import mutual_info_classif
import seaborn as sns
from sklearn.metrics import accuracy_score,recall_score,confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [2]:
def read_datasets():
    """ Reads users profile from csv files """
    genuine_users = pd.read_csv("data/users.csv")
    fake_users = pd.read_csv("data/fusers.csv")
    # print genuine_users.columns
    # print genuine_users.describe()
    #print fake_users.describe()
    x=pd.concat([genuine_users,fake_users])   
    y=len(fake_users)*[0] + len(genuine_users)*[1]
    return x,y

In [3]:
def predict_sex(name):
    sex_predictor = gender.Detector(case_sensitive=False)
    first_name= name.str.split(' ').str.get(0)
    sex= first_name.apply(sex_predictor.get_gender)
    sex_dict={'female':-2,'mostly_female':-1,'unknown':0,'mostly_male':1,'male':2,'andy':3}
    sex_code = sex.map(sex_dict).astype(int)
    return sex_code

In [4]:
def extract_features(x):
    lang_list = list(enumerate(np.unique(x['lang'])))   
    lang_dict = { name : i for i, name in lang_list }             
    x.loc[:,'lang_code'] = x['lang'].map( lambda x: lang_dict[x]).astype(int)    
    x.loc[:,'sex_code']=predict_sex(x['name'])
    feature_columns_to_use = ['statuses_count','followers_count','friends_count','favourites_count','listed_count','sex_code','lang_code']
    x=x.loc[:,feature_columns_to_use]
    return x

In [5]:
x,y=read_datasets()

In [6]:
x=extract_features(x)
print(x.columns)
print(x.describe())

Index(['statuses_count', 'followers_count', 'friends_count',
       'favourites_count', 'listed_count', 'sex_code', 'lang_code'],
      dtype='object')
       statuses_count  followers_count  friends_count  favourites_count  \
count     2818.000000      2818.000000    2818.000000       2818.000000   
mean      1672.198368       371.105039     395.363023        234.541164   
std       4884.669157      8022.631339     465.694322       1445.847248   
min          0.000000         0.000000       0.000000          0.000000   
25%         35.000000        17.000000     168.000000          0.000000   
50%         77.000000        26.000000     306.000000          0.000000   
75%       1087.750000       111.000000     519.000000         37.000000   
max      79876.000000    408372.000000   12773.000000      44349.000000   

       listed_count     sex_code    lang_code  
count   2818.000000  2818.000000  2818.000000  
mean       2.818666    -0.136977     2.851313  
std       23.480430     1.73

In [7]:
x_train,x_test,y_train,y_test = train_test_split(x, y, test_size=0.20, random_state=44)

In [None]:
'''svc=SVC()
svc.fit(X_train,y_train)
pred=svc.predict(x_test)
accuracy_score(y_test,pred)'''

In [8]:
def cm(y_test,pred):
    c=confusion_matrix(y_test, pred)
    print(c)

In [10]:
names = ["KNN", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    SVC()
]

models = zip(names, classifiers)

for name, m in models:
    
    m.fit(x_train,y_train)
    pred=m.predict(x_test)
    r=recall_score(y_test,pred)
    a=accuracy_score(y_test,pred)
    print('model:',name)
    print('recall_score:%.2f,accuracy_score:%.2f'%(r,a))
    cm(y_test,m.predict(x_test))
    print('\n')

model: KNN
recall_score:0.91,accuracy_score:0.94
[[261   7]
 [ 27 269]]


model: Decision Tree
recall_score:0.92,accuracy_score:0.90
[[239  29]
 [ 25 271]]


model: Random Forest
recall_score:0.90,accuracy_score:0.94
[[265   3]
 [ 30 266]]






model: Logistic Regression
recall_score:0.91,accuracy_score:0.90
[[241  27]
 [ 28 268]]


model: SGD Classifier
recall_score:0.90,accuracy_score:0.89
[[236  32]
 [ 31 265]]


model: SVM Linear
recall_score:0.58,accuracy_score:0.78
[[268   0]
 [124 172]]


