In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Reading the dataset

In [2]:
df = pd.read_csv('Trials.csv')

In [3]:
df.head(2)

Unnamed: 0,id,name,screen_name,fav_number,statuses_count,followers_count,friends_count,favourites_count,listed_count,created_at,...,profile_sidebar_fill_color,profile_background_image_url,profile_background_color,profile_link_color,utc_offset,protected,verified,description,updated,dataset
0,3610511,Davide Dellacasa,braddd,0,20370,5470,2385,145,52,Fri Apr 06 10:58:22 +0000 2007,...,FFF7CC,http://a0.twimg.com/profile_background_images/...,BADFCD,FF0000,3600.0,,,Founder of http://www.screenweek.it & http://w...,2/14/2015 10:54,1
1,5656162,Simone Economo,eKoeS,68,3131,506,381,9,40,Mon Apr 30 15:08:42 +0000 2007,...,DDEEF6,http://a0.twimg.com/images/themes/theme1/bg.png,C0DEED,0084B4,3600.0,,,BSc degree (cum laude) in Computer Engineering...,2/14/2015 10:54,1


In [4]:
df.dtypes

id                                      int64
name                                   object
screen_name                            object
fav_number                              int64
statuses_count                          int64
followers_count                         int64
friends_count                           int64
favourites_count                        int64
listed_count                            int64
created_at                             object
url                                    object
lang                                   object
time_zone                              object
location                               object
default_profile                       float64
default_profile_image                 float64
geo_enabled                           float64
profile_image_url                      object
profile_banner_url                     object
profile_use_background_image          float64
profile_background_image_url_https     object
profile_text_color                

In [50]:
df['dataset'].value_counts()

1    1481
2    1337
Name: dataset, dtype: int64

In [51]:
real_df = df[df['dataset']==1][0:200]
fake_df = df[df['dataset']==2][0:200]

# Feature Engineering Date and Time 

In [52]:
df['created_at'] = pd.to_datetime(df.created_at)

In [53]:
df['Year'] = df.created_at.dt.year

In [54]:
df['Month'] = df.created_at.dt.month

In [55]:
df['Time'] = df.created_at.dt.time

In [56]:
df.head(2)

Unnamed: 0,id,name,screen_name,fav_number,statuses_count,followers_count,friends_count,favourites_count,listed_count,created_at,...,profile_link_color,utc_offset,protected,verified,description,updated,dataset,Year,Month,Time
0,3610511,573,1106,0,20370,5470,2385,145,52,2007-04-06 10:58:22+00:00,...,FF0000,3600.0,,,Founder of http://www.screenweek.it & http://w...,2/14/2015 10:54,1,2007,4,10:58:22
1,5656162,2251,1409,68,3131,506,381,9,40,2007-04-30 15:08:42+00:00,...,0084B4,3600.0,,,BSc degree (cum laude) in Computer Engineering...,2/14/2015 10:54,1,2007,4,15:08:42


# Label Encoding 

In [57]:
from sklearn import preprocessing

In [58]:
le = preprocessing.LabelEncoder()

In [59]:
le.fit(df['screen_name'])

LabelEncoder()

In [60]:
df['screen_name']=le.transform(df['screen_name'])

In [61]:
le.fit(df['name'])

LabelEncoder()

In [62]:
df['name'] = le.transform(df['name'])

In [63]:
le.fit(df['Time'])

LabelEncoder()

In [64]:
df['Time'] = le.transform(df['Time'])

In [65]:
df.head(2)

Unnamed: 0,id,name,screen_name,fav_number,statuses_count,followers_count,friends_count,favourites_count,listed_count,created_at,...,profile_link_color,utc_offset,protected,verified,description,updated,dataset,Year,Month,Time
0,3610511,573,1106,0,20370,5470,2385,145,52,2007-04-06 10:58:22+00:00,...,FF0000,3600.0,,,Founder of http://www.screenweek.it & http://w...,2/14/2015 10:54,1,2007,4,763
1,5656162,2251,1409,68,3131,506,381,9,40,2007-04-30 15:08:42+00:00,...,0084B4,3600.0,,,BSc degree (cum laude) in Computer Engineering...,2/14/2015 10:54,1,2007,4,1282


# Feature Engineering

In [66]:
df.columns

Index(['id', 'name', 'screen_name', 'fav_number', 'statuses_count',
       'followers_count', 'friends_count', 'favourites_count', 'listed_count',
       'created_at', 'url', 'lang', 'time_zone', 'location', 'default_profile',
       'default_profile_image', 'geo_enabled', 'profile_image_url',
       'profile_banner_url', 'profile_use_background_image',
       'profile_background_image_url_https', 'profile_text_color',
       'profile_image_url_https', 'profile_sidebar_border_color',
       'profile_background_tile', 'profile_sidebar_fill_color',
       'profile_background_image_url', 'profile_background_color',
       'profile_link_color', 'utc_offset', 'protected', 'verified',
       'description', 'updated', 'dataset', 'Year', 'Month', 'Time'],
      dtype='object')

In [67]:
features_df = df[['statuses_count','followers_count','friends_count','favourites_count','listed_count']]

In [68]:
features_df.dtypes

statuses_count      int64
followers_count     int64
friends_count       int64
favourites_count    int64
listed_count        int64
dtype: object

In [69]:
X = np.asarray(features_df)

In [70]:
y = np.asarray(df['dataset'])

In [71]:
X[0:6]

array([[20370,  5470,  2385,   145,    52],
       [ 3131,   506,   381,     9,    40],
       [ 4024,   264,    87,   323,    16],
       [40586,   640,   622,  1118,    32],
       [ 2016,    62,    64,    13,     0],
       [ 3603,   138,   179,    53,     1]], dtype=int64)

# Train Test Split

In [72]:
from sklearn.model_selection import train_test_split

In [73]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 4)

In [74]:
X_train.shape

(2254, 5)

In [75]:
y_train.shape

(2254,)

In [76]:
X_test.shape

(564, 5)

In [77]:
y_test.shape

(564,)

# Support Vector Machine Algorithm

In [78]:
from sklearn.svm import SVC

In [79]:
classifier = SVC(kernel = 'poly',gamma = 'auto', C=2)

In [None]:
classifier.fit(X_train,y_train)

In [44]:
y_predict= classifier.predict(X_test)

# Classification / Accuracy Report and Confusion Matrix

In [45]:
from sklearn.metrics import classification_report, confusion_matrix

In [46]:
print(classification_report(y_test,y_predict))

              precision    recall  f1-score   support

           1       1.00      0.98      0.99       298
           2       0.98      1.00      0.99       266

    accuracy                           0.99       564
   macro avg       0.99      0.99      0.99       564
weighted avg       0.99      0.99      0.99       564



In [47]:
print(confusion_matrix(y_test, y_predict))

[[293   5]
 [  1 265]]


In [48]:
from sklearn.metrics import accuracy_score

In [49]:
print('Accuracy Score:',accuracy_score(y_test,y_predict))

Accuracy Score: 0.9893617021276596


In [42]:
print(y_predict)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 