In [277]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [278]:
df = pd.read_csv('Trials.csv')

# Making new columns from created_at

In [279]:
df['created_at'] = pd.to_datetime(df.created_at) 

In [280]:
df['month'] = df.created_at.dt.month

In [281]:
df['Year'] = df.created_at.dt.year

In [282]:
df['Time'] = df.created_at.dt.time

In [283]:
df.dtypes

id                                                  int64
name                                               object
screen_name                                        object
fav_number                                          int64
statuses_count                                      int64
followers_count                                     int64
friends_count                                       int64
favourites_count                                    int64
listed_count                                        int64
created_at                            datetime64[ns, UTC]
url                                                object
lang                                               object
time_zone                                          object
location                                           object
default_profile                                   float64
default_profile_image                             float64
geo_enabled                                       float64
profile_image_

# Label Encoding 

In [284]:
from sklearn import preprocessing 

In [285]:
le = preprocessing.LabelEncoder()

In [286]:
le.fit(df['name'])

LabelEncoder()

In [287]:
df['name'] = le.transform(df['name'])

In [288]:
le.fit(df['screen_name'])

LabelEncoder()

In [289]:
le.fit(df['screen_name'])

LabelEncoder()

In [290]:
df['screen_name'] = le.transform(df['screen_name'])

In [291]:
le.fit(df['Time'])

LabelEncoder()

In [292]:
df['Time'] = le.transform(df['Time'])

In [293]:
df.dtypes

id                                                  int64
name                                                int32
screen_name                                         int32
fav_number                                          int64
statuses_count                                      int64
followers_count                                     int64
friends_count                                       int64
favourites_count                                    int64
listed_count                                        int64
created_at                            datetime64[ns, UTC]
url                                                object
lang                                               object
time_zone                                          object
location                                           object
default_profile                                   float64
default_profile_image                             float64
geo_enabled                                       float64
profile_image_

# Feature Engineering

In [294]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [297]:
X = df[['statuses_count','followers_count','friends_count','favourites_count','listed_count']]

In [298]:
y = df['dataset']

# Best Featue Selection 

In [299]:
bestfeat = SelectKBest(score_func = chi2, k = 5)

In [300]:
fit = bestfeat.fit(X,y)

In [301]:
dfscores = pd.DataFrame(fit.scores_)

In [302]:
dfcolumns = pd.DataFrame(X.columns)

In [303]:
featuresScores = pd.concat([dfcolumns,dfscores], axis = 1 )

In [304]:
featuresScores.columns = ['Specs','Scores']

In [305]:
featuresScores

Unnamed: 0,Specs,Scores
0,statuses_count,4026007.0
1,followers_count,860343.2
2,friends_count,476.0817
3,favourites_count,559696.1
4,listed_count,7155.475


# Train Test Split

In [306]:
from sklearn.model_selection import train_test_split


In [307]:
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size = 0.2, random_state = 4)

In [308]:
X_train.shape

(2254, 5)

In [309]:
y

0       0
1       0
2       0
3       0
4       0
       ..
2813    1
2814    1
2815    1
2816    1
2817    1
Name: dataset, Length: 2818, dtype: int64

# Importiung AUC and RUC curve

In [311]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

# Modelling

# Random Forrest

In [312]:
from sklearn.ensemble import RandomForestClassifier

In [313]:
rf_model = RandomForestClassifier()

In [314]:
rf_model.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [315]:
ytrain_pred = rf_model.predict_proba(X_train)

In [316]:
print('RF train roc-auc: {}'.format(roc_auc_score(y_train,ytrain_pred[:,1])))
ytest_pred_rf = rf_model.predict_proba(X_test)
print('RF test roc-auc: {}'.format(roc_auc_score(y_test,ytest_pred_rf[:,1])))

RF train roc-auc: 1.0
RF test roc-auc: 0.9999684614220115


# Logistic Regression

In [317]:
from sklearn.linear_model import LogisticRegression

In [318]:
log_model = LogisticRegression()

In [319]:
log_model.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [320]:
ytrain_pred = log_model.predict_proba(X_train)
print('Logistic Regression train roc-auc: {}'.format (roc_auc_score(y_train,ytrain_pred[:,1])))
ytest_pred_log = log_model.predict_proba(X_test)
print('Logistic Regression test roc-auc: {}'.format (roc_auc_score(y_test,ytest_pred_log[:,1])))

Logistic Regression train roc-auc: 0.9976242962668302
Logistic Regression test roc-auc: 0.9993566130090327


# SVM Model

In [321]:
from sklearn.svm import SVC

In [322]:
classifier = SVC(kernel = 'linear', gamma = 'auto', C=2, probability= True )

In [323]:
classifier.fit(X_train,y_train)

SVC(C=2, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [264]:
ytrain_pred = classifier.predict_proba(X_train)
print ('SVM train roc-auc: {}'.format(roc_auc_score(y_train, ytrain_pred[:,1] )))
ytest_pred_svm = classifier.predict_proba(X_test)
print ('SVM test roc-auc: {}'.format(roc_auc_score(y_test,ytest_pred_svm[:,1])))

SVM train roc-auc: 0.9975919361827571
SVM test roc-auc: 0.9993566130090326


# KNN Model

In [225]:
from sklearn.neighbors import KNeighborsClassifier

In [226]:
knn = KNeighborsClassifier()

In [227]:
knn.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [266]:
ytrain_pred = knn.predict_proba(X_train)
print('KNN train roc-auc: {}'.format(roc_auc_score(y_train,ytrain_pred[:,1])))
ytest_pred_knn = knn.predict_proba(X_test)
print('KNN test roc-auc:{}'.format(roc_auc_score(y_test,ytest_pred_knn[:,1])))

KNN train roc-auc: 0.9998046555900466
KNN test roc-auc:0.9930678205581067


# Decision Tree Model

In [231]:
from sklearn import tree

In [234]:
clf = tree.DecisionTreeClassifier(max_depth = 5)

In [235]:
clf.fit(X_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [268]:
ytrain_pred = clf.predict_proba(X_train)
print('Decision Tree train roc-auc:{}'.format(roc_auc_score(y_train,ytrain_pred[:,1])))
ytest_pred_clf = clf.predict_proba(X_test)
print('Decision Tree test roc-auc:{}'.format(roc_auc_score(y_test,ytest_pred_clf[:,1])))

Decision Tree train roc-auc:0.9995737940146473
Decision Tree test roc-auc:0.9949601352374224


# Prediction probabilties 

In [274]:
r_probs = [0 for _ in range(len(y_test))]
ytest_pred_rf = rf_model.predict_proba(X_test)
ytest_pred_log = log_model.predict_proba(X_test)
ytest_pred_svm = classifier.predict_proba(X_test)
ytest_pred_knn = knn.predict_proba(X_test)
ytest_pred_clf = clf.predict_proba(X_test)

# Computing AUROC and ROC curve values

In [275]:
from sklearn.metrics import roc_curve, roc_auc_score

In [276]:
r_auc = roc_auc_score(y_test, r_probs)
rf_auc = roc_auc_score(y_test, ytest_pred_rf)

ValueError: bad input shape (564, 2)