In [None]:
import json
import csv
import numpy as np
import pandas as pd

from sklearn.model_selection    import cross_validate,KFold, train_test_split
from sklearn.metrics            import classification_report, confusion_matrix,ConfusionMatrixDisplay,f1_score,recall_score, precision_score, accuracy_score,roc_auc_score
from sklearn.ensemble           import RandomForestClassifier,BaggingClassifier, IsolationForest, ExtraTreesClassifier,AdaBoostClassifier, GradientBoostingClassifier

from sklearn.feature_selection  import mutual_info_classif,f_classif, SelectKBest,RFE,SelectFromModel

from sklearn.datasets           import make_classification
from sklearn.tree               import DecisionTreeClassifier
from xgboost                    import XGBClassifier
from sklearn.neighbors          import KNeighborsClassifier
from sklearn.naive_bayes        import GaussianNB

from sklearn.preprocessing      import QuantileTransformer

from sklearn.covariance         import EllipticEnvelope
from sklearn.neighbors          import LocalOutlierFactor
from sklearn.svm                import OneClassSVM, SVC

from sklearn.linear_model       import LogisticRegression
from sklearn.pipeline           import Pipeline

import sqlalchemy
from urllib.parse import quote

from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SVMSMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import SMOTE

In [None]:
# Mount Google Drive at /content/drive. This import only exists inside a
# Google Colab runtime, so the cell fails elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Candidate classifiers, keyed by the short name that the experiment cells
# write into the "modelName" column. Insertion order is the order in which the
# experiment loops visit the models.
ModelsList = dict(
    xgb=XGBClassifier(),
    ab=AdaBoostClassifier(),
    bc_100=BaggingClassifier(n_estimators=100),
    dt=DecisionTreeClassifier(),
    et_100=ExtraTreesClassifier(n_estimators=100),
    knn3=KNeighborsClassifier(n_neighbors=3),
    rf_100=RandomForestClassifier(n_estimators=100),
    svc_n=SVC(),
    gn=GaussianNB(),
    lr=LogisticRegression(solver='lbfgs', max_iter=1000),
    knn5=KNeighborsClassifier(n_neighbors=5),
    rf_500=RandomForestClassifier(n_estimators=500),
    et_500=ExtraTreesClassifier(n_estimators=500),
)

In [None]:
# Column names of the result table. The experiment cells build each row as a
# positional list, so this order must match the order of the appended values:
# six descriptive fields, then the five train metrics, then the five test metrics.
model_result = [
    "modelName",
    "samplingModel",
    "outlierModel",
    "trainDataSet",
    "testDataSet",
    "selectedFeatures",
] + [
    f"{split}_{metric}"
    for split in ("Train", "Test")
    for metric in ("Accuracy", "F1", "Precision", "Recall", "AUC")
]

# The 20 feature columns selected from every dataset before transformation.
SelectedFeatures = [
    "favourites_count",
    "followers_count",
    "favourites_growth_rate",
    "followers_friends_ratio",
    "friends_growth_rate",
    "statuses_count",
    "listed_count",
    "friends_count",
    "followers_growth_rate",
    "verified",
    "tweet_freq",
    "listed_growth_rate",
    "Screen_name_freq",
    "name_length",
    "description_length",
    "screen_name_length",
    "default_profile",
    "profile_use_background_image",
    "num_digits_in_screen_name",
    "num_digits_in_name",
]

In [None]:
def evaluating_model(nm,model,X, y):
    """Cross-validate a two-step pipeline and summarise its scores.

    The pipeline runs ``nm`` first (registered as step ``'sc'``; presumably a
    scaler/normaliser -- confirm with callers) followed by ``model``.

    Parameters
    ----------
    nm : transformer used as the first pipeline step.
    model : estimator used as the final pipeline step.
    X, y : features and labels handed to ``cross_validate``.

    Returns
    -------
    tuple
        ``(estimators, per_fold, summary)`` where ``estimators`` are the
        pipelines fitted on each fold, ``per_fold`` is a DataFrame with one
        row per fold (timings plus train/test scores), and ``summary``
        aggregates ``per_fold``: timings are summed, every score is averaged.

    Any failure while fitting or scoring a fold is re-raised
    (``error_score='raise'``).
    """
    metric_names = ("accuracy", "f1", "recall", "roc_auc", "precision")

    pipeline = Pipeline(steps=[('sc', nm), ('model', model)])
    # Shuffled 5-fold split with a fixed seed so the folds are repeatable.
    folds = KFold(n_splits=5, random_state=1, shuffle=True)

    cv_output = cross_validate(
        pipeline,
        X,
        y,
        cv=folds,
        scoring=list(metric_names),
        return_train_score=True,
        error_score='raise',
        return_estimator=True,
    )

    fitted_estimators = cv_output["estimator"]
    per_fold = pd.DataFrame(cv_output).drop('estimator', axis=1)

    # Timings are totals over the folds; each metric is the fold average.
    how_to_aggregate = {'fit_time': 'sum', 'score_time': 'sum'}
    for metric in metric_names:
        how_to_aggregate[f"test_{metric}"] = 'mean'
        how_to_aggregate[f"train_{metric}"] = 'mean'

    return fitted_estimators, per_fold, per_fold.agg(how_to_aggregate)

In [None]:
# Generalization experiment on DS1.
# Every model in ModelsList is trained on each external dataset in turn and
# evaluated on DS1. Despite their names, `testDataSet` holds the datasets the
# models are *fitted* on here, and `testX`/`testY` (DS1) is the evaluation set.
# Each result row records the external dataset name in the "trainDataSet"
# column and "all" in the "testDataSet" column.

# Single-entry placeholders; only their keys are written into the result rows.
samplingMethod={
    "all":"",
}
outlierMethod={
    "all":"",
}

# Metric functions, in the same order as the Train_*/Test_* columns of model_result.
# Note: roc_auc_score is fed hard label predictions, not probabilities.
metricFunctions=(accuracy_score,f1_score,precision_score,recall_score,roc_auc_score)

data = pd.read_csv('PATH/Final/DS1.csv.gz',compression='gzip',lineterminator='\n')
# Shuffle the rows (unseeded, so the order differs between runs).
data=data.sample(frac=1).reset_index(drop=True)


datasets=["DS4","DS5","DS6","DS7","DS12","DS13","DS14","DS15","DS16"]
testDataSet={}
for dataset in datasets:
    testDataSet[dataset] = pd.read_csv(f'PATH/Test/{dataset}.csv.gz',compression='gzip',lineterminator='\n')
    testDataSet[dataset]=testDataSet[dataset].sample(frac=1).reset_index(drop=True)


list_result=list()

# Evaluation set: DS1 labels mapped to 1 (bot) / 0 (human), features
# quantile-transformed with a transformer fitted on DS1 itself, then
# resampled with SMOTEENN.
testX=data
testY=testX["class"].map({"bot": 1, "human": 0})
testX=QuantileTransformer(output_distribution="normal",n_quantiles=len(testX)).fit_transform(testX[SelectedFeatures])
testX, testY = SMOTEENN().fit_resample(testX, testY)

for keyModel,valueModel in ModelsList.items():

    for keySampling in samplingMethod:

        for keyOutlier in outlierMethod:

            for keyTestMain,dataTestMain in testDataSet.items():

                y=dataTestMain["class"].map({"bot": 1, "human": 0})
                # Each dataset gets its own, independently fitted transformer.
                X=QuantileTransformer(
                    output_distribution="normal",
                    n_quantiles=len(dataTestMain)
                ).fit_transform(dataTestMain[SelectedFeatures])


                # The same estimator instance is re-fitted for every dataset.
                model = valueModel
                model.fit(X, y)

                # Predict once per split and reuse the predictions for all five
                # metrics instead of calling predict() again for each metric.
                trainPred=model.predict(X)
                testPred=model.predict(testX)

                list_result.append(
                    [
                        keyModel,
                        keySampling,
                        keyOutlier,
                        keyTestMain,
                        "all",
                        json.dumps(SelectedFeatures),
                    ]
                    +[metric(y, trainPred) for metric in metricFunctions]
                    +[metric(testY, testPred) for metric in metricFunctions]
                )
                print(keyModel,keyTestMain)

result_dataframe=pd.DataFrame(list_result,columns=model_result)

# Appends without a header row: re-running this cell adds the rows again.
result_dataframe.to_csv('PATH/Final/generalization_train_datasetss_test_our_all_sample_20_features.csv', mode='a', header=False,index=False)

In [None]:
# Experiment ex1: train every model on DS20 and evaluate it on t-DS1
# (no resampling on either side). Despite their names, `testDataSet` holds the
# dataset the models are *fitted* on here, and `testX`/`testY` (t-DS1) is the
# evaluation set.

# Single-entry placeholders; only their keys are written into the result rows.
samplingMethod={
    "all":"",
}
outlierMethod={
    "all":"",
}

# Metric functions, in the same order as the Train_*/Test_* columns of model_result.
# Note: roc_auc_score is fed hard label predictions, not probabilities.
metricFunctions=(accuracy_score,f1_score,precision_score,recall_score,roc_auc_score)

data = pd.read_csv('PATH/Final/t-DS1.csv.gz',compression='gzip',lineterminator='\n')
# Shuffle the rows (unseeded, so the order differs between runs).
data=data.sample(frac=1).reset_index(drop=True)


datasets=["DS20"]
testDataSet={}
for dataset in datasets:
    testDataSet[dataset] = pd.read_csv(f'PATH/Test/{dataset}.csv.gz',compression='gzip',lineterminator='\n')
    testDataSet[dataset]=testDataSet[dataset].sample(frac=1).reset_index(drop=True)


list_result=list()

# Evaluation set: labels mapped to 1 (bot) / 0 (human), features
# quantile-transformed with a transformer fitted on t-DS1 itself.
testX=data
testY=testX["class"].map({"bot": 1, "human": 0})
testX=QuantileTransformer(output_distribution="normal",n_quantiles=len(testX)).fit_transform(testX[SelectedFeatures])

for keyModel,valueModel in ModelsList.items():

    for keySampling in samplingMethod:

        for keyOutlier in outlierMethod:

            for keyTestMain,dataTestMain in testDataSet.items():

                y=dataTestMain["class"].map({"bot": 1, "human": 0})
                # The training dataset gets its own, independently fitted transformer.
                X=QuantileTransformer(
                    output_distribution="normal",
                    n_quantiles=len(dataTestMain)
                ).fit_transform(dataTestMain[SelectedFeatures])


                model = valueModel
                model.fit(X, y)

                # Predict once per split and reuse the predictions for all five
                # metrics instead of calling predict() again for each metric.
                trainPred=model.predict(X)
                testPred=model.predict(testX)

                list_result.append(
                    [
                        keyModel,
                        keySampling,
                        keyOutlier,
                        keyTestMain,
                        "all",
                        json.dumps(SelectedFeatures),
                    ]
                    +[metric(y, trainPred) for metric in metricFunctions]
                    +[metric(testY, testPred) for metric in metricFunctions]
                )
                print(keyModel,keyTestMain)

result_dataframe=pd.DataFrame(list_result,columns=model_result)

# Appends without a header row: re-running this cell adds the rows again.
result_dataframe.to_csv('PATH/Final/gen_t_DS1_sample_20_features_ex1.csv', mode='a', header=False,index=False)

dt DS20
et_100 DS20
knn3 DS20


In [None]:
# Experiment ex2: train every model on each SMOTEENN-resampled external dataset
# and evaluate it on SMOTEENN-resampled t-DS1. Despite their names, the
# `testDataSet*` dicts hold the data the models are *fitted* on here, and
# `testX`/`testY` (t-DS1) is the evaluation set.

# Metric functions, in the same order as the Train_*/Test_* columns of model_result.
# Note: roc_auc_score is fed hard label predictions, not probabilities.
metricFunctions=(accuracy_score,f1_score,precision_score,recall_score,roc_auc_score)

data = pd.read_csv('PATH/Final/t-DS1.csv.gz',compression='gzip',lineterminator='\n')
# Shuffle the rows (unseeded, so the order differs between runs).
data=data.sample(frac=1).reset_index(drop=True)


datasets=["DS4","DS5","DS6","DS7","DS12","DS13","DS14","DS15","DS16"]
testDataSet={}
testDataSetX={}
testDataSetY={}
for dataset in datasets:
    testDataSet[dataset] = pd.read_csv(f'PATH/Test/{dataset}.csv.gz',compression='gzip',lineterminator='\n')
    testDataSet[dataset]=testDataSet[dataset].sample(frac=1).reset_index(drop=True)


    # Labels -> 1 (bot) / 0 (human); features through a per-dataset transformer.
    testDataSetY[dataset]=testDataSet[dataset]["class"].map({"bot": 1, "human": 0})
    testDataSetX[dataset]=QuantileTransformer(output_distribution="normal",n_quantiles=len(testDataSet[dataset])).fit_transform(testDataSet[dataset][SelectedFeatures])

    testDataSetX[dataset], testDataSetY[dataset] = SMOTEENN().fit_resample(testDataSetX[dataset], testDataSetY[dataset])

list_result=list()

testX=data
testY=testX["class"].map({"bot": 1, "human": 0})
testX=QuantileTransformer(output_distribution="normal",n_quantiles=len(testX)).fit_transform(testX[SelectedFeatures])
testX, testY = SMOTEENN().fit_resample(testX, testY)

for keyModel,valueModel in ModelsList.items():
    # Walk the datasets in their declared order. The X and Y dicts are filled
    # for exactly these keys, and a list gives a stable row order, unlike
    # iterating the set produced by intersecting the two key views.
    for testDBKey in datasets:
        model = valueModel
        model.fit(testDataSetX[testDBKey], testDataSetY[testDBKey])

        # Predict once per split and reuse the predictions for all five metrics.
        trainPred=model.predict(testDataSetX[testDBKey])
        testPred=model.predict(testX)

        list_result.append(
            [
                keyModel,
                "SMOTEENN",
                "all",
                testDBKey,
                "all",
                json.dumps(SelectedFeatures),
            ]
            +[metric(testDataSetY[testDBKey], trainPred) for metric in metricFunctions]
            +[metric(testY, testPred) for metric in metricFunctions]
        )
        print(keyModel,testDBKey)

result_dataframe=pd.DataFrame(list_result,columns=model_result)

# Appends without a header row: re-running this cell adds the rows again.
result_dataframe.to_csv('PATH/Final/gen_t_DS1_sample_20_features_ex2.csv', mode='a', header=False,index=False)

dt DS20
et_100 DS20
knn3 DS20


In [None]:
# Experiment ex3: same procedure as the ex2 output file but with DS20 added to
# the dataset list. Every model is trained on each SMOTEENN-resampled external
# dataset and evaluated on SMOTEENN-resampled t-DS1. Despite their names, the
# `testDataSet*` dicts hold the data the models are *fitted* on here, and
# `testX`/`testY` (t-DS1) is the evaluation set.

# Metric functions, in the same order as the Train_*/Test_* columns of model_result.
# Note: roc_auc_score is fed hard label predictions, not probabilities.
metricFunctions=(accuracy_score,f1_score,precision_score,recall_score,roc_auc_score)

data = pd.read_csv('PATH/Final/t-DS1.csv.gz',compression='gzip',lineterminator='\n')
# Shuffle the rows (unseeded, so the order differs between runs).
data=data.sample(frac=1).reset_index(drop=True)


datasets=["DS4","DS5","DS6","DS7","DS12","DS13","DS14","DS15","DS16","DS20"]

testDataSet={}
testDataSetX={}
testDataSetY={}
for dataset in datasets:
    testDataSet[dataset] = pd.read_csv(f'PATH/Test/{dataset}.csv.gz',compression='gzip',lineterminator='\n')
    testDataSet[dataset]=testDataSet[dataset].sample(frac=1).reset_index(drop=True)


    # Labels -> 1 (bot) / 0 (human); features through a per-dataset transformer.
    testDataSetY[dataset]=testDataSet[dataset]["class"].map({"bot": 1, "human": 0})
    testDataSetX[dataset]=QuantileTransformer(output_distribution="normal",n_quantiles=len(testDataSet[dataset])).fit_transform(testDataSet[dataset][SelectedFeatures])

    testDataSetX[dataset], testDataSetY[dataset] = SMOTEENN().fit_resample(testDataSetX[dataset], testDataSetY[dataset])

list_result=list()

testX=data
testY=testX["class"].map({"bot": 1, "human": 0})
testX=QuantileTransformer(output_distribution="normal",n_quantiles=len(testX)).fit_transform(testX[SelectedFeatures])
testX, testY = SMOTEENN().fit_resample(testX, testY)

for keyModel,valueModel in ModelsList.items():
    # Walk the datasets in their declared order. The X and Y dicts are filled
    # for exactly these keys, and a list gives a stable row order, unlike
    # iterating the set produced by intersecting the two key views.
    for testDBKey in datasets:
        model = valueModel
        model.fit(testDataSetX[testDBKey], testDataSetY[testDBKey])

        # Predict once per split and reuse the predictions for all five metrics.
        trainPred=model.predict(testDataSetX[testDBKey])
        testPred=model.predict(testX)

        list_result.append(
            [
                keyModel,
                "SMOTEENN",
                "all",
                testDBKey,
                "all",
                json.dumps(SelectedFeatures),
            ]
            +[metric(testDataSetY[testDBKey], trainPred) for metric in metricFunctions]
            +[metric(testY, testPred) for metric in metricFunctions]
        )
        print(keyModel,testDBKey)

result_dataframe=pd.DataFrame(list_result,columns=model_result)

# Appends without a header row: re-running this cell adds the rows again.
result_dataframe.to_csv('PATH/Final/gen_t_DS1_sample_20_features_ex3.csv', mode='a', header=False,index=False)

dt DS20
et_100 DS20
knn3 DS20


In [None]:
# Experiment ex4: train every model on SMOTEENN-resampled t-DS1 and evaluate it
# on each SMOTEENN-resampled external dataset.

# Metric functions, in the same order as the Train_*/Test_* columns of model_result.
# Note: roc_auc_score is fed hard label predictions, not probabilities.
metricFunctions=(accuracy_score,f1_score,precision_score,recall_score,roc_auc_score)

data = pd.read_csv('PATH/Final/t-DS1.csv.gz',compression='gzip',lineterminator='\n')
# Shuffle the rows (unseeded, so the order differs between runs).
data=data.sample(frac=1).reset_index(drop=True)


datasets=["DS4","DS5","DS6","DS7","DS12","DS13","DS14","DS15","DS16","DS20"]
testDataSet={}
testDataSetX={}
testDataSetY={}
for dataset in datasets:
    testDataSet[dataset] = pd.read_csv(f'PATH/Test/{dataset}.csv.gz',compression='gzip',lineterminator='\n')
    testDataSet[dataset]=testDataSet[dataset].sample(frac=1).reset_index(drop=True)


    # Labels -> 1 (bot) / 0 (human); features through a per-dataset transformer.
    testDataSetY[dataset]=testDataSet[dataset]["class"].map({"bot": 1, "human": 0})
    testDataSetX[dataset]=QuantileTransformer(output_distribution="normal",n_quantiles=len(testDataSet[dataset])).fit_transform(testDataSet[dataset][SelectedFeatures])

    testDataSetX[dataset], testDataSetY[dataset] = SMOTEENN().fit_resample(testDataSetX[dataset], testDataSetY[dataset])

list_result=list()

trainX=data
trainY=trainX["class"].map({"bot": 1, "human": 0})
# NOTE(review): n_quantiles is a fixed 100000 here, whereas the evaluation
# datasets use their own row count -- confirm this is intentional.
trainX=QuantileTransformer(output_distribution="normal",n_quantiles=100000).fit_transform(trainX[SelectedFeatures])
trainX, trainY = SMOTEENN().fit_resample(trainX, trainY)

for keyModel,valueModel in ModelsList.items():

    # The training data does not depend on the evaluation dataset, so fit and
    # score on the training set once per model instead of once per dataset.
    model = valueModel
    model.fit(trainX, trainY)

    trainPred=model.predict(trainX)
    trainPerformance=[metric(trainY, trainPred) for metric in metricFunctions]

    # Walk the datasets in their declared order. The X and Y dicts are filled
    # for exactly these keys, and a list gives a stable row order, unlike
    # iterating the set produced by intersecting the two key views.
    for testDBKey in datasets:

        # Predict once and reuse the predictions for all five metrics.
        testPred=model.predict(testDataSetX[testDBKey])

        list_result.append(
            [
                keyModel,
                "SMOTEENN",
                "all",
                "t-DS1",
                testDBKey,
                json.dumps(SelectedFeatures),
            ]
            +trainPerformance
            +[metric(testDataSetY[testDBKey], testPred) for metric in metricFunctions]
        )
        print(keyModel,testDBKey)

result_dataframe=pd.DataFrame(list_result,columns=model_result)

# Appends without a header row: re-running this cell adds the rows again.
result_dataframe.to_csv('PATH/Final/gen_t_DS1_sample_20_features_ex4.csv', mode='a', header=False,index=False)

dt DS6
dt DS15
dt DS20
dt DS12
dt DS5
dt DS13
dt DS4
dt DS14
dt DS16
dt DS7
et_100 DS6
et_100 DS15
et_100 DS20
et_100 DS12
et_100 DS5
et_100 DS13
et_100 DS4
et_100 DS14
et_100 DS16
et_100 DS7
knn3 DS6
knn3 DS15
knn3 DS20
knn3 DS12
knn3 DS5
knn3 DS13
knn3 DS4
knn3 DS14
knn3 DS16
knn3 DS7


In [None]:
# Experiment ex5: train every model once on SMOTEENN-resampled t-DS1 and
# evaluate it on each SMOTEENN-resampled external dataset.

# Metric functions, in the same order as the Train_*/Test_* columns of model_result.
# Note: roc_auc_score is fed hard label predictions, not probabilities.
metricFunctions=(accuracy_score,f1_score,precision_score,recall_score,roc_auc_score)

data = pd.read_csv('PATH/Final/t-DS1.csv.gz',compression='gzip',lineterminator='\n')
# Shuffle the rows (unseeded, so the order differs between runs).
data=data.sample(frac=1).reset_index(drop=True)


datasets=["DS4","DS5","DS6","DS7","DS12","DS13","DS14","DS15","DS16","DS20"]
testDataSet={}
testDataSetX={}
testDataSetY={}
for dataset in datasets:
    testDataSet[dataset] = pd.read_csv(f'PATH/Test/{dataset}.csv.gz',compression='gzip',lineterminator='\n')
    testDataSet[dataset]=testDataSet[dataset].sample(frac=1).reset_index(drop=True)


    # Labels -> 1 (bot) / 0 (human); features through a per-dataset transformer.
    testDataSetY[dataset]=testDataSet[dataset]["class"].map({"bot": 1, "human": 0})
    testDataSetX[dataset]=QuantileTransformer(output_distribution="normal",n_quantiles=len(testDataSet[dataset])).fit_transform(testDataSet[dataset][SelectedFeatures])

    testDataSetX[dataset], testDataSetY[dataset] = SMOTEENN().fit_resample(testDataSetX[dataset], testDataSetY[dataset])

list_result=list()

trainX=data
trainY=trainX["class"].map({"bot": 1, "human": 0})
# NOTE(review): n_quantiles is a fixed 100000 here, whereas the evaluation
# datasets use their own row count -- confirm this is intentional.
trainX=QuantileTransformer(output_distribution="normal",n_quantiles=100000).fit_transform(trainX[SelectedFeatures])
trainX, trainY = SMOTEENN().fit_resample(trainX, trainY)

for keyModel,valueModel in ModelsList.items():

    model = valueModel
    model.fit(trainX, trainY)

    # Predict once on the training set and reuse it for all five metrics.
    trainPred=model.predict(trainX)
    trainPerformance=[metric(trainY, trainPred) for metric in metricFunctions]


    # Walk the datasets in their declared order. The X and Y dicts are filled
    # for exactly these keys, and a list gives a stable row order, unlike
    # iterating the set produced by intersecting the two key views.
    for testDBKey in datasets:
        # Predict once and reuse the predictions for all five metrics.
        testPred=model.predict(testDataSetX[testDBKey])

        list_result.append(
            [
                keyModel,
                "SMOTEENN",
                "all",
                "t-DS1",
                testDBKey,
                json.dumps(SelectedFeatures),
            ]
            +trainPerformance
            +[metric(testDataSetY[testDBKey], testPred) for metric in metricFunctions]
        )
        print(keyModel,testDBKey)

result_dataframe=pd.DataFrame(list_result,columns=model_result)

# Appends without a header row: re-running this cell adds the rows again.
result_dataframe.to_csv('PATH/Final/gen_t_DS1_sample_20_features_ex5.csv', mode='a', header=False,index=False)

In [None]:
# Experiment ex61: train every model once on t-DS1 and evaluate it on each
# external dataset, with no resampling on either side (the sampling column is
# recorded as "all").

# Metric functions, in the same order as the Train_*/Test_* columns of model_result.
# Note: roc_auc_score is fed hard label predictions, not probabilities.
metricFunctions=(accuracy_score,f1_score,precision_score,recall_score,roc_auc_score)

data = pd.read_csv('PATH/Final/t-DS1.csv.gz',compression='gzip',lineterminator='\n')
# Shuffle the rows (unseeded, so the order differs between runs).
data=data.sample(frac=1).reset_index(drop=True)


datasets=["DS4","DS5","DS6","DS7","DS12","DS13","DS14","DS15","DS16","DS20"]
testDataSet={}
testDataSetX={}
testDataSetY={}

for dataset in datasets:
    testDataSet[dataset] = pd.read_csv(f'PATH/Test/{dataset}.csv.gz',compression='gzip',lineterminator='\n')
    testDataSet[dataset]=testDataSet[dataset].sample(frac=1).reset_index(drop=True)
    # Labels -> 1 (bot) / 0 (human); features through a per-dataset transformer.
    testDataSetY[dataset]=testDataSet[dataset]["class"].map({"bot": 1, "human": 0})
    testDataSetX[dataset]=QuantileTransformer(output_distribution="normal",n_quantiles=len(testDataSet[dataset])).fit_transform(testDataSet[dataset][SelectedFeatures])


list_result=list()

trainX=data
trainY=trainX["class"].map({"bot": 1, "human": 0})
# NOTE(review): n_quantiles is a fixed 100000 here, whereas the evaluation
# datasets use their own row count -- confirm this is intentional.
trainX=QuantileTransformer(output_distribution="normal",n_quantiles=100000).fit_transform(trainX[SelectedFeatures])


for keyModel,valueModel in ModelsList.items():

    model = valueModel
    model.fit(trainX, trainY)

    # Predict once on the training set and reuse it for all five metrics.
    trainPred=model.predict(trainX)
    trainPerformance=[metric(trainY, trainPred) for metric in metricFunctions]

    # Walk the datasets in their declared order. The X and Y dicts are filled
    # for exactly these keys, and a list gives a stable row order, unlike
    # iterating the set produced by intersecting the two key views.
    for testDBKey in datasets:

        # Predict once and reuse the predictions for all five metrics.
        testPred=model.predict(testDataSetX[testDBKey])

        list_result.append(
            [
                keyModel,
                "all",
                "all",
                "t-DS1",
                testDBKey,
                json.dumps(SelectedFeatures),
            ]
            +trainPerformance
            +[metric(testDataSetY[testDBKey], testPred) for metric in metricFunctions]
        )
        print(keyModel,testDBKey)

result_dataframe=pd.DataFrame(list_result,columns=model_result)

# Appends without a header row: re-running this cell adds the rows again.
result_dataframe.to_csv('PATH/Final/gen_t_DS1_sample_20_features_ex61.csv', mode='a', header=False,index=False)

rf_100 DS16
rf_100 DS5
rf_100 DS20
rf_100 DS6
rf_100 DS14
rf_100 DS7
rf_100 DS12
rf_100 DS4
rf_100 DS13
rf_100 DS15
