In [None]:
# pip install pymysql
# PyMySQL is the driver named in the 'mysql+pymysql://' SQLAlchemy URLs that the
# result-writing cells in this notebook build; the install line is left commented out.

In [None]:
# Attach Google Drive to the Colab runtime so files under /content/drive are reachable.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Execute the shared Basic.ipynb notebook stored on the mounted Drive.
# NOTE(review): its contents are not visible from this notebook — presumably it
# provides common helpers/setup; confirm which names later cells rely on.
%run /content/drive/MyDrive/Colab\ Notebooks/implementation/packages/Basic.ipynb

In [None]:
import json
import csv
import numpy as np
import pandas as pd

from sklearn.model_selection    import cross_validate,KFold, train_test_split
from sklearn.metrics            import classification_report, confusion_matrix,ConfusionMatrixDisplay,f1_score,recall_score, precision_score, accuracy_score,roc_auc_score
from sklearn.ensemble           import RandomForestClassifier,BaggingClassifier, IsolationForest, ExtraTreesClassifier,AdaBoostClassifier, GradientBoostingClassifier

from sklearn.feature_selection  import mutual_info_classif,f_classif, SelectKBest,RFE,SelectFromModel
from sklearn.metrics import f1_score,recall_score, precision_score, accuracy_score,roc_auc_score


from sklearn.datasets           import make_classification
from sklearn.tree               import DecisionTreeClassifier
from xgboost                    import XGBClassifier
from sklearn.neighbors          import KNeighborsClassifier
from sklearn.naive_bayes        import GaussianNB


from sklearn.preprocessing      import QuantileTransformer


from sklearn.covariance         import EllipticEnvelope
from sklearn.neighbors          import LocalOutlierFactor
from sklearn.svm                import OneClassSVM, SVC



from sklearn.linear_model       import LogisticRegression
from sklearn.pipeline           import Pipeline

import sqlalchemy
from urllib.parse import quote

In [None]:
# Read the selected-model descriptions from JSON and flatten only the top level
# into a DataFrame, exposing the 'Model Name' field under the key 'modelName'.
with open('PATH/selectedModelsData.json') as data_file:
    modelFile = json.load(data_file)

selectedModels = (
    pd.json_normalize(modelFile, max_level=0)
      .rename(columns={'Model Name': 'modelName'})
)

In [None]:
# Drop the "id" column for display. Rebinding (instead of inplace=True) together with
# errors="ignore" keeps this cell idempotent: re-running it after "id" is already
# gone no longer raises KeyError. `columns=` already implies axis=1.
selectedModels = selectedModels.drop(columns=["id"], errors="ignore")
selectedModels.head(11)

Unnamed: 0,featureNumber,modelName,selectionMethod,selectedFeatures
0,10,et_500,rfe_DecisionTree,"[['description_length', 'favourites_count', 'f..."
1,10,et_100,rfe_DecisionTree,"[['Name_entropy', 'favourites_count', 'favouri..."
2,10,rf_100,rfe_DecisionTree,"[['description_length', 'favourites_count', 'f..."
3,10,rf_500,rfe_DecisionTree,"[['Name_entropy', 'favourites_count', 'favouri..."
4,10,bc_100,rfe_DecisionTree,"[['description_length', 'favourites_count', 'f..."
5,10,xgb,rfe_DecisionTree,"[['Name_similarity', 'description_length', 'fa..."
6,10,et_500,mutual_info_classif,"[['diff_days', 'favourites_count', 'favourites..."
7,10,rf_500,mutual_info_classif,"[['diff_days', 'favourites_count', 'favourites..."
8,10,et_100,sf_RandomForest,"[['diff_days', 'favourites_count', 'favourites..."
9,10,et_500,sf_RandomForest,"[['diff_days', 'favourites_count', 'favourites..."


## VERSION 1

In [None]:
# VERSION 1 — generalization test with optional outlier removal.
#
# For every row of `selectedModels`:
#   1. derive one consensus feature set from the per-fold feature lists,
#   2. quantile-normalise the DS1 training data on those features,
#   3. for each outlier strategy, fit the row's model on the (optionally filtered)
#      training data and score it on every test dataset,
#   4. append one result row per (model row, outlier strategy, test dataset) to the
#      MySQL table 'generalization_test_1'.
connection = sqlalchemy.create_engine('mysql+pymysql://{0}:{1}@{2}/{3}'.format('USER',quote('PASS'),'IP','DB'))

# Column order of every row written to the database; the row built below must match it.
model_result=["outLierModel","removedSample","featureNumber","DataSet","modelName","selectionMethod","selectedFeatures","Train_Accuracy","Train_F1","Train_Precision","Train_Recall","Train_AUC","Test_Accuracy","Test_F1","Test_Precision","Test_Recall","Test_AUC"]

# Metric functions in the same order as the Train_*/Test_* columns above.
scoreFunctions=(accuracy_score, f1_score, precision_score, recall_score, roc_auc_score)


with open('PATH/selectedModelsData.json') as data_file:
    modelFile = json.load(data_file)

selectedModels = pd.json_normalize(modelFile,max_level=0)
selectedModels.rename(columns={'Model Name':'modelName'}, inplace=True)


# Training data, shuffled (no seed is set, so the row order differs between runs).
data = pd.read_csv('PATH/DS1.csv.gz',compression='gzip',lineterminator='\n')
data=data.sample(frac=1).reset_index(drop=True)


# Classifiers addressable by the 'modelName' value of a selectedModels row.
ModelsList={
    "xgb":XGBClassifier(),
    "ab":AdaBoostClassifier(),
    "gn":GaussianNB(),
    "lr":LogisticRegression(solver='lbfgs',max_iter=1000),
    "rf_100":RandomForestClassifier(n_estimators=100),
    "dt":DecisionTreeClassifier(),
    "et_100":ExtraTreesClassifier(n_estimators=100),
    "knn3":KNeighborsClassifier(n_neighbors=3),
    "knn5":KNeighborsClassifier(n_neighbors=5),
    "svc_n":SVC(),
    "bc_100":BaggingClassifier(n_estimators=100),
    "et_500":ExtraTreesClassifier(n_estimators=500),
    "rf_500":RandomForestClassifier(n_estimators=500),
}

# Load and shuffle every held-out test dataset once, keyed by dataset name.
datasets=["DS4","DS5","DS6","DS7","DS12","DS13","DS14","DS16"]
testDataSet={}
for dataset in datasets:
    testDataSet[dataset] = pd.read_csv(f'/content/drive/MyDrive/DATA/MetaData/Test/{dataset}.csv.gz',compression='gzip',lineterminator='\n')
    testDataSet[dataset]=testDataSet[dataset].sample(frac=1).reset_index(drop=True)


normalizingModels=QuantileTransformer(output_distribution="normal",n_quantiles=100)

# Outlier strategies; the "all" entry is a placeholder meaning "keep every sample".
outlierMethod={
        "all":"",
        "iso":IsolationForest(contamination=0.1),
        "ee":EllipticEnvelope(contamination=0.01),
        "lof":LocalOutlierFactor(),
        "ncSVM":OneClassSVM(nu=0.01)
}

for index, row in selectedModels.iterrows():

    # 'selectedFeatures' is stored as a Python-style string (single quotes);
    # swap the quotes so it parses as JSON into a list of per-fold feature lists.
    featureList=json.loads(row["selectedFeatures"].replace("'",'"'))

    # Average number of features per cross-validation fold
    # (for a 10-feature selection the per-fold total is sometimes 8, 9 or 10).
    numberOfFeatues=int(np.mean(list(map(len,featureList))))

    # How many folds selected each distinct feature.
    unique, counts = np.unique(np.hstack(featureList), return_counts=True)

    # Keep the most frequently selected features. The ranking is done on the integer
    # counts: stacking names and counts into one array (as before) turns the counts
    # into strings, and a string sort places "10" below "2".
    ranking=np.argsort(-counts, kind="stable")
    SelectedFeatures=unique[ranking][0:numberOfFeatues]

    X = normalizingModels.fit_transform(data[SelectedFeatures])
    y = data['class'].map({"bot": 1, "human": 0})

    for keyOutlier,valueOutlier in outlierMethod.items():

        # Filter into fresh variables so every strategy starts from the full training
        # set; overwriting X/y here would make the filters (and the removedSample
        # count) accumulate from one strategy to the next.
        if keyOutlier!="all":
            trainInliers=(valueOutlier.fit_predict(X)!= -1)
            trainX=X[trainInliers]
            trainY=y[trainInliers]
        else:
            trainX,trainY=X,y

        model = ModelsList[row["modelName"]]
        model.fit(trainX, trainY)

        # Training scores do not depend on the test set: predict once, score once.
        trainPredicted=model.predict(trainX)
        trainScores=[score(trainY, trainPredicted) for score in scoreFunctions]

        for keyTest,dataTest in testDataSet.items():

            testY=dataTest["class"].map({"bot": 1, "human": 0})
            # NOTE(review): the test set is normalised with its own freshly fitted
            # transformer rather than `normalizingModels` — confirm this is intended.
            testX=QuantileTransformer(output_distribution="normal",n_quantiles=50).fit_transform(dataTest[SelectedFeatures])

            # The same outlier strategy is re-fitted on the test data and applied to it.
            if keyOutlier!="all":
                testInliers=(valueOutlier.fit_predict(testX)!= -1)
                testX=testX[testInliers]
                testY=testY[testInliers]

            testPredicted=model.predict(testX)
            testScores=[score(testY, testPredicted) for score in scoreFunctions]

            pd.DataFrame (
                [[
                    keyOutlier,
                    (len(data)-len(trainX)),   # training rows removed by this strategy
                    row["featureNumber"],
                    keyTest,
                    row["modelName"],
                    row["selectionMethod"],
                    json.dumps(SelectedFeatures.tolist()),
                    *trainScores,
                    *testScores
                ]],
                columns=model_result
            ).to_sql(con=connection, name='generalization_test_1', if_exists='append',index = False)

## VERSION 2

In [None]:
# VERSION 2 — generalization test on small random test samples, no outlier removal.
#
# `data` and `selectedModels` are not created in this cell; they must already exist in
# the kernel (the VERSION 1 cell assigns both).
# One result row per (selectedModels row, test dataset) is appended to the MySQL
# table 'generalization_test_1'.
connection = sqlalchemy.create_engine('mysql+pymysql://{0}:{1}@{2}/{3}'.format('USER',quote('PASS'),'IP','DB'))


# Classifiers addressable by the 'modelName' value of a selectedModels row.
ModelsList={
    "xgb":XGBClassifier(),
    "ab":AdaBoostClassifier(),
    "gn":GaussianNB(),
    "lr":LogisticRegression(solver='lbfgs',max_iter=1000),
    "rf_100":RandomForestClassifier(n_estimators=100),
    "dt":DecisionTreeClassifier(),
    "et_100":ExtraTreesClassifier(n_estimators=100),
    "knn3":KNeighborsClassifier(n_neighbors=3),
    "knn5":KNeighborsClassifier(n_neighbors=5),
    "svc_n":SVC(),
    "bc_100":BaggingClassifier(n_estimators=100),
    "et_500":ExtraTreesClassifier(n_estimators=500),
    "rf_500":RandomForestClassifier(n_estimators=500),
}

# Load and shuffle every held-out test dataset once, keyed by dataset name.
datasets=["DS4","DS5","DS6","DS7","DS12","DS13","DS14","DS16"]
testDataSet={}
for dataset in datasets:
  testDataSet[dataset] = pd.read_csv(f'PATH/{dataset}.csv.gz',compression='gzip',lineterminator='\n')
  testDataSet[dataset]=testDataSet[dataset].sample(frac=1).reset_index(drop=True)

normalizingModels=QuantileTransformer(output_distribution="normal",n_quantiles=100)

# Column order of every row written to the database (loop-invariant, so built once).
model_result=["featureNumber","DataSet","modelName","selectionMethod","selectedFeatures","Train_Accuracy","Train_F1","Train_Precision","Train_Recall","Train_AUC","Test_Accuracy","Test_F1","Test_Precision","Test_Recall","Test_AUC"]

# Metric functions in the same order as the Train_*/Test_* columns above.
scoreFunctions=(accuracy_score, f1_score, precision_score, recall_score, roc_auc_score)


for index, row in selectedModels.iterrows():

    # 'selectedFeatures' is stored as a Python-style string (single quotes);
    # swap the quotes so it parses as JSON into a list of per-fold feature lists.
    featureList=json.loads(row["selectedFeatures"].replace("'",'"'))

    # Average number of features per cross-validation fold
    # (for a 10-feature selection the per-fold total is sometimes 8, 9 or 10).
    numberOfFeatues=int(np.mean(list(map(len,featureList))))

    # How many folds selected each distinct feature.
    unique, counts = np.unique(np.hstack(featureList), return_counts=True)

    # Keep the most frequently selected features. The ranking is done on the integer
    # counts: stacking names and counts into one array (as before) turns the counts
    # into strings, and a string sort places "10" below "2".
    ranking=np.argsort(-counts, kind="stable")
    SelectedFeatures=unique[ranking][0:numberOfFeatues]

    X = normalizingModels.fit_transform(data[SelectedFeatures])
    y = data['class'].map({"bot": 1, "human": 0})

    model = ModelsList[row["modelName"]]
    model.fit(X, y)

    # Training scores do not depend on the test set: predict once, score once.
    trainPredicted=model.predict(X)
    trainScores=[score(y, trainPredicted) for score in scoreFunctions]

    for keyTest,dataTest in testDataSet.items():

        # Reshuffle, then take the first rows as a small random sample.
        # .loc slicing is label-inclusive, so 0:50 keeps labels 0..50 (up to 51 rows).
        dataTest=dataTest.sample(frac=1).reset_index(drop=True)
        testSample=dataTest.loc[0:50,:]
        testY=testSample["class"].map({"bot": 1, "human": 0})
        # NOTE(review): the sample is normalised with its own freshly fitted
        # transformer rather than `normalizingModels` — confirm this is intended.
        testX=QuantileTransformer(output_distribution="normal",n_quantiles=50).fit_transform(testSample[SelectedFeatures])

        testPredicted=model.predict(testX)
        testScores=[score(testY, testPredicted) for score in scoreFunctions]

        pd.DataFrame (
            [[
                row["featureNumber"],
                keyTest,
                row["modelName"],
                row["selectionMethod"],
                json.dumps(SelectedFeatures.tolist()),
                *trainScores,
                *testScores
            ]],
            columns=model_result
        ).to_sql(con=connection, name='generalization_test_1', if_exists='append',index = False)

## VERSION 3

In [None]:
# VERSION 3 — every classifier in ModelsList, trained on one fixed feature list.
#
# `data` is not created in this cell; it must already exist in the kernel
# (the VERSION 1 cell assigns it).
# One result row per (model, test dataset) is appended to the MySQL table 'test_gn_55'.
from sklearn.metrics import f1_score,recall_score, precision_score, accuracy_score,roc_auc_score
connection = sqlalchemy.create_engine('mysql+pymysql://{0}:{1}@{2}/{3}'.format('USER',quote('PASS'),'IP','DB'))


# Classifiers to evaluate, keyed by the short name stored in the 'modelName' column.
ModelsList={
    "xgb":XGBClassifier(),
    "ab":AdaBoostClassifier(),
    "gn":GaussianNB(),
    "lr":LogisticRegression(solver='lbfgs',max_iter=1000),
    "rf_100":RandomForestClassifier(n_estimators=100),
    "dt":DecisionTreeClassifier(),
    "et_100":ExtraTreesClassifier(n_estimators=100),
    "knn3":KNeighborsClassifier(n_neighbors=3),
    "knn5":KNeighborsClassifier(n_neighbors=5),
    "bc_100":BaggingClassifier(n_estimators=100),
    "et_500":ExtraTreesClassifier(n_estimators=500),
    "rf_500":RandomForestClassifier(n_estimators=500),
    "svc_n":SVC(),
}

# Load and shuffle every held-out test dataset once, keyed by dataset name.
datasets=["DS4","DS5","DS6","DS7","DS12","DS13","DS14","DS15","DS16"]
testDataSet={}
for dataset in datasets:
  testDataSet[dataset] = pd.read_csv(f'PATH/{dataset}.csv.gz',compression='gzip',lineterminator='\n')
  testDataSet[dataset]=testDataSet[dataset].sample(frac=1).reset_index(drop=True)

# One quantile per training row.
normalizingModels=QuantileTransformer(output_distribution="normal",n_quantiles=len(data))



SelectedFeatures=["Screen_name_freq","Name_similarity","Name_freq","diff_days","name_length","screen_name_length","description_length","num_digits_in_name","Name_entropy","Screen_name_entropy"]
X = normalizingModels.fit_transform(data[SelectedFeatures])
y = data['class'].map({"bot": 1, "human": 0})

# Column order of every row written to the database; the row built below must match it.
model_result=["modelName","featureNumber","DataSet","Train_Accuracy","Train_F1","Train_Precision","Train_Recall","Train_AUC","Test_Accuracy","Test_F1","Test_Precision","Test_Recall","Test_AUC"]

# Metric functions in the same order as the Train_*/Test_* columns above.
scoreFunctions=(accuracy_score, f1_score, precision_score, recall_score, roc_auc_score)

for keyModel,valueModel in ModelsList.items():

    model = valueModel
    model.fit(X, y)

    # Training scores do not depend on the test set: predict once, score once.
    trainPredicted=model.predict(X)
    trainScores=[score(y, trainPredicted) for score in scoreFunctions]

    for keyTest,dataTest in testDataSet.items():

        testY=dataTest["class"].map({"bot": 1, "human": 0})
        # NOTE(review): the test set is normalised with its own freshly fitted
        # transformer rather than `normalizingModels` — confirm this is intended.
        testX=QuantileTransformer(output_distribution="normal",n_quantiles=len(dataTest)).fit_transform(dataTest[SelectedFeatures])

        testPredicted=model.predict(testX)
        testScores=[score(testY, testPredicted) for score in scoreFunctions]

        # Values follow the model_result column order. Previously the dataset name was
        # written under 'featureNumber' and the feature-list JSON under 'DataSet'.
        pd.DataFrame (
            [[
                keyModel,
                len(SelectedFeatures),
                keyTest,
                *trainScores,
                *testScores
            ]],
            columns=model_result
        ).to_sql(con=connection, name='test_gn_55', if_exists='append',index = False)