# Instagram Bot Classification

## Import Libraries

In [1]:
import glob, os, json, time
import numpy as np
import networkx as nx
import pandas as pd
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.decomposition import PCA
from tqdm import  tqdm
from scipy import spatial
from InstaScrape.Config.config import *
from Feature_Extraction.Feature_Extraction import get_the_models
from Network.Create_network import extract_network_features

## Get the data

In [62]:
df = pd.read_csv("labels.csv", index_col=0)

df

Unnamed: 0,labels
skills_and_growth,Inorganic
amazingdecorjakarta,Organic
dr.thema,Organic
shrinkchicks,Inorganic
_birb_memes_,Inorganic
...,...
energialifestyles,Inorganic
thediaryofagoddess,Inorganic
think.geography,Inorganic
microbiologylifevilnius,Inorganic


## Functions

In [3]:
def cosine_similarity(mylist):

    if len(mylist) > 1:
        similarities = []
        for i in range(len(mylist)-1):
            for j in range(i+1, len(mylist)):
                similarity = 1 - spatial.distance.cosine(mylist[i], mylist[j])
                similarities.append(similarity)
        similarities = np.array(similarities)

        return {"cos_sim_mean": np.mean(similarities), "cos_sim_var": np.var(similarities) }
    else:
        return {"cos_sim_mean": 0, "cos_sim_var": 0}

In [4]:
def get_mean_embeddings(mylist):
    mylist = np.array(mylist)

    return np.mean(mylist,axis=0)

In [5]:
def getmetadata(acc_p, log_features = ["posts", "followers", "following"]):
    """
    Returns a dictionary that holds specific information about #posts,
    #followers, #followings, isverified or not for each account.
    """
    mydict_ = {}
    name_ = os.path.basename(acc_p)

    with open(acc_p+"/{}.json".format(name_)) as myfile:
        opened = json.load(myfile)

        for fea in log_features:
            mydict_[fea] = opened[fea]
            mydict_[f"{fea}_log"] = np.log1p(opened[fea])

        mydict_["verified"] = opened["verified"]
    return mydict_

In [25]:
def to_df_dict(mydict):

    df_dict = {}

    for account_name in mydict.keys():
        ghost_dict = {}
        for sec_key in mydict[account_name].keys():
            for fea in  mydict[account_name][sec_key].keys():
                if sec_key == "network":
                    for net_fea in mydict[account_name][sec_key][fea].keys():
                        ghost_dict[f"{fea}_{net_fea}"] = mydict[account_name][sec_key][fea][net_fea]
                else:
                    ghost_dict[fea] = mydict[account_name][sec_key][fea]
        df_dict[account_name] = ghost_dict

    return df_dict

In [7]:
def executePCA(df_temp, num_of_fea=100):
    """
    Finds the PCA components of willing feature vector.
    """

    model_ = PCA(n_components=num_of_fea, svd_solver="arpack")
    df_temp = model_.fit_transform(df_temp)
    return df_temp

In [8]:
def expand_cols(dataframe, cols=["image_embedding", "caption_embedding"], pca_pars = [150,100]):
    df_ = dataframe.copy()

    for indx, col in enumerate(cols):
        arr = np.vstack(df_[col])
        arr = executePCA(arr, pca_pars[indx])
        df_temp = pd.DataFrame(arr, index=df_.index, columns=[f"{col}_{x}" for x in range(arr.shape[1])])
        df_ = df_.merge(df_temp, left_index=True, right_index=True)

    df_.drop(cols, axis=1, inplace=True)
    return df_


In [9]:
def normalize_columns(df_trn, df_tst, sca_type= "MinMaxScaler"):
    """
    Normalizes all numeric columns by using MinMaxScaler

    :param df_trn: DataFrame
    :param df_tst: DataFrame
    :return: DataFrame, DataFrame
    """

    if sca_type == "MinMaxScaler":
        scaler = MinMaxScaler()

    else:
        scaler = StandardScaler()

    df_train = df_trn.copy()
    df_test = df_tst.copy()
    numeric_cols = df_train.dtypes.index[((df_train.dtypes != "object") & (df_train.dtypes != bool))]

    ct = ColumnTransformer([('SS', scaler, numeric_cols)], remainder='passthrough')
    df_train[numeric_cols] = ct.fit_transform(df_train[numeric_cols])
    df_test[numeric_cols] = ct.transform(df_test[numeric_cols])

    return df_train, df_test


In [10]:
def convert_bools(dataframe):
    """
    Converts all boolean and string True/False elements to 1 and 0.
    True: 1
    False: 0

    :param dataframe: DataFrame
    :return: DataFrame
    """

    df = dataframe.copy()
    none_int_cols = df.dtypes.index[(df.dtypes != "int64") == True]
    df[none_int_cols] = df[none_int_cols].applymap(lambda x: 0 if x == "False" or x == False else x)
    df[none_int_cols] = df[none_int_cols].applymap(lambda x: 1 if x == "True" or x == True else x)

    return df

## Main

In [None]:
pbar = tqdm(glob.glob(f"{DATA_PATH_}/*"))
mynetworks = ["comment", "hashtag", "tag"]
feature_dict ={}

for folder_path in pbar:
    acc_name = os.path.basename(folder_path)

    files = glob.glob(f"{folder_path}/[!{acc_name},!features]*.json")
    meta_file = f"{folder_path}/{acc_name}.json"
    image_text_features = f"{folder_path}/features.json"

    image_embeddings = []
    caption_embeddings = []
    comment_similarities = []


    # IMAGE AND TEXT FEATURES
    with open(image_text_features) as features_file:
        data = json.load(features_file)

        for post in data:
            pbar.set_description(f"Collecting Features of {acc_name} => {post}: ")


            image_embedding =  data[post]["image_features"]
            caption_embedding = data[post]["text_features"]["caption"]
            comment_embedding = data[post]["text_features"]["comments"]
            comment_embedding = [np.array(x) for x in comment_embedding]

            comments_cos_sim = cosine_similarity(comment_embedding)

            image_embeddings.append(image_embedding)
            caption_embeddings.append(caption_embedding)
            comment_similarities.append(comments_cos_sim["cos_sim_mean"])

    # NETWORK FEATURES
    network_files = [nx.readwrite.gexf.read_gexf(f"{folder_path}/{acc_name}_{network}_network.gexf") for network in mynetworks]


    feature_dict[acc_name] = {"image": {"image_embedding": get_mean_embeddings(image_embeddings),
                                    "image_cos_sim_mean": cosine_similarity(image_embeddings)["cos_sim_mean"],
                                    "image_cos_sim_var": cosine_similarity(image_embeddings)["cos_sim_var"]},
                              "text": {"caption_embedding": get_mean_embeddings(caption_embeddings),
                                    "caption_cos_sim_mean": cosine_similarity(caption_embeddings)["cos_sim_mean"],
                                    "caption_cos_sim_var":cosine_similarity(caption_embeddings)["cos_sim_var"],
                                    "comments_cos_sim_mean": cosine_similarity(comment_similarities)["cos_sim_mean"]},
                             "network": {"comment_network": extract_network_features(network_files[0]),
                                   "hashtag_network":extract_network_features(network_files[1]),
                                   "tag_network":extract_network_features(network_files[2])},
                              "metadata": getmetadata(folder_path)
                             }

In [63]:
df_raw = pd.DataFrame.from_dict(to_df_dict(feature_dict), orient="index")

In [64]:
df_raw = expand_cols(df_raw)

In [65]:
df_raw.fillna(0, inplace=True)

In [66]:
df = pd.merge(df_raw, df, left_index=True, right_index=True)

### Train-Test Split

In [68]:
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state= 42)


le = LabelEncoder()
le.fit(df["labels"]).classes_

array(['Inorganic', 'Organic'], dtype=object)

In [69]:
df["labels"] = df["labels"].apply(lambda x: 0 if x == "Inorganic" else 1)

In [70]:
y = df["labels"]
X = df.drop("labels", axis=1)

In [74]:
# Parameter
param = {'max_depth': 30,
         'eta': 0.001,
         'objective': 'binary:hinge',
         "nthread" : 4,
         "eval_metric" : "auc"}

num_round = 800

In [75]:
# Stratified train-test split
f1_scores = []

for train_index, test_index in sss.split(X,y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = pd.DataFrame(y).iloc[train_index], pd.DataFrame(y).iloc[test_index]

    oversample = SMOTE()
    X_train, y_train = oversample.fit_resample(X_train,y_train)
    X_train, X_test = normalize_columns(X_train, X_test)
    X_train = convert_bools(X_train)
    X_test = convert_bools(X_test)

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    bst = xgb.train(param, dtrain, num_round, evallist)
    mypreds = bst.predict(dtest)
    f1_scores.append(f1_score(y_test, mypreds, average="binary"))

[0]	eval-auc:0.50000	train-auc:0.50000
[1]	eval-auc:0.50000	train-auc:0.50000
[2]	eval-auc:0.50000	train-auc:0.50000
[3]	eval-auc:0.50000	train-auc:0.50000
[4]	eval-auc:0.50000	train-auc:0.50000
[5]	eval-auc:0.50000	train-auc:0.50000
[6]	eval-auc:0.50000	train-auc:0.50000
[7]	eval-auc:0.50000	train-auc:0.50000
[8]	eval-auc:0.50000	train-auc:0.50000
[9]	eval-auc:0.50000	train-auc:0.50000
[10]	eval-auc:0.50000	train-auc:0.50000
[11]	eval-auc:0.50000	train-auc:0.50000
[12]	eval-auc:0.50000	train-auc:0.50000
[13]	eval-auc:0.50000	train-auc:0.50000
[14]	eval-auc:0.50000	train-auc:0.50000
[15]	eval-auc:0.50000	train-auc:0.50000
[16]	eval-auc:0.50000	train-auc:0.50000
[17]	eval-auc:0.50000	train-auc:0.50000
[18]	eval-auc:0.50000	train-auc:0.50000
[19]	eval-auc:0.50000	train-auc:0.50000
[20]	eval-auc:0.50000	train-auc:0.50000
[21]	eval-auc:0.50000	train-auc:0.50000
[22]	eval-auc:0.50000	train-auc:0.50000
[23]	eval-auc:0.50000	train-auc:0.50000
[24]	eval-auc:0.50000	train-auc:0.50000
[25]	eval-

In [76]:
f1_scores

[0.09999999999999999,
 0.186046511627907,
 0.2631578947368421,
 0.2926829268292683,
 0.3255813953488372,
 0.21276595744680848,
 0.3333333333333333,
 0.39999999999999997,
 0.3043478260869565,
 0.22727272727272727]