## Fake instagram account Prediction

In [1]:
#load the libraries

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from joblib import dump, load

In [2]:
data = pd.read_csv('dataset1.csv', sep=';')

In [3]:
data.fake.value_counts()

0    288
1    288
Name: fake, dtype: int64

In [4]:
data.head()

Unnamed: 0,profilePic,lengthUsername,fullnameWords,lengthFullname,username,descriptionLength,externalURL,private,posts,followers,follows,fake
0,1,0.27,0,0.0,0,53,0,0,32,1000,955,0
1,1,0.0,2,0.0,0,44,0,0,286,2740,533,0
2,1,0.1,2,0.0,0,0,0,1,13,159,98,0
3,1,0.0,1,0.0,0,82,0,0,679,414,651,0
4,1,0.0,2,0.0,0,0,0,1,6,151,126,0


### Data Preprocessing

In [5]:
data = data.drop_duplicates()

In [6]:
missing_values = data.isnull().sum()
print(missing_values)

profilePic           0
lengthUsername       0
fullnameWords        0
lengthFullname       0
username             0
descriptionLength    0
externalURL          0
private              0
posts                0
followers            0
follows              0
fake                 0
dtype: int64


In [7]:
data.fake.value_counts()

0    287
1    287
Name: fake, dtype: int64

In [8]:
y = data['fake']
x = data.drop('fake', axis = 1)
columns = x.columns

scaler = StandardScaler()
scaler = scaler.fit(x)
X = scaler.transform(x)
features = pd.DataFrame(X, columns = columns)

In [9]:
features.head()

Unnamed: 0,profilePic,lengthUsername,fullnameWords,lengthFullname,username,descriptionLength,externalURL,private,posts,followers,follows
0,0.654111,0.505808,-1.386316,-0.289267,-0.190003,0.805681,-0.363524,-0.788333,-0.1876,-0.092867,0.485583
1,0.654111,-0.768414,0.512838,-0.289267,-0.190003,0.567012,-0.363524,-0.788333,0.443721,-0.090957,0.026159
2,0.654111,-0.29648,0.512838,-0.289267,-0.190003,-0.599814,-0.363524,1.268499,-0.234825,-0.09379,-0.447419
3,0.654111,-0.768414,-0.436739,-0.289267,-0.190003,1.574726,-0.363524,-0.788333,1.420528,-0.09351,0.154623
4,0.654111,-0.768414,0.512838,-0.289267,-0.190003,-0.599814,-0.363524,1.268499,-0.252223,-0.093799,-0.416936


### Modelling

In [19]:
x_train, x_test, y_train, y_test = train_test_split(features, y, test_size = 0.3, random_state = 42)

In [20]:
model = RandomForestClassifier(n_estimators=300, bootstrap = True, max_features = 'sqrt')
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [21]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.94      0.93        94
           1       0.92      0.90      0.91        79

    accuracy                           0.92       173
   macro avg       0.92      0.92      0.92       173
weighted avg       0.92      0.92      0.92       173



In [22]:
from sklearn.metrics import accuracy_score
accuracy=accuracy_score(y_test, y_pred)
print(accuracy)

0.9190751445086706


### Implementing HummingBird Algorithm  to optimize the model

In [23]:
def objective_function(x):
    # Define the objective function that you want to optimize, accuracy/precision/recall...
    model = RandomForestClassifier(n_estimators=x, bootstrap = True, max_features = 'sqrt')
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    accuracy=accuracy_score(y_test, y_pred)
    return accuracy

In [24]:
import random
def hummingbird_algorithm(objective_function, bounds, population_size, max_iterations):
    # Initialize population
    population = np.random.uniform(bounds[0], bounds[1], (population_size, 1)) #generate random numbers between bound[0] and...
    best_solution = None #it is the best population
    best_fitness = float('inf') #best fitness is greater than number in the 1st iteration
    for iteration in range(max_iterations):
        # Flapping wing motion and nectar source update
        for i in range(population_size): 
            # Flapping wing motion is used to generate new solutions
            numberEstimators = int(population[i])
            new_solution = numberEstimators + random.randint(bounds[0],bounds[1]) #adding a random perturbation to the solution
            new_fitness = objective_function(new_solution)
            # Nectar source update is used to update the population 
            if new_fitness < objective_function(numberEstimators):
                population[i] = new_solution
            # Update best solution
            if new_fitness < best_fitness: 
                best_solution = new_solution
                best_fitness = new_fitness
    return best_solution, best_fitness

In [25]:
bounds = [1, 20] #the search space of the algorithm
population_size = 10 #number of hummingBirds
max_iterations = 20 #the number of times the population will be updated
best_solution, best_fitness = hummingbird_algorithm(objective_function, bounds, population_size, max_iterations)
print("Best solution:", best_solution)
print("Best fitness:", best_fitness)

Best solution: 3
Best fitness: 0.8554913294797688
