In [None]:
!pip install tensorflow
!pip install tensorflow_text
!pip install sklearn

In [None]:
import os
import pandas as pd
import numpy as np

import tensorflow as tf
import tensorflow_text
import tensorflow_hub as hub
from tqdm import tqdm

# Single seed shared by every stochastic step in the notebook; it is applied to both
# TensorFlow's and NumPy's global random generators.
RANDOM_SEED = 42
tf.random.set_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [None]:
# Multilingual Universal Sentence Encoder (large, version 3), fetched from TensorFlow Hub.
# `use` is called on tweet text further down to obtain embeddings.
USE_MODEL_URL = "https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3"
use = hub.load(USE_MODEL_URL)

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab VM, then make the notebook's data folder the
# working directory.
GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)
os.chdir("/content/gdrive/MyDrive/Colab Notebooks/data")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Read the COVID CSV from the mounted Drive data folder into `df`.
COVID_CSV_PATH = "/content/gdrive/MyDrive/Colab Notebooks/data/custom_covid.csv"
df = pd.read_csv(COVID_CSV_PATH)

## Random Forest Classifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [None]:
# Keep only the two columns the rest of the notebook reads: the tweet text and its
# 0/1 `legit` label. (The former `df["text"] = df["text"]` style self-assignments were
# no-ops and have been dropped; a missing column still raises KeyError here.)
df = df[["text", "legit"]]

# Partition by label: legit == 0 rows are treated as fake, legit == 1 rows as real.
fake_tweets = df[df.legit == 0]
real_tweets = df[df.legit == 1]

# Class sizes, real first, as a quick balance check.
print(real_tweets.shape, fake_tweets.shape)

# Aliases read by the cell that concatenates the two classes.
fake_df = fake_tweets
real_df = real_tweets

(15, 2) (15, 2)


In [None]:
# Stack the real rows first, then the fake rows, renumbering the index from 0.
tweet_df = pd.concat([real_df, fake_df], ignore_index=True)
tweet_df.shape

(30, 2)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the `legit` label into a dense array with one column per class.
# The encoder's default output is a SciPy sparse matrix, so it is densified with
# `.toarray()` rather than by passing `sparse=False`: that keyword was renamed to
# `sparse_output` in scikit-learn 1.2 and removed in 1.4, where it raises TypeError.
# `.toarray()` behaves the same on old and new releases.
legit_column = tweet_df.legit.to_numpy().reshape(-1, 1)  # encoder expects a 2-D input
type_one_hot = OneHotEncoder().fit_transform(legit_column).toarray()

In [None]:
# Hold out 10% of the tweets (text plus one-hot labels) for evaluation; the fixed
# seed keeps the split repeatable.
train_tweets, test_tweets, y_train, y_test = train_test_split(
    tweet_df["text"],
    type_one_hot,
    test_size=0.1,
    random_state=RANDOM_SEED,
)

In [None]:
# Embed each training tweet on its own with the sentence encoder and flatten the
# result to a 1-D vector; the vectors are stacked into one 2-D array.
X_train = np.array(
    [tf.reshape(use(tweet), [-1]).numpy() for tweet in tqdm(train_tweets)]
)

# Alias read by the feature-selection cell.
data_in = X_train

100%|██████████| 27/27 [00:11<00:00,  2.39it/s]


In [None]:
# Same per-tweet embedding as for the training split, applied to the held-out tweets.
test_vectors = [tf.reshape(use(tweet), [-1]).numpy() for tweet in tqdm(test_tweets)]
X_test = np.array(test_vectors)

100%|██████████| 3/3 [00:00<00:00,  6.84it/s]


In [None]:
# Fit an extra-trees ensemble on the training embeddings, then let SelectFromModel keep
# only the embedding dimensions it judges important (prefit=True reuses the fitted model).
# random_state pins the ensemble so re-running this cell selects the same features
# instead of drawing from whatever state the global NumPy generator is in.
extratrees = ExtraTreesClassifier(random_state=RANDOM_SEED).fit(X_train, y_train)
select = SelectFromModel(extratrees, prefit=True)
data_in_new = select.transform(data_in)

# NOTE(review): data_in_new is not read by the later cells in this notebook — the random
# forest is trained on the full-width embeddings. Confirm whether the reduced matrix was
# meant to be used (the evaluation features would then need select.transform as well).
print(data_in.shape, data_in_new.shape)

(27, 512) (27, 206)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train on every embedded training tweet and evaluate on the embedded hold-out set.
# X_train is deliberately NOT re-split here: doing so rebinds y_train to a shorter array
# (so this cell and the feature-selection cell fail with mismatched lengths on a second
# run), discards the hold-out labels in y_test, and leaves the X_test embeddings unused.
# train_tweets / test_tweets are rebound to the embedding matrices because the evaluation
# cells below pass test_tweets straight to the classifier.
train_tweets, test_tweets = X_train, X_test
assert len(train_tweets) == len(y_train), "training features/labels are misaligned"
assert len(test_tweets) == len(y_test), "hold-out features/labels are misaligned"

# random_state makes the forest repeatable when the cell is re-run.
classif = RandomForestClassifier(n_estimators=50, random_state=RANDOM_SEED)
classif.fit(train_tweets, y_train)

RandomForestClassifier(n_estimators=50)

In [None]:
# Score the fitted forest on the evaluation features/labels and report it as a percentage.
accuracy_pct = classif.score(test_tweets, y_test) * 100
print("Accuracy: ", accuracy_pct)

Accuracy:  66.66666666666666


In [None]:
from sklearn.metrics import confusion_matrix

# Predict on the evaluation features, then collapse the one-hot targets and predictions
# back to class indices with argmax (index 0 = legit 0 / fake, index 1 = legit 1 / real).
result = classif.predict(test_tweets)

# labels=[0, 1] forces a 2x2 matrix even when the very small evaluation split happens to
# contain (and predict) a single class; without it the matrix can shrink to 1x1 and the
# rate computations that index [0][1] and [1][0] raise IndexError.
conf_mat = confusion_matrix(
    y_test.argmax(axis=1),
    result.argmax(axis=1),
    labels=[0, 1],
)

In [None]:
# Rows are the true class and columns the predicted class, in index order
# (0 = fake, 1 = real).
conf_mat

array([[1, 0],
       [1, 1]])

In [None]:
def _rate_pct(count, total):
    """Return count / total as a percentage, or NaN when total is zero.

    The explicit zero check avoids NumPy's 0/0 RuntimeWarning when the evaluation
    split contains no samples of a class.
    """
    return count / total * 100 if total else float("nan")


# Row 0 holds the truly-fake tweets and row 1 the truly-real ones, so with "real" as the
# positive class: false positives are fakes predicted real, false negatives are reals
# predicted fake. Each rate is relative to the size of its true class.
actual_fake, actual_real = conf_mat[0], conf_mat[1]
print("False positives: ", _rate_pct(actual_fake[1], actual_fake.sum()))
print("False negatives: ", _rate_pct(actual_real[0], actual_real.sum()))

False positives:  0.0
False negatives:  50.0
