In [10]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score

In [4]:
fake = pd.read_csv('misinfo_data/DataSet_Misinfo_FAKE.csv', index_col=0)
true = pd.read_csv('misinfo_data/DataSet_Misinfo_TRUE.csv', index_col=0)
true = true.dropna()
corpus = pd.concat([fake, true], ignore_index=True)
y = np.concatenate([np.ones(fake.shape[0]), np.zeros(true.shape[0])])

In [5]:
vectorizer = CountVectorizer()
count_matrix = vectorizer.fit_transform(corpus['text'])
transformer = TfidfTransformer()
X = transformer.fit_transform(count_matrix)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=10)

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 5)]
max_depth = [int(x) for x in np.linspace(10, 100, num = 4)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap
}

rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 15, cv = 3, verbose=3, random_state=10, n_jobs = -1)
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 15 candidates, totalling 45 fits
[CV 1/3] END bootstrap=True, max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=800;, score=0.779 total time= 5.1min
[CV 2/3] END bootstrap=True, max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=800;, score=0.780 total time= 5.1min
[CV 3/3] END bootstrap=True, max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=800;, score=0.778 total time= 5.2min
[CV 1/3] END bootstrap=False, max_depth=40, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.896 total time= 7.9min
[CV 2/3] END bootstrap=False, max_depth=40, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.903 total time= 7.9min
[CV 3/3] END bootstrap=False, max_depth=40, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.904 total time= 8.0min
[CV 1/3] END bootstrap=True, max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=400;, score=0.906 total time=1



[CV 2/3] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=800;, score=0.921 total time=58.5min
[CV 1/3] END bootstrap=False, max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=200;, score=0.774 total time= 1.7min
[CV 3/3] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=800;, score=0.926 total time=58.9min
[CV 2/3] END bootstrap=False, max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=200;, score=0.779 total time= 1.8min
[CV 3/3] END bootstrap=False, max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=200;, score=0.787 total time= 1.7min
[CV 1/3] END bootstrap=True, max_depth=100, min_samples_leaf=4, min_samples_split=5, n_estimators=1000;, score=0.905 total time=30.1min
[CV 2/3] END bootstrap=True, max_depth=100, min_samples_leaf=4, min_samples_split=5, n_estimators=1000;, score=0.911 total time=29.6min
[CV 3/3] END bootstrap=True, max_depth=100, min_sam

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=10)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
param_grid = {
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap
}

rf = RandomForestClassifier( n_estimators=200, max_depth = None)
rf_grid = GridSearchCV(estimator = rf, param_grid=param_grid, cv = 3, verbose=3, n_jobs = -1)
rf_grid.fit(X_train, y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV 1/3] END bootstrap=True, min_samples_leaf=1, min_samples_split=10;, score=0.915 total time=14.3min
[CV 2/3] END bootstrap=True, min_samples_leaf=1, min_samples_split=10;, score=0.918 total time=14.3min
[CV 1/3] END bootstrap=True, min_samples_leaf=1, min_samples_split=5;, score=0.916 total time=16.2min
[CV 2/3] END bootstrap=True, min_samples_leaf=1, min_samples_split=5;, score=0.918 total time=16.4min
[CV 3/3] END bootstrap=True, min_samples_leaf=1, min_samples_split=5;, score=0.922 total time=16.6min
[CV 1/3] END bootstrap=True, min_samples_leaf=1, min_samples_split=2;, score=0.917 total time=19.7min
[CV 3/3] END bootstrap=True, min_samples_leaf=1, min_samples_split=2;, score=0.921 total time=19.7min
[CV 2/3] END bootstrap=True, min_samples_leaf=1, min_samples_split=2;, score=0.918 total time=19.8min
[CV 1/3] END bootstrap=True, min_samples_leaf=2, min_samples_split=2;, score=0.913 total time=11.0min
[CV 2/3] END bootst

In [20]:
print(rf_grid.best_estimator_)
print(rf_grid.best_score_)
rf = rf_grid.best_estimator_
rf.fit(X_train, y_train)
y_hat = rf.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_hat))


RandomForestClassifier(bootstrap=False, min_samples_split=5, n_estimators=200)
0.9225496683123802
0.934493815849748
