In [5]:
import pandas as pd
import gensim
from gensim.models import KeyedVectors

## Test on 10k rows

### Word2Vec

In [6]:
df = pd.read_csv('tokenized_dataframe')

In [100]:
filename = 'GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(filename, binary=True)

In [13]:
df['vector'] = df['content'].apply(lambda x: model.get_mean_vector(x))

In [18]:
x, y = df['vector'], df['type']

In [20]:
from sklearn.model_selection import train_test_split
x_train, x_nontrain, y_train, y_nontrain = train_test_split(x,y, test_size=0.2)

In [21]:
x_val, x_test, y_val, y_test = train_test_split(x_nontrain, y_nontrain, test_size=0.5)

In [22]:
from sklearn.svm import LinearSVC

In [23]:
y_train = y_train.astype('int64')

In [None]:
y_val = y_val.astype('int64')

In [59]:
import numpy as np

In [69]:
x_train_2d = np.stack(x_train)
x_val_2d = np.stack(x_val)

### SVM

In [74]:
clf = LinearSVC()

In [76]:
from sklearn.model_selection import GridSearchCV

In [120]:
svm_clf = LinearSVC(dual="auto")

svm_parameters = [{
    "C": [0.001, 0.01, 0.1, 1, 10,10000] 
}]

svm_grid_search = GridSearchCV(svm_clf, svm_parameters, cv=3, scoring="accuracy", return_train_score=True)

svm_grid_search.fit(x_train_2d, y_train)

In [121]:
best_svm = svm_grid_search.best_estimator_
best_svm

In [122]:
best_svm.score(x_val_2d,y_val)

0.6977401129943502

### LR

In [96]:
from sklearn.linear_model import LogisticRegression

In [98]:
lr_model = LogisticRegression()

In [101]:
lr_model.fit(x_train_2d, y_train)

In [102]:
lr_model.score(x_val_2d, y_val)

0.6299435028248588

## K nearest neighbors

In [105]:
from sklearn.neighbors import KNeighborsClassifier

In [117]:
n_neigh = KNeighborsClassifier()

parameters = [{
    "n_neighbors": [1, 3, 5, 7, 10], "weights": ["uniform", "distance"]
}]

grid_search = GridSearchCV(n_neigh, parameters, cv=3, scoring="accuracy", return_train_score=True)

grid_search.fit(x_train_2d, y_train)

In [118]:
best_k_nearest = grid_search.best_estimator_
best_k_nearest

In [119]:
best_k_nearest.score(x_val_2d,y_val)

0.6836158192090396

### Gradient Boosting Classifer

In [123]:
from sklearn.ensemble import GradientBoostingClassifier

In [124]:
gbc = GradientBoostingClassifier()

In [125]:
gbc.fit(x_train_2d, y_train)

In [126]:
gbc.score(x_val_2d,y_val)

0.7090395480225988

### Balancing dataset full dataset

In [166]:
df_full = pd.read_csv('full_tokenized_dataframe')

In [168]:
fakes = df_full['type'].value_counts()[1]
print(fakes)

104883


In [169]:
reliables = df_full['type'].value_counts()[0]
print(reliables)

218563


In [170]:
fake_df = df_full[df_full['type'] == 1]

In [171]:
fake_df.shape

(104883, 3)

In [172]:
real_df = df_full[df_full['type'] == 0]

In [173]:
real_df.shape

(218563, 3)

In [174]:
fake_sample = fake_df.sample(n=50000, random_state=111)

In [175]:
fake_sample.shape

(50000, 3)

In [176]:
real_sample = real_df.sample(n=50000, random_state=111)

In [177]:
merged_df = pd.concat([fake_sample, real_sample])

In [178]:
merged_df.head()

Unnamed: 0.1,Unnamed: 0,type,content
317330,975290,1,"['main', 'stream', 'media', 'inflat', 'vs', 'd..."
153632,469088,1,"['republican', 'insid', 'wh', 'congress', 'mee..."
50383,145108,1,"['bit', 'coin', 'break', 'overnight', 'coin', ..."
76578,229746,1,"['welcom', 'three', 'percent', 'get', 'tactic'..."
154125,470644,1,"['news', 'dion', 'nasa', 'cassini', 'spacecraf..."


## Same procedure with full balanced dataset (50k each)

In [179]:
merged_df['vector'] = merged_df['content'].apply(lambda x: model.get_mean_vector(x))

In [180]:
x1, y1 = merged_df['vector'], merged_df['type']

In [181]:
x_train1, x_nontrain1, y_train1, y_nontrain1 = train_test_split(x1,y1, test_size=0.2)

In [182]:
x_val1, x_test1, y_val1, y_test1 = train_test_split(x_nontrain1, y_nontrain1, test_size=0.5)

In [183]:
y_train1 = y_train1.astype('int64')

In [184]:
y_val1 = y_val1.astype('int64')

In [185]:
x_train_2d_1 = np.stack(x_train1)
x_val_2d_1 = np.stack(x_val1)

#### SVM

In [137]:
svm_model = LinearSVC(dual="auto")

svm_parameters = [{
    "C": [0.001, 0.01, 0.1, 1, 10,10000] 
}]

svm_hyper = GridSearchCV(svm_model, svm_parameters, cv=3, scoring="accuracy", return_train_score=True)

svm_hyper.fit(x_train_2d_1, y_train1)


KeyboardInterrupt



In [186]:
svm_clf = LinearSVC(C=100)

In [187]:
svm_clf.fit(x_train_2d_1, y_train1)



In [188]:
svm_clf.score(x_val_2d,y_val)

0.635593220338983