# Simple manual model
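- Use EDA findings: each rule below predicts "fake" (label 1) for a title when it fires
    - Long title (more than 20 words)
    - Too many special characters (more than 5 characters that are neither word characters nor whitespace)
    - Contains click bait (the `_mytag_parentheses_` tag, e.g. a trailing "(VIDEO)")
    - Contains a politician's name (`donald`, `obama`, `hillary`, `bernie`)
    - Contains slang (the `_mytag_slang_` tag)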

In [1]:
from freq_utils import *
import regex as re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB 
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, precision_score, recall_score, f1_score

# Allow up to 200 characters per cell when frames are displayed.
pd.set_option('display.max_colwidth', 200)

In [2]:
# Rows from the "True" file get label 0 and rows from the "Fake" file get
# label 1. The three dropped columns are not used by the rules below.
df0 = (
    pd.read_csv('data/TrueOrganized.csv')
    .assign(label=0)
    .drop(columns=['title', 'pos', 'cleaned_pos'])
)
df1 = (
    pd.read_csv('data/FakeOrganized.csv')
    .assign(label=1)
    .drop(columns=['title', 'pos', 'cleaned_pos'])
)

# Train/dev/test split

In [3]:
# Split the two labelled frames into train/dev/test parts.
# NOTE(review): argument meanings are inferred from their names (m looks like
# the total number of sampled rows, r_dev/r_test like the dev/test fractions,
# rand_state like the shuffle seed) — confirm against freq_utils.
train, dev, test = train_dev_test_split(
    [df0, df1],
    m=40000,
    class_column='label',
    class_balance=True,
    r_dev=0.2,
    r_test=0.2,
    rand_state=42,
)


In [4]:
# Peek at one training row. The fixed seed (same value as the split's
# rand_state) keeps the displayed row stable across Restart & Run All.
display(train.sample(1, random_state=42))

Unnamed: 0,cleaned_words,minimal_words,org_title,lower_title,label
219,satanists brilliantly troll anti gay baker by requesting a birthday cake for lucifer _mytag_parentheses_,satanist brilliantly troll anti gay baker request birthday cake lucifer _mytag_parentheses_,Satanists Brilliantly Troll Anti-Gay Baker By Requesting A Birthday Cake For Lucifer (VIDEO),satanists brilliantly troll anti-gay baker by requesting a birthday cake for lucifer (video),1


In [5]:
def simple_Manual(test, max_title_words=20, max_special_chars=5):
    """Evaluate each hand-written rule as its own binary classifier.

    A rule predicts 1 for a row when it fires and 0 otherwise. Label 1 is the
    value this notebook assigns to rows loaded from the "Fake" CSV file.

    Parameters
    ----------
    test : pandas.DataFrame
        Frame providing the columns ``label``, ``lower_title`` and
        ``minimal_words``.
    max_title_words : int, default 20
        The ``too_long`` rule fires when the whitespace-split title has more
        words than this.
    max_special_chars : int, default 5
        The ``noisy`` rule fires when the title holds more characters than
        this that are neither whitespace nor word characters.

    Returns
    -------
    cut_name : list of str
        Rule names, in the same order as ``y_pred``.
    y_true : numpy.ndarray
        The ``label`` column of ``test``.
    y_pred : list of numpy.ndarray
        One integer 0/1 vector per rule, each with one entry per row.
    """
    y_true = test.label.to_numpy()

    # Missing text is treated as empty, so a NaN cell means "rule did not
    # fire" rather than raising or leaking NaN into a prediction vector.
    titles = test['lower_title'].fillna('').astype(str)
    words = test['minimal_words'].fillna('').astype(str)

    # Compiled once instead of once per row. A raw string is required because
    # '\s' and '\w' are not valid escapes in an ordinary string literal.
    special_char = re.compile(r'[^\s\w]')

    # Series.map keeps the result one-dimensional even for an empty frame,
    # where row-wise DataFrame.apply would return a DataFrame.
    too_long = titles.map(lambda title: len(title.split()) > max_title_words)
    noisy = titles.map(
        lambda title: len(special_char.findall(title)) > max_special_chars
    )
    y_pred = [too_long.to_numpy(dtype=int), noisy.to_numpy(dtype=int)]

    # clickbait tag, slang tag, then politicians' names
    trigger_word = ['_mytag_parentheses_', '_mytag_slang_',
                    'donald', 'obama', 'hillary', 'bernie']

    for word in trigger_word:
        # regex=False: the triggers are literal substrings, not patterns.
        fired = words.str.contains(word, regex=False)
        y_pred.append(fired.to_numpy(dtype=int))

    # The two tag triggers get readable names; the name triggers name themselves.
    cut_name = ['too_long', 'noisy', 'clickbait', 'slang'] + trigger_word[-4:]

    return cut_name, y_true, y_pred

In [6]:
# Run every manual rule on the test split: cut_name holds the rule names,
# y_true the labels, and y_pred one prediction vector per rule.
cut_name, y_true, y_pred = simple_Manual(test)

In [7]:

# Report the four classification metrics for each rule, one rule per block.
metric_table = (
    ('accuracy:', accuracy_score),
    ('precision:', precision_score),
    ('recall:', recall_score),
    ('f1 score:', f1_score),
)

for rule_name, rule_pred in zip(cut_name, y_pred):
    print(rule_name)
    for caption, metric in metric_table:
        print(caption, metric(y_true, rule_pred))
    print('\n')

too_long
accuracy: 0.55025
precision: 1.0
recall: 0.09484276729559749
f1 score: 0.17325367647058823


noisy
accuracy: 0.585875
precision: 0.9378306878306878
recall: 0.17836477987421384
f1 score: 0.299725216656098


clickbait
accuracy: 0.690875
precision: 1.0
recall: 0.3778616352201258
f1 score: 0.5484754427606353


slang
accuracy: 0.519
precision: 1.0
recall: 0.031949685534591196
f1 score: 0.06192101413944418


donald
accuracy: 0.521625
precision: 0.9457831325301205
recall: 0.03949685534591195
f1 score: 0.07582709490461241


obama
accuracy: 0.54575
precision: 0.7651632970451011
recall: 0.12377358490566037
f1 score: 0.21307925508878303


hillary
accuracy: 0.549625
precision: 0.979381443298969
recall: 0.09559748427672957
f1 score: 0.17419206967682788


bernie
accuracy: 0.509625
precision: 0.9642857142857143
recall: 0.013584905660377358
f1 score: 0.026792359216075413




# Conclusion

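- Every rule has high precision (0.77–1.00) but low recall (0.01–0.38), so on this roughly balanced test set accuracy stays only slightly above the 0.5 chance level (0.51–0.59) for every rule except clickbait, which reaches 0.69.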