# Simple manual model
- Use EDA findings
    - Long title size
    - Too many special characters
    - Contains click bait
    - Contains first name
    - Contains slang

In [6]:
from freq_utils import *
import regex as re
from collections import Counter


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB 
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, precision_score, recall_score, f1_score

pd.options.display.max_colwidth = 200

In [2]:
df0 = pd.read_csv('data/TrueOrganized.csv')
df1 = pd.read_csv('data/FakeOrganized.csv')

df0['label']=0
df1['label']=1

# Train/dev/test split

In [3]:
train, dev, test = train_dev_test_split([df0, df1], m=10000, class_column='label', 
                                        class_balance=True, r_dev=0.2, r_test=0.2, rand_state=42)


# Examine classification items

In [5]:
display(train.sample(1))

Index(['org_title', 'lower_title', 'cleaned_words', 'cleaned_pos',
       'minimal_words', 'label'],
      dtype='object')


Unnamed: 0,org_title,lower_title,cleaned_words,cleaned_pos,minimal_words,label
9719,OBAMACARE PRIVACY PRACTICES IN QUESTION AS CRITICS VOICE CONCERNS,obamacare privacy practices in question as critics voice concerns,obamacare privacy practices in question as critics voice concerns,NN NN NNS NN NN IN NNS NN NNS,obamacare privacy practice question critic voice concern,1


In [11]:
for x in df0.columns[:-1]:
    print(x,'=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*')
    words0 = df0[x].str.split().explode().tolist()
    words1 = df1[x].str.split().explode().tolist()
    
    wc = Counter(words0)
    freq_word0 = [x[0] for x in wc.most_common(1000)]
    #print(freq_word0)

    wc = Counter(words1)
    freq_word1 = [x[0] for x in wc.most_common(1000)]
    #print(freq_word1)

    title_only_in_real = []
    for x in freq_word0:
        if not x in freq_word1:
            title_only_in_real.append(x)
        
    title_only_in_fake = []
    for x in freq_word1:
        if not x in freq_word0:
            title_only_in_fake.append(x)

        
    print('Number of title words only in real: ', len(title_only_in_real))
    print(title_only_in_real[:50])

    print('Number of title words only in fake: ', len(title_only_in_fake))
    print(title_only_in_fake[:50])


org_title =*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*
Number of title words only in real:  874
['says', 'over', 'after', 'Korea', "Trump's", 'tax', 'China', 'bill', 'new', 'U.N.', 'vote', 'EU', 'talks', 'deal', 'against', 'court', 'election', 'calls', 'PM', 'South', 'Factbox:', 'minister', 'government', 'Brexit', 'chief', 'police', '-', 'leader', 'president', 'official', 'sanctions', 'plan', 'probe', 'more', 'military', 'nuclear', 'he', 'up', 'ban', 'urges', 'Saudi', 'no', 'may', 'UK', 'campaign', 'presidential', 'law', 'trade', 'German', 'sources']
Number of title words only in fake:  874
['To', 'For', 'Of', 'On', '[Video]', '(VIDEO)', '[VIDEO]', 'And', 'Is', 'With', 'His', 'About', 'Trump’s', 'WATCH:', 'Just', 'By', 'From', 'He', 'Who', 'Over', 'THE', 'This', 'TRUMP', 'Her', 'That', 'Him', 'Are', 'TO', 'Be', 'Will', 'GOP', 'Out', 'OF', '–', 'It', 'Was', 'BREAKING:', 'Has', 'You', 'Black', 'Why', 'OBAMA', 'Up', 'Not', 'Gets', 'Against', 'HILLARY', 'Have', 'Says', 'Their']
lower_title =*=*=*=*=*=

In [12]:
def simple_Manual(test):
    
    y_true = test.label.to_numpy()
    y_pred = []
    
    # title size > 20?
    y_pred.append( test.apply(lambda row: 1 if len(row['lower_title'].split())> 20 else 0, axis=1).to_numpy() )
    # noise > 3 
    y_pred.append( test.apply(lambda row: 1 if len(re.findall(re.compile('[^\s\w]'), row['lower_title'])) > 5 else 0, axis=1).to_numpy() )
    # clickbait, slang, first names
    trigger_word = ['_mytag_slang_',
                    'donald','obama','hillary','bernie']

    for i in range(len(trigger_word)):
        y_pred.append( test.minimal_words.str.contains(trigger_word[i]).to_numpy()*1 )

    cut_name = ['too_long','noisy','clickbait','slang'] + trigger_word[-4:]

    return cut_name, y_true, y_pred

In [13]:
cut_name, y_true, y_pred = simple_Manual(test)

In [14]:

for i in range(len(y_pred)):
    
    #print(type(y_true),type(y_pred[i]))
    print(cut_name[i])
    print('accuracy:',accuracy_score(y_true, y_pred[i]))
    print('precision:',precision_score(y_true, y_pred[i]))
    print('recall:',recall_score(y_true, y_pred[i]))
    print('f1 score:',f1_score(y_true, y_pred[i]))
    print('\n')

too_long
accuracy: 0.542
precision: 1.0
recall: 0.09306930693069307
f1 score: 0.17028985507246375


noisy
accuracy: 0.5875
precision: 0.964824120603015
recall: 0.1900990099009901
f1 score: 0.3176178660049628


clickbait
accuracy: 0.5095
precision: 1.0
recall: 0.028712871287128714
f1 score: 0.055822906641000966


slang
accuracy: 0.5135
precision: 0.9512195121951219
recall: 0.03861386138613861
f1 score: 0.07421503330161751


donald
accuracy: 0.532
precision: 0.7341772151898734
recall: 0.11485148514851486
f1 score: 0.19863013698630136


obama
accuracy: 0.541
precision: 0.9791666666666666
recall: 0.09306930693069307
f1 score: 0.16998191681735986


hillary
accuracy: 0.502
precision: 0.9375
recall: 0.01485148514851485
f1 score: 0.02923976608187134




# Conclusion

- Accuracy is not better than random selection in any case, but precisions are high.