# Organize with EDA
For now I only look at the title, but eventually both the title and the text will be used.
- For future: mind dots (handle dots without hurting sentence tokenization) in text
- Ignore comment about text for now. 

In [1]:
from freq_utils import *
from nltk import pos_tag
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.help import upenn_tagset
from nltk.tokenize import TreebankWordTokenizer

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag, RegexpParser
from nltk.corpus import stopwords, wordnet

import regex as re

from collections import Counter
import time

pd.options.display.max_colwidth = 200

In [2]:
# df0_org: rows of data/True.csv, df1_org: rows of data/Fake.csv
df0_org, df1_org = pd.read_csv('data/True.csv'), pd.read_csv('data/Fake.csv')

In [3]:
# Drop data we don't use (from eda_raw.ipynb): only the title column is kept
df0_org = df0_org.drop(columns=['text', 'subject', 'date'])
df1_org = df1_org.drop(columns=['text', 'subject', 'date'])

# drop_duplicates() returns a new frame, so the result has to be assigned back;
# calling it without assignment leaves the duplicated rows in place.
df0_org = df0_org.drop_duplicates()
df1_org = df1_org.drop_duplicates()

# Keep only fake-news titles with more than two whitespace-separated words
df1_org = df1_org[df1_org.title.str.split().str.len()>2]
#df0 = df0[df0.text.str.split().str.len()>19]

In [4]:
# To compare modification results later, work on independent copies:
# a plain assignment would only alias the *_org frames, so any in-place
# change to df0/df1 would silently alter the originals as well.
df0 = df0_org.copy()
df1 = df1_org.copy()

In [5]:
def print_sentences_with_this_string(this_string, column_to_look, df_list, df_names, 
                                     print_words=False, print_set=False, sent_token=False):
    """Show example rows whose text matches a regular expression, frame by frame.

    For every frame in `df_list` the number of matching rows is printed, a
    random sample of those rows (fixed seed) is collected, and up to 20 of the
    collected examples are displayed with `display`.

    Parameters
    ----------
    this_string : str
        Regular expression to search for.
    column_to_look : str
        Name of the text column that is searched.
    df_list : list of pandas.DataFrame
        Frames to inspect.
    df_names : list of str
        Labels printed next to the counts; indexed in parallel with `df_list`.
    print_words : bool, default False
        Keep the `selected_words` column (the matched substrings) in the
        displayed table.
    print_set : bool, default False
        Sample up to 1000 rows instead of 20, print the 200 most common
        matches and return every collected match.
    sent_token : bool, default False
        Split each text with `sent_tokenize` and keep only the sentences that
        contain a match.

    Returns
    -------
    list of list, or None
        When `print_set` is true: one list of matched substrings per frame, in
        the same order as `df_list` (an empty list for a frame without any
        match, so positions always line up with `df_list`). Otherwise None.
    """
    pat = re.compile(this_string)
    set_list = []

    for i, df in enumerate(df_list):
        matched = df[df[column_to_look].str.contains(this_string, regex=True, na=False)]
        count = matched[column_to_look].count()

        print(this_string,'in',column_to_look,'\n',df_names[i],':',count)

        if count == 0:
            # keep the returned list positionally aligned with df_list
            if print_set:
                set_list.append([])
            continue

        sample_size = 1000 if print_set else 20
        matched = matched.sample(min(len(matched), sample_size), random_state=20)

        # Collect all rows first and build the frame once instead of growing
        # it row by row.
        records = []
        for row_index, text in zip(matched.index, matched[column_to_look]):
            if sent_token:
                hits = [sentence for sentence in sent_tokenize(text) if pat.search(sentence)]
                display_text = ''.join(sentence + ' ' for sentence in hits)
                display_word = [word for sentence in hits for word in pat.findall(sentence)]
            elif pat.search(text):
                display_text = text
                display_word = pat.findall(text)
            else:
                continue
            records.append((row_index, display_text, display_word))

        # set_index returns a new frame; the result must be kept to take effect
        example_df = pd.DataFrame(
            records, columns=['index', 'selected_text', 'selected_words']
        ).set_index('index')

        if print_set:
            word_list = [word for words in example_df.selected_words for word in words]
            print(Counter(word_list).most_common(200))
            set_list.append(word_list)

        if not print_words:
            example_df = example_df.drop(columns=['selected_words'])

        # size the sample from the frame that is actually sampled
        display(example_df.sample(min(len(example_df), 20), random_state=20))

    if print_set:
        return set_list

# Digital source

In [6]:
%%script false --no-raise-error


# Whitespace-delimited tokens that contain at least one '@'
print_sentences_with_this_string('[^\s]*[@]+[^\s]*',        
                                 'title', [df0,df1], ['True','Fake'], print_words=True, print_set=True)

# URL-like tokens: '//' followed by non-space characters with a dot in between
print_sentences_with_this_string('[^\s]*//[^\s]+[.][^\s]+', 
                                 'title', [df0,df1], ['True','Fake'], print_words=True, print_set=True)

### Findings
- Overall, **fake news quote digital sources much more frequently** than real news. 

#### @ in title
- Real news: only one news has it, when its **topic is about social media account**
- Fake news: some are about **social media account**, but some are **slangs** (used like "*")

#### @ in text
- Both real and fake news have @ to **refer social media accounts**.
- 20 times **more frequently** used in fake news

#### website in text
- **Few real news** contains the website address in this dataset.
- **A lot of fake news** contains website address. Examples of them were CNN news, Facebook, and YouTube address.

### Processing
- There are only a few rows, so let's simply replace **@** with the tag **`_mytag_at_`**

### Replace them to tags

In [7]:
# Swap every '@' for the tag '_mytag_at_'; replace() hands back new frames
df0 = df0.replace('@', '_mytag_at_', regex=True)
df1 = df1.replace('@', '_mytag_at_', regex=True)

# Slang

In [8]:
%%script false --no-raise-error


# Whitespace-delimited tokens that contain at least one '*'
print_sentences_with_this_string('[^\s]*[\*]+[^\s]*', 'title', [df0,df1], ['True','Fake'], print_words=True, print_set=True)

### Findings
- No real news has * in titles.
- Some **fake news** have * in **title** to display **slangs**.
- Both real and fake news have **\* in texts**, but it occurs **14 times more frequently in fake news**.
- When * is used **in the text**, it is **not always for slangs** (e.g. to emphasize). 
- It is not **hard to separate** usage of star between **slang and highlighting** based on text pattern.
- I'll mark both of those words as `_mytag_slang_` since they have a common meaning and function, **highlighting**, anyway.

### Processing
- Tag words contain * as **slang** (only for title)

In [9]:
# Replace words containing '*' with the slang tag (both frames are modified in place)
df0.replace(regex=True, inplace=True, to_replace='[^\s]*[\*]+[^\s]*', value='_mytag_slang_')
df1.replace(regex=True, inplace=True, to_replace='[^\s]*[\*]+[^\s]*', value='_mytag_slang_')

# Other special characters
As seen from the slang character tagging, some special character replaces an alphabet character, therefore, blindly removing all special characters might leave some words meaningless.
Let's check how the other special characters are used.

In [10]:
%%script false --no-raise-error

# Every single character that is neither whitespace nor a word character;
# with print_set=True the call returns the collected matches per frame.
sc_title = print_sentences_with_this_string('[^\s\w]', 'title', [df0,df1], ['True','Fake'], print_words=True, print_set=True)

In [11]:
%%script false --no-raise-error

# Frequency of each collected special character: c0 from sc_title[0], c1 from sc_title[1]
c0 = Counter(sc_title[0])
c1 = Counter(sc_title[1])

print(c0)
print(c1)

# The five most frequent characters of c0
print({pair[0] for pair in c0.most_common(5)})

# Characters counted in c1 that are not among those five
sc_fake_only = {key[0] for key in c1} - {pair[0] for pair in c0.most_common(5)}

print(sc_fake_only)

### Findings
Fake news is noisier, with more kinds of special characters. Special characters that are not used in real news might have a special function. Let's take a look.

In [12]:
%%script false --no-raise-error

# For every special character counted in c0, list the titles of both frames
# that contain it.
for x in c0:
    
    #sc_regex = '[\\' + x + ']'
    # Pattern for a whole whitespace-delimited token containing x; the
    # backslash escapes x inside the character class.
    sc_regex = '[^\s]*[\\' + x + '][^\s]*'

    print(x)
    print_sentences_with_this_string(sc_regex, 'title', [df0, df1], ['Real','Fake'], print_words=True, print_set=True)

In [13]:
%%script false --no-raise-error

# For every character in sc_fake_only, list the titles of df1 that contain it.
for x in sc_fake_only:
    
    #sc_regex = '[\\' + x + ']'
    # Pattern for a whole whitespace-delimited token containing x; the
    # backslash escapes x inside the character class.
    sc_regex = '[^\s]*[\\' + x + '][^\s]*'

    print(x)
    print_sentences_with_this_string(sc_regex, 'title', [df1], ['Fake'], print_words=True, print_set=True)

### Findings
Usages
- \# : hashtag, tv show episode, website address 
- % : percent, not used in real news, interestingly
- -- (longer than a hyphen) : hyphen, some slang but ignorable
- ! : exclamation, not used in real news, but remove it so that a real news title that happens to have an exclamation mark is not classified as fake
- [] (): clickbait, emphasis
- } : seems a typo of ] in case of title, in text, it looks like a script. Token with } in text better be removed.
- & : and or special words (e.g. Q&A, AT&T)
- \$ : dollar or slang
- \/ : 9/11, 24/7, or clickbait (e.g. video/image)

### Note for processings

- \/ : **replace to a space**
- [] () {} : **remove with enclosed text** to avoid a strong bias of this dataset
- : : **replace to a space** if it is between two numbers (time), **replace to ;** otherwise 
- ;, ... : **replace to \.**
- Abc. (abbreviation has one dot at the end): **remove dot**

Tokenize sentence. Then

- — : **replace to -** then do the same as below
- \- : **leave it** if it is hyphen (between two words without space), **remove** otherwise
- \$ : **remove** if followed by a number, **replace to \_** otherwise (slangs)
- \& : **replace to "and"** if spaced, **replace to \_** otherwise
- \% : **replace to " percent "**

- \# : remove (words are either special noun or number)
- !, ?, , : remove
- \" : remove

Tokenize word. Then
- \' : remove
- handle abbreviation

# Capital letters

In [14]:
%%script false --no-raise-error

# One character from the class [\s^\w] (whitespace, a literal '^' or a word
# character), then an upper-case letter, then one or more non-space characters.
# NOTE(review): inside a character class '^' is literal unless it comes first,
# so this is not "whitespace or start of string" - confirm the intended pattern.
print_sentences_with_this_string('[\s^\w][A-Z][^\s]+', 'title', 
                                 [df0,df1], ['True','Fake'], print_words=True, print_set=True)

### Findings
As seen in the EDA, words with a capitalized first character are proper nouns. Named entity recognition would recognize some of them (e.g. "Trump"), but would miss others (e.g. "White", "House"). However, bigrams or trigrams would catch such cases.

In [15]:
%%script false --no-raise-error

# 1000 randomly sampled titles (fixed seed) from each frame
title0 = df0.sample(1000, random_state=9).title.tolist()
title1 = df1.sample(1000, random_state=9).title.tolist()

# n-gram lists for n = 2, 3 and 4, built with ngram_tokenizer
# (ngram_tokenizer is not defined in this notebook; presumably it comes from
# freq_utils - verify)
bi_real = ngram_tokenizer(title0, n=2)
bi_fake = ngram_tokenizer(title1, n=2)
tr_real = ngram_tokenizer(title0, n=3)
tr_fake = ngram_tokenizer(title1, n=3)
qd_real = ngram_tokenizer(title0, n=4)
qd_fake = ngram_tokenizer(title1, n=4)

def combine_uppercase_words(lst):
    """Find the frequent n-grams that consist only of capitalised tokens.

    Parameters
    ----------
    lst : iterable of tuple of str
        n-grams, one tuple of tokens per occurrence.

    Returns
    -------
    ngram_lst : list of tuple
        Up to 100 most frequent distinct n-grams, most common first.
    lst_combine : list of tuple
        The members of `ngram_lst` in which every token starts with A-Z.
    """
    rg = re.compile('[A-Z]')

    ngram_lst = [ngram for ngram, _ in Counter(lst).most_common(100)]

    # token[:1] instead of token[0]: an empty token simply fails the check
    # rather than raising IndexError. The length guard keeps an empty n-gram
    # out of the result, because all() of nothing is True.
    lst_combine = [
        ngram for ngram in ngram_lst
        if len(ngram) > 0 and all(rg.search(token[:1]) for token in ngram)
    ]

    return ngram_lst, lst_combine
            
# Print (frequent n-grams, all-capitalised subset) for every n-gram list
for ngram_list in (bi_real, tr_real, qd_real, bi_fake, tr_fake, qd_fake):
    print(combine_uppercase_words(ngram_list),'\n')

bi = combine_uppercase_words(bi_real)[1]
tr = combine_uppercase_words(tr_real)[1]

# Report every capitalised bigram whose tokens all occur in a capitalised trigram
for t in tr:
    trigram_tokens = set(t)
    for b in bi:
        if set(b) <= trigram_tokens:
            print(b, t)

### Note for preprocessings
- Combine \<Capital start\> + \<Capital start\> words (e.g. White+House, North+Korea, Puerto+Rico, Hong+Kong)

# Abbreviation

In [16]:
%%script false --no-raise-error

# Runs of two or more capital letters, optionally followed by one dot
# (US, USA, UN, UK...) -> Add two "_" at the end
print_sentences_with_this_string('[A-Z][A-Z]+[\.]?', 'title', 
                                 [df0,df1], ['True','Fake'], print_words=True, print_set=True)

In [17]:
%%script false --no-raise-error

# Abbreviation with a space between, like U. S.?
# Two capitalised tokens, each ending in a dot, separated by one whitespace character
print_sentences_with_this_string('[A-Z][\w]*[.][\s][A-Z][\w]*[.]', 'title', [df0,df1], ['True','Fake'], print_words=True, print_set=True)

In [18]:
%%script false --no-raise-error

# Abbreviation with lower cases?
# Two dot-terminated parts that each start with a lower-case letter, plus any
# trailing non-space characters
print_sentences_with_this_string('[a-z][\w]*[.][a-z][\w]*[.][^\s]*', 'title', [df0,df1], ['True','Fake'], print_words=True, print_set=True)

In [19]:
%%script false --no-raise-error

# U.S., Dr.
# A capital letter, optional word characters, a dot, then any further word
# characters or dots
print_sentences_with_this_string('[A-Z][\w]*[\.][\w\.]*', 'title', 
                                 [df0,df1], ['True','Fake'], print_words=True, print_set=True)

In [20]:
%%script false --no-raise-error

# single character words: one capital letter plus a dot, with whitespace on both sides
print_sentences_with_this_string('[\s]+[A-Z][\.][\s]+', 'title', 
                                 [df0,df1], ['True','Fake'], print_words=True, print_set=True)

In [21]:
%%script false --no-raise-error

# two character words: two capital letters, each optionally followed by a dot,
# with whitespace on both sides
print_sentences_with_this_string('[\s]+[A-Z][\.]?[A-Z][\.]?[\s]+', 'title', 
                                 [df0,df1], ['True','Fake'], print_words=True, print_set=True)

In [22]:
%%script false --no-raise-error

# Words that have only one dot, at the end: a capital letter, one or more word
# characters, a dot, then whitespace
print_sentences_with_this_string('[A-Z][\w]+[\.][\s]+', 'title', 
                                 [df0,df1], ['True','Fake'], print_words=True, print_set=True)

In [23]:
%%script false --no-raise-error

# Check other dot examples: any whitespace-delimited token that contains a dot
print_sentences_with_this_string('[^\s]*[\.][^\s]*', 'title', 
                                 [df0,df1], ['True','Fake'], print_words=True, print_set=True)

### Findings

- The dot in any abbreviation will interrupt sentence tokenization.
    - Abbreviation at the end of a sentence
        - 'I'm in the **U.S.** That is a news!' tokenized into **one** sentence.
        - 'I'm in the **Dept.** That is a news!' tokenized into **two** sentences.
    - Abbreviation in the middle of a sentence
        - 'I'm in the **U.S.** now!' tokenized into **one** sentence.
        - 'I'm in the **Dept.** now!' tokenized into **two** sentences.

- Due to **irregular capitalization/formatting rules in fake news**, distinguish these words relying on text format **without context** seems **impossible**.
- We can ignore possible spaces between abbreviation because they didn't happen (e.g. U. S.).
- May (month) vs may (modal verb) vs May (name) is hard to distinguish. I'll leave it up to the learning a context.

Here is a note about most frequent words.


#### Bigram/Trigram
- Some words start with upper case should be combined to have meaning (e.g. White+House, North+Korea, Puerto+Rico, Hong+Kong)
- There is no point in using fake news bi/trigrams to replace Capital+Capital words because they are full of noise.
- For 'House', 'Speaker' and 'Ryan', 'Ryan' should be separated.
- For now, let's leave it up to learning

#### Examples of words that should be recognized as a same word
- **US, U.S, U.S. or U.S.A.**: variation of the United States, comes from fake news or typo. 

#### Examples of words that should be recognized as distinct words
- **PM (Prime Minister)**, **P.M. (Post Meridiem)**, and **p.m. (post meridiem)**
- **IS (Islamic State)** and **is (be verb)**
- **No. (number)** and **no (opposite of yes)**
- **IT (information technology)** and **it (pronoun)**

#### Single or two characters words
- They can be removed by **stop word** removal. 
- Most **single letters** are middle-name initials, except **N. in North Korea**. It's ok to drop middle names.
- For **two-character** words, add an extra \_ at the end so that they do not disappear.

#### Words have only one dot at the end
- These words can distort sentence tokenization.
- They are followed by a proper noun (**Dr., Sen., Jr.**), which is ok to be **removed**, or name of month (**Jul.**). Either case, **removing dot** would be enough.

### Note for preprocessings
- Change **N. Korea**, **N.Korea** to **North Korea**
- Combine \<Capital start\> + \<Capital start\> words (e.g. White+House, North+Korea, Puerto+Rico, Hong+Kong)
- Change abbreviations
    - U.N. : \_u_n_
    - Rep. : Rep
    - Sept. : Sep
    - Sen. : Sen
    - Gov. : Gov

# Preprocessings

#### 1. Least interfering preprocessings

1. Replace N. Korea, N.Korea to North Korea (frequently occurring topic)
2. Remove single-letter capital words (e.g. middle name)
3. Abbreviation
    - Remove a dot at the end of a Month word starts with an upper case. (Jul. -> Jul)
    - No. to number
    - U.N. : \_u_n_
    - Rep. : Rep
    - Sept. : Sep
    - Sen. : Sen
    - Gov. : Gov
    - PM : \_p_m_
    - P.M. (Post Meridiem), and p.m. (post meridiem): \_mytag_pm_ (same for a.m.)
    - US, U.S, U.S. or U.S.A.: \_u_s_
4. Special characters 
    - / : a space
    - [] () {} : \_mytag_parentheses_
    - $ : remove (not care about a few usages for slang)
    - & : replace to "and" if spaced, replace to an underbar otherwise
    - % : replace to " percent "
    - \# : replace to an underbar  (words are either special noun or number)
5. Special characters (after abbreviation handling)
    - ... : a space
    - : : ~replace to a space if it is between two numbers (time), replace to . otherwise~ leave it to tokenizer
    - ; : ~replace to \.~ leave it to tokenizer


#### ~2. After sentence tokenization~ No sentence tokenization for title
1. ~Bigram: words that have different meaning if used alone: Combine \<Capital start\> + \<Capital start\> words (e.g. White+House, North+Korea, Puerto+Rico, Hong+Kong) with an underbar, keep capitalization~ leave it to learning
        


#### 3. After word tokenization
1. Special characters
    - — (en dash? em dash?): replace to - (hyphen) then do the same as below
    - \- : replace it to an underbar if it is hyphen (between two words without space), remove otherwise
    - !, ?, , : remove
    - \" : remove
2. Abbreviation: AB, A.B -> \_a_b_


#### 4. After PoS tagging
    - \' : remove    
    - Uncapitalization

<Text items >
<Remove news id (location, reuters) >
<Replace month abbreviations>
<july, jul., Jul, Jul., July : \_mytag_month_july_ >
<May: \_mytag_month_may_ >


## Least interfering preprocessings
Includes handlings for better sentence tokenization

In [24]:
def replace_regex(old_exp, new_exp='', verbose=True):
    """Replace a regex in the module-level frames `df0` and `df1`, in place.

    Parameters
    ----------
    old_exp : str
        Regular expression to look for.
    new_exp : str, default ''
        Replacement text. With the default empty string nothing is replaced;
        the call then only inspects the matches (when `verbose` is true).
    verbose : bool, default True
        Show matching titles before the replacement and titles containing the
        replacement text afterwards.
    """
    if verbose:
        print_sentences_with_this_string(old_exp, 'title', [df0,df1], ['True','Fake'], print_words=True, print_set=True)

    if new_exp == '':
        return

    df0.replace(to_replace=old_exp, value=new_exp, regex=True, inplace=True)
    df1.replace(to_replace=old_exp, value=new_exp, regex=True, inplace=True)

    if verbose:
        print(old_exp,'replaced to',new_exp)
        # new_exp is literal text, not a pattern: escape it so characters such
        # as '.' are searched for literally rather than acting as wildcards.
        print_sentences_with_this_string(re.escape(new_exp), 'title', [df0,df1], ['True','Fake'], print_words=True, print_set=True)


In [25]:
# Normalise "N. Korea" and "N.Korea" (after whitespace or at the start) to "North Korea"
old_exp, new_exp = '(?:[\s]|^)[N][\.][\s]?Korea', ' North Korea'
replace_regex(old_exp, new_exp, verbose=False)

In [26]:
# Remove single-letter capital words (e.g. middle-name initials such as "F.").
# The trailing whitespace is checked with a lookahead instead of being consumed:
# a consumed space cannot serve as the leading whitespace of the next match, so
# the second of two adjacent initials ("H. W.") would otherwise be left behind.
old_exp = r'(?:[\s]|^)[A-Z][\.](?=[\s]|$)'
new_exp = ' '
replace_regex(old_exp, new_exp, verbose=False)


In [27]:
# Normalise frequent abbreviations. Rules are applied one after another in the
# listed order; every pattern requires whitespace or the start of the title in
# front of the abbreviation.
#
# No.                      : keeps its dot, only the surrounding spacing is normalised
# UN, U.N, U.N.            : UN
# Rep. / Sen. / Gov.       : Rep / Sen / Gov (any letter case)
# Sept.                    : Sep, the same token the month rule for "Sep." produces,
#                            so September does not end up as two different words
# PM                       : PM
# P.M., p.m. / A.M., a.m.  : p.m. / a.m.
# US, U.S, U.S., USA, U.S.A.: U.S.
#
# The tag forms considered in the notes (_u_n_, _p_m_, _mytag_pm_, _mytag_am_,
# _u_s_) are not applied here.
ABBREVIATION_RULES = [
    (r'(?:[\s]|^)[N][o][\.]',                               ' No. '),
    (r'(?:[\s]|^)[U][\.]?[N][\.]?(?:[\s]|$)',               ' UN '),
    (r'(?:[\s]|^)[Rr][Ee][Pp][\.]',                         ' Rep '),
    (r'(?:[\s]|^)[Ss][Ee][Pp][Tt][\.]',                     ' Sep '),
    (r'(?:[\s]|^)[Ss][Ee][Nn][\.]',                         ' Sen '),
    (r'(?:[\s]|^)[Gg][Oo][Vv][\.]',                         ' Gov '),
    (r'(?:[\s]|^)[P][M](?:[\s]|$)',                         ' PM '),
    (r'(?:[\s]|^)[Pp][\.][Mm][\.](?:[\s]|$)',               ' p.m. '),
    (r'(?:[\s]|^)[Aa][\.][Mm][\.](?:[\s]|$)',               ' a.m. '),
    (r'(?:[\s]|^)[U][\.]?[S][\.]?[A]?[\.]?(?:[\s]|$)',      ' U.S. '),
]

for old_exp, new_exp in ABBREVIATION_RULES:
    replace_regex(old_exp, new_exp, verbose=False)

In [28]:
# Remove the dot at the end of an abbreviated month name (any letter case) and
# normalise the spelling to the capitalised three-letter form.
# The April pattern is generated with its leading [Aa] like every other month;
# a pattern without it would rewrite "Pr." and never match "Apr.".
# "May" has no dotted abbreviation and is therefore not listed.
MONTH_ABBREVIATIONS = ['Jan', 'Feb', 'Mar', 'Apr', 'Jun', 'Jul',
                       'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

for month in MONTH_ABBREVIATIONS:
    # e.g. 'Jan' -> '(?:[\s]|^)[Jj][Aa][Nn][\.]'
    letters = ''.join('[{}{}]'.format(ch.upper(), ch.lower()) for ch in month)
    old_exp = r'(?:[\s]|^)' + letters + r'[\.]'
    new_exp = ' ' + month + ' '
    replace_regex(old_exp, new_exp, verbose=False)

In [29]:
# Special characters, applied in the listed order:
# /        : a space
# $        : a space (the few slang usages are ignored)
# %        : " percent "
# #        : an underbar (words are either special nouns or numbers)
# &        : "and" when surrounded by whitespace, an underbar otherwise
#            (the spaced rule has to run first)
# [] () {} : _mytag_parentheses_, together with the enclosed text
#
# The bracket rule accepts any enclosed text that contains no further bracket,
# so multi-word groups such as "[VIDEO, TWEETS]" are tagged as well; a pattern
# limited to a single word would leave those groups in the titles.
SPECIAL_CHARACTER_RULES = [
    (r'[\/]',                                  ' '),
    (r'[\$]',                                  ' '),
    (r'[\%]',                                  ' percent '),
    (r'[\#]',                                  '_'),
    (r'[\s][\&][\s]',                          ' and '),
    (r'[\&]',                                  '_'),
    (r'[\[\{\(][^\[\]\{\}\(\)]+[\]\}\)]',      ' _mytag_parentheses_ '),
]

for old_exp, new_exp in SPECIAL_CHARACTER_RULES:
    replace_regex(old_exp, new_exp, verbose=False)

In [30]:
# Two or more consecutive dots (dot dot dot) become a single space
old_exp, new_exp = '[\.][\.]+', ' '
replace_regex(old_exp, new_exp, verbose=False)

## 2. Word tokenize and PoS Tagging

In [31]:
# Abbreviation: AB, A.B, A.B. -> _a_b_ already done for frequent words

def tokenizer(corpus, verbose=False):
    """Word-tokenize `corpus` with TreebankWordTokenizer and PoS-tag the tokens.

    `verbose` is accepted but not used. The result of `pos_tag` on the token
    list is returned unchanged.
    """
    return pos_tag(TreebankWordTokenizer().tokenize(corpus))
    

# Store the tagged tokens of every title in a new 'pos' column
df0['pos'] = df0['title'].apply(tokenizer)
df1['pos'] = df1['title'].apply(tokenizer)

In [32]:
# For Naive Bayes model

# NOTE(review): within this cell `c` is only referenced by the commented-out
# print below, so this looks like a leftover debugging aid - confirm before
# removing. The lookup needs index label 10 to be present in df1.
c = df1.loc[10]

#print(c.title,'\n', c.pos)

def convert_pos(pos_only):
    """Map a PoS tag to the single-letter code passed to the lemmatizer.

    Only the first two characters of `pos_only` are inspected:
    'JJ' -> 'a', 'NN' -> 'n', 'RB' -> 'r', 'VB' -> 'v'.
    Any other tag, and any value that cannot be sliced or looked up
    (e.g. None), yields 'n'.
    """
    prefix_to_code = {'JJ': 'a', 'NN': 'n', 'RB': 'r', 'VB': 'v'}

    try:
        return prefix_to_code.get(pos_only[:2], 'n')
    except Exception:
        # Unusable input falls back to 'n'. `Exception` rather than a bare
        # `except`, so KeyboardInterrupt / SystemExit are not swallowed.
        return 'n'

def gen_organized_column(pos_tag_series):
    """Build cleaned-word, PoS and minimal-word strings for every tagged row.

    Parameters
    ----------
    pos_tag_series : pandas.Series
        One entry per row; each entry is an iterable of pairs whose item 0 is
        the token text and item 1 is its PoS tag.

    Returns
    -------
    tuple of three pandas.Series, each indexed like `pos_tag_series`, in this
    order:
        1. cleaned words  -- lower-cased token pieces, space-joined
        2. PoS tags       -- the tag of every kept token, space-joined
        3. minimal words  -- pieces that are not English stop words and are
           longer than two characters, lemmatized, space-joined

    Raises
    ------
    ValueError
        If the generated columns do not hold exactly one entry per input row.
        (Previously an error *string* was returned, which made the caller's
        three-way unpacking fail with an unrelated ValueError.)
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    # A run of word characters optionally followed by one character from
    # ['\w+], or a single one of : ; ! ? .  -- anything else is discarded.
    # Raw string with the same pattern value as before.
    rgx = re.compile(r"[\w]+['\w+]?|[\:\;\!\?\.]")

    col_words = []
    col_minimal_words = []
    col_pos = []

    for pos_tag_row in pos_tag_series:

        row_words = []
        row_minimal_words = []
        row_pos = []

        for pair in pos_tag_row:

            pieces = rgx.findall(pair[0].lower())
            pos = pair[1]

            # Skip tokens made only of special characters
            if not pieces:
                continue

            row_words.append(' '.join(pieces))
            row_pos.append(pos)

            # Minimal words: drop stop words and pieces of length <= 2.
            # The pieces are already lower-case, so no second .lower() is needed.
            kept = [x for x in pieces if x not in stop_words and len(x) > 2]

            if kept:
                row_minimal_words.append(
                    lemmatizer.lemmatize(' '.join(kept), convert_pos(pos))
                )

        col_words.append(' '.join(row_words))
        col_minimal_words.append(' '.join(row_minimal_words))
        col_pos.append(' '.join(row_pos))

    # Sanity check: one output entry per input row in every column.
    n_rows = len(pos_tag_series)
    if not (len(col_words) == len(col_minimal_words) == len(col_pos) == n_rows):
        raise ValueError('Error: array length does not match')

    return pd.Series(col_words, index=pos_tag_series.index), \
           pd.Series(col_pos, index=pos_tag_series.index), \
           pd.Series(col_minimal_words, index=pos_tag_series.index)
                

In [33]:
# Derive the three organized columns for each frame from its 'pos' column.
col_words0, col_pos0, col_minimal_words0 = gen_organized_column(df0['pos'])
col_words1, col_pos1, col_minimal_words1 = gen_organized_column(df1['pos'])

# df0: titles taken from df0_org (as-is and lower-cased), then the organized columns.
df0['org_title'] = df0_org['title']
df0['lower_title'] = df0_org['title'].str.lower()
df0['cleaned_words'] = col_words0
df0['cleaned_pos'] = col_pos0
df0['minimal_words'] = col_minimal_words0

# df1: same columns, in the same order, sourced from df1_org.
df1['org_title'] = df1_org['title']
df1['lower_title'] = df1_org['title'].str.lower()
df1['cleaned_words'] = col_words1
df1['cleaned_pos'] = col_pos1
df1['minimal_words'] = col_minimal_words1



In [34]:
# Spot-check ten random rows per frame. The fixed seed (the same value the
# notebook's other sampling helper uses) keeps the preview reproducible
# across re-runs.
display(df0.sample(10, random_state=20))
display(df1.sample(10, random_state=20))


Unnamed: 0,title,pos,cleaned_words,cleaned_pos,minimal_words,org_title,lower_title
12812,Libya says pushing to be removed from Trump travel ban list,"[(Libya, NNP), (says, VBZ), (pushing, VBG), (to, TO), (be, VB), (removed, VBN), (from, IN), (Trump, NNP), (travel, NN), (ban, NN), (list, NN)]",libya says pushing to be removed from trump travel ban list,NNP VBZ VBG TO VB VBN IN NNP NN NN NN,libya say push remove trump travel ban list,Libya says pushing to be removed from Trump travel ban list,libya says pushing to be removed from trump travel ban list
18156,Catalan pro-independence parties working on independence declaration: El Mundo,"[(Catalan, NNP), (pro-independence, NN), (parties, NNS), (working, VBG), (on, IN), (independence, NN), (declaration, NN), (:, :), (El, NNP), (Mundo, NNP)]",catalan pro independence parties working on independence declaration : el mundo,NNP NN NNS VBG IN NN NN : NNP NNP,catalan pro independence party work independence declaration mundo,Catalan pro-independence parties working on independence declaration: El Mundo,catalan pro-independence parties working on independence declaration: el mundo
10972,Civil rights activist DeRay McKesson running for mayor of Baltimore,"[(Civil, NNP), (rights, NNS), (activist, NN), (DeRay, NNP), (McKesson, NNP), (running, VBG), (for, IN), (mayor, NN), (of, IN), (Baltimore, NNP)]",civil rights activist deray mckesson running for mayor of baltimore,NNP NNS NN NNP NNP VBG IN NN IN NNP,civil right activist deray mckesson run mayor baltimore,Civil rights activist DeRay McKesson running for mayor of Baltimore,civil rights activist deray mckesson running for mayor of baltimore
10022,Bill Clinton defends wife's 'super predator' comment to protesters,"[(Bill, NNP), (Clinton, NNP), (defends, VBZ), (wife, NN), ('s, POS), ('super, JJ), (predator, NN), (', POS), (comment, NN), (to, TO), (protesters, NNS)]",bill clinton defends wife s super predator comment to protesters,NNP NNP VBZ NN POS JJ NN NN TO NNS,bill clinton defend wife super predator comment protester,Bill Clinton defends wife's 'super predator' comment to protesters,bill clinton defends wife's 'super predator' comment to protesters
6932,AmCham in China says new U.S. administration needs to get up to speed quickly,"[(AmCham, NNP), (in, IN), (China, NNP), (says, VBZ), (new, JJ), (U.S., NNP), (administration, NN), (needs, VBZ), (to, TO), (get, VB), (up, RP), (to, TO), (speed, VB), (quickly, RB)]",amcham in china says new u . s . administration needs to get up to speed quickly,NNP IN NNP VBZ JJ NNP NN VBZ TO VB RP TO VB RB,amcham china say new administration need get speed quickly,AmCham in China says new U.S. administration needs to get up to speed quickly,amcham in china says new u.s. administration needs to get up to speed quickly
3583,Russians discussed how to influence Trump via his aides: NYT,"[(Russians, NNS), (discussed, VBD), (how, WRB), (to, TO), (influence, VB), (Trump, NNP), (via, IN), (his, PRP$), (aides, NNS), (:, :), (NYT, NN)]",russians discussed how to influence trump via his aides : nyt,NNS VBD WRB TO VB NNP IN PRP$ NNS : NN,russian discuss influence trump via aide nyt,Russians discussed how to influence Trump via his aides: NYT,russians discussed how to influence trump via his aides: nyt
16174,Liberia's ruling party backs challenge to presidential result,"[(Liberia, NNP), ('s, POS), (ruling, VBG), (party, NN), (backs, NNS), (challenge, VBP), (to, TO), (presidential, JJ), (result, NN)]",liberia s ruling party backs challenge to presidential result,NNP POS VBG NN NNS VBP TO JJ NN,liberia rule party back challenge presidential result,Liberia's ruling party backs challenge to presidential result,liberia's ruling party backs challenge to presidential result
8068,"Biden warns Ukraine on reforms, says EU sanctions on Russia at risk","[(Biden, NNP), (warns, VBZ), (Ukraine, NNP), (on, IN), (reforms, NNS), (,, ,), (says, VBZ), (EU, NNP), (sanctions, NNS), (on, IN), (Russia, NNP), (at, IN), (risk, NN)]",biden warns ukraine on reforms says eu sanctions on russia at risk,NNP VBZ NNP IN NNS VBZ NNP NNS IN NNP IN NN,biden warn ukraine reform say sanction russia risk,"Biden warns Ukraine on reforms, says EU sanctions on Russia at risk","biden warns ukraine on reforms, says eu sanctions on russia at risk"
240,Former Trump adviser interviewed in Congress in Russia probe,"[(Former, JJ), (Trump, NNP), (adviser, NN), (interviewed, VBD), (in, IN), (Congress, NNP), (in, IN), (Russia, NNP), (probe, NN)]",former trump adviser interviewed in congress in russia probe,JJ NNP NN VBD IN NNP IN NNP NN,former trump adviser interview congress russia probe,Former Trump adviser interviewed in Congress in Russia probe,former trump adviser interviewed in congress in russia probe
11797,Romania senate backs bill seen in West as threat to judiciary,"[(Romania, NNP), (senate, NN), (backs, NNS), (bill, NN), (seen, VBN), (in, IN), (West, NNP), (as, IN), (threat, NN), (to, TO), (judiciary, VB)]",romania senate backs bill seen in west as threat to judiciary,NNP NN NNS NN VBN IN NNP IN NN TO VB,romania senate back bill see west threat judiciary,Romania senate backs bill seen in West as threat to judiciary,romania senate backs bill seen in west as threat to judiciary


Unnamed: 0,title,pos,cleaned_words,cleaned_pos,minimal_words,org_title,lower_title
13324,WATCH CLUELESS CLINTON SUPPORTERS Agree With Donald Trump Quotes _mytag_parentheses_,"[(WATCH, NNP), (CLUELESS, NNP), (CLINTON, NNP), (SUPPORTERS, NNP), (Agree, NNP), (With, IN), (Donald, NNP), (Trump, NNP), (Quotes, VBZ), (_mytag_parentheses_, NNS)]",watch clueless clinton supporters agree with donald trump quotes _mytag_parentheses_,NNP NNP NNP NNP NNP IN NNP NNP VBZ NNS,watch clueless clinton supporter agree donald trump quote _mytag_parentheses_,WATCH CLUELESS CLINTON SUPPORTERS Agree With Donald Trump Quotes [Video],watch clueless clinton supporters agree with donald trump quotes [video]
5462,This Is Who Will Be Speaking At The RNC – And It’s A Total Joke,"[(This, DT), (Is, VBZ), (Who, NNP), (Will, NNP), (Be, NNP), (Speaking, VBG), (At, IN), (The, DT), (RNC, NNP), (–, NNP), (And, CC), (It’s, NNP), (A, NNP), (Total, NNP), (Joke, NNP)]",this is who will be speaking at the rnc and it s a total joke,DT VBZ NNP NNP NNP VBG IN DT NNP CC NNP NNP NNP NNP,speak rnc total joke,This Is Who Will Be Speaking At The RNC – And It’s A Total Joke,this is who will be speaking at the rnc – and it’s a total joke
19167,BREAKING: TRUMP Announces “Phenomenal” Tax Cut Plan For Businesses In Next 2-3 Weeks…Stock Markets Respond _mytag_parentheses_,"[(BREAKING, NN), (:, :), (TRUMP, NNP), (Announces, NNP), (“Phenomenal”, NNP), (Tax, NNP), (Cut, NNP), (Plan, NNP), (For, IN), (Businesses, NNP), (In, IN), (Next, NNP), (2-3, JJ), (Weeks…Stock, NNP...",breaking : trump announces phenomenal tax cut plan for businesses in next 2 3 weeks stock markets respond _mytag_parentheses_,NN : NNP NNP NNP NNP NNP NNP IN NNP IN NNP JJ NNP NNPS NNP NN,breaking trump announces phenomenal tax cut plan business next weeks stock market respond _mytag_parentheses_,BREAKING: TRUMP Announces “Phenomenal” Tax Cut Plan For Businesses In Next 2-3 Weeks…Stock Markets Respond [VIDEO],breaking: trump announces “phenomenal” tax cut plan for businesses in next 2-3 weeks…stock markets respond [video]
1469,Trump Gives Newt Gingrich’s Wife Job As Vatican Ambassador In Latest Bit Of Overt Corruption,"[(Trump, NNP), (Gives, NNP), (Newt, NNP), (Gingrich’s, NNP), (Wife, NNP), (Job, NNP), (As, IN), (Vatican, NNP), (Ambassador, NNP), (In, IN), (Latest, NNP), (Bit, NNP), (Of, IN), (Overt, NNP), (Cor...",trump gives newt gingrich s wife job as vatican ambassador in latest bit of overt corruption,NNP NNP NNP NNP NNP NNP IN NNP NNP IN NNP NNP IN NNP NNP,trump give newt gingrich wife job vatican ambassador latest bit overt corruption,Trump Gives Newt Gingrich’s Wife Job As Vatican Ambassador In Latest Bit Of Overt Corruption,trump gives newt gingrich’s wife job as vatican ambassador in latest bit of overt corruption
4768,"WATCH: Trump Surrogate Tries To Spin Taco Truck Fiasco, Gets Laughed At _mytag_parentheses_","[(WATCH, NN), (:, :), (Trump, NNP), (Surrogate, NNP), (Tries, NNPS), (To, TO), (Spin, NNP), (Taco, NNP), (Truck, NNP), (Fiasco, NNP), (,, ,), (Gets, NNP), (Laughed, NNP), (At, IN), (_mytag_parenth...",watch : trump surrogate tries to spin taco truck fiasco gets laughed at _mytag_parentheses_,NN : NNP NNP NNPS TO NNP NNP NNP NNP NNP NNP IN NNP,watch trump surrogate try spin taco truck fiasco get laughed _mytag_parentheses_,"WATCH: Trump Surrogate Tries To Spin Taco Truck Fiasco, Gets Laughed At (VIDEO)","watch: trump surrogate tries to spin taco truck fiasco, gets laughed at (video)"
17621,BALL FAMILY MEMBER Reveals SICK Reason LaVar Ball Allegedly Wouldn’t Let His Sons See Their Mother After “Surgery To Remove Portion of Her Skull”,"[(BALL, NNP), (FAMILY, NNP), (MEMBER, NNP), (Reveals, NNP), (SICK, NNP), (Reason, NNP), (LaVar, NNP), (Ball, NNP), (Allegedly, RB), (Wouldn’t, NNP), (Let, NNP), (His, PRP$), (Sons, NNP), (See, NNP...",ball family member reveals sick reason lavar ball allegedly wouldn t let his sons see their mother after surgery to remove portion of her skull,NNP NNP NNP NNP NNP NNP NNP NNP RB NNP NNP PRP$ NNP NNP NNP NNP IN NN TO NNP NNP IN NNP NNP,ball family member reveals sick reason lavar ball allegedly let son see mother surgery remove portion skull,BALL FAMILY MEMBER Reveals SICK Reason LaVar Ball Allegedly Wouldn’t Let His Sons See Their Mother After “Surgery To Remove Portion of Her Skull”,ball family member reveals sick reason lavar ball allegedly wouldn’t let his sons see their mother after “surgery to remove portion of her skull”
16911,BARACK OBAMA Finds Friend In “Fundamental Transformation Of America”: Shocking Way Ryan Betrayed Americans With 1.1 Trillion Bill,"[(BARACK, NNP), (OBAMA, NNP), (Finds, NNP), (Friend, NNP), (In, IN), (“Fundamental, JJ), (Transformation, NN), (Of, IN), (America”, NNP), (:, :), (Shocking, VBG), (Way, NNP), (Ryan, NNP), (Betraye...",barack obama finds friend in fundamental transformation of america : shocking way ryan betrayed americans with 1 . 1 trillion bill,NNP NNP NNP NNP IN JJ NN IN NNP : VBG NNP NNP NNP NNPS IN CD NNP NNP,barack obama find friend fundamental transformation america shock way ryan betrayed american trillion bill,BARACK OBAMA Finds Friend In “Fundamental Transformation Of America”: Shocking Way Ryan Betrayed Americans With $1.1 Trillion Bill,barack obama finds friend in “fundamental transformation of america”: shocking way ryan betrayed americans with $1.1 trillion bill
8907,The Daily Show Puts Vile Fox Host In Her Place For Accusing President Obama Of Faking Tears _mytag_parentheses_,"[(The, DT), (Daily, NNP), (Show, NNP), (Puts, NNP), (Vile, NNP), (Fox, NNP), (Host, NNP), (In, IN), (Her, NNP), (Place, NNP), (For, IN), (Accusing, NNP), (President, NNP), (Obama, NNP), (Of, IN), ...",the daily show puts vile fox host in her place for accusing president obama of faking tears _mytag_parentheses_,DT NNP NNP NNP NNP NNP NNP IN NNP NNP IN NNP NNP NNP IN NNP NNP VBD,daily show put vile fox host place accusing president obama faking tear _mytag_parentheses_,The Daily Show Puts Vile Fox Host In Her Place For Accusing President Obama Of Faking Tears (VIDEO),the daily show puts vile fox host in her place for accusing president obama of faking tears (video)
11457,IRONY ALERT! DC’S DAY WITHOUT WOMEN Literally Led By A Man…Event Turns Into Anti-Trump Rally: “He is wrong. We have to stop him.” _mytag_parentheses_,"[(IRONY, NNP), (ALERT, NNP), (!, .), (DC’S, NNP), (DAY, NNP), (WITHOUT, NNP), (WOMEN, NNP), (Literally, NNP), (Led, NNP), (By, IN), (A, NNP), (Man…Event, NNP), (Turns, NNP), (Into, NNP), (Anti-Tru...",irony alert ! dc s day without women literally led by a man event turns into anti trump rally : he is wrong . we have to stop him . _mytag_parentheses_,NNP NNP . NNP NNP NNP NNP NNP NNP IN NNP NNP NNP NNP NNP NNP : NN VBZ IN PRP VBP TO VB NN NN,irony alert day without woman literally led man event turn anti trump rally wrong stop _mytag_parentheses_,IRONY ALERT! DC’S DAY WITHOUT WOMEN Literally Led By A Man…Event Turns Into Anti-Trump Rally: “He is wrong. We have to stop him.” [Video],irony alert! dc’s day without women literally led by a man…event turns into anti-trump rally: “he is wrong. we have to stop him.” [video]
2850,CNN Smacks Trump With Inauguration Day Ratings Facts After He Praises Fox News,"[(CNN, NNP), (Smacks, NNP), (Trump, NNP), (With, IN), (Inauguration, NNP), (Day, NNP), (Ratings, NNP), (Facts, NNP), (After, IN), (He, PRP), (Praises, VBZ), (Fox, NNP), (News, NNP)]",cnn smacks trump with inauguration day ratings facts after he praises fox news,NNP NNP NNP IN NNP NNP NNP NNP IN PRP VBZ NNP NNP,cnn smack trump inauguration day rating fact praise fox news,CNN Smacks Trump With Inauguration Day Ratings Facts After He Praises Fox News,cnn smacks trump with inauguration day ratings facts after he praises fox news


In [36]:
# Persist both organized frames as CSV, without writing the index column.
df0.to_csv(path_or_buf='data/TrueOrganized.csv', index=False)
df1.to_csv(path_or_buf='data/FakeOrganized.csv', index=False)

# Conclusion
We finished cleaning and organizing the titles. The organized frames are saved to `data/TrueOrganized.csv` and `data/FakeOrganized.csv`.