# Organize with EDA
I'm taking a look only for title, but eventually, both title and text will be used.
- For future: mind dots (handle dots without hurting sentence tokenization) in text
- Ignore comment about text for now. 

In [127]:
from freq_utils import *
from nltk import pos_tag
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.help import upenn_tagset
from nltk.tokenize import TreebankWordTokenizer

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag, RegexpParser
from nltk.corpus import stopwords, wordnet

from collections import Counter
import time

pd.options.display.max_colwidth = 200

In [2]:
df0_org = pd.read_csv('True.csv')
df1_org = pd.read_csv('Fake.csv')

In [3]:
# Drop data we don't use (from eda_raw.ipynb)
df0_org.drop(['text','subject','date'], axis=1, inplace=True)
df1_org.drop(['text','subject','date'], axis=1, inplace=True)

df0_org.drop_duplicates()
df1_org.drop_duplicates()

df1_org = df1_org[df1_org.title.str.split().str.len()>2]
#df0 = df0[df0.text.str.split().str.len()>19]

In [4]:
# To compare modification result
df0 = df0_org
df1 = df1_org

In [5]:
def print_sentences_with_this_string(this_string, column_to_look, df_list, df_names, 
                                     print_words=False, print_set=False, sent_token=False):
    
    n_dataFrame = len(df_list)
    
    pat = re.compile(this_string)
    
    set_list = []
    
    for i in range(n_dataFrame):
        df = df_list[i]
        df = df[df[column_to_look].str.contains(this_string, regex= True, na=False)]
        
        count = df[column_to_look].count()
        
        
        print(this_string,'in',column_to_look,'\n',df_names[i],':',count)
        
        if count==0:
            continue
        
        
        if print_set:
            df = df.sample(min(len(df),1000), random_state=20)
        else:
            df = df.sample(min(len(df),20), random_state=20)
        
        corpus_list = df[column_to_look].to_numpy()
        index_list  = df.index.to_numpy()
                
        example_df = pd.DataFrame(columns=['index','selected_text','selected_words'])
        
      
        for row in range(len(index_list)):
            
            if sent_token:
                sentences = sent_tokenize(corpus_list[row])
            
                display_text = ''
                display_word = []
            
                for sentence in sentences:
                
                    if pat.search(sentence):
                        display_text += sentence+' '
                        display_word += pat.findall(sentence)
                                        
                example_df.loc[row] = [index_list[row],display_text,display_word]
            else:
                if pat.search(corpus_list[row]):
               
                    display_text = corpus_list[row]
                    display_word = pat.findall(display_text)
                    example_df.loc[row] = [index_list[row],display_text,display_word]
                
            
            
        example_df.set_index('index')
        
        if print_set:    
            #word_set = set()
            word_set = list()
            
            lst_list = list(example_df.selected_words)
            
            for lst in lst_list:
                word_set += lst
                
            #print(word_set)
            
            word_counter = Counter(word_set)
            print(word_counter.most_common(200))
        

        if not print_words:
            example_df.drop(['selected_words'], axis=1, inplace=True)


        
        display(example_df.sample(min(len(df),20), random_state=20))
        
        if print_set:
            set_list.append(word_set)
            
    if print_set:
        return set_list

# Digital source

In [6]:
%%script false --no-raise-error


print_sentences_with_this_string('[^\s]*[@]+[^\s]*',        
                                 'title', [df0,df1], ['True','Fake'], print_words=True, print_set=True)

print_sentences_with_this_string('[^\s]*//[^\s]+[.][^\s]+', 
                                 'title', [df0,df1], ['True','Fake'], print_words=True, print_set=True)

### Findings
- Overall, **fake news quote digital sources much more frequently** than real news. 

#### @ in title
- Real news: only one news has it, when its **topic is about social media account**
- Fake news: some are about **social media account**, but some are **slangs** (used like "*")

#### @ in text
- Both real and fake news have @ to **refer social media accounts**.
- 20 times **more frequently** used in fake news

#### website in text
- **Few real news** contains the website address in this dataset.
- **A lot of fake news** contains website address. Examples of them were CNN news, Facebook, and YouTube address.

### Processing
- There are only a few rows, so let's simply change **@ to _**

### Replace them to tags

In [7]:
df0 = df0.replace(to_replace='@', value='_mytag_at_', regex=True, inplace=False)
df1 = df1.replace(to_replace='@', value='_mytag_at_', regex=True, inplace=False)

# Slang

In [8]:
%%script false --no-raise-error


print_sentences_with_this_string('[^\s]*[\*]+[^\s]*', 'title', [df0,df1], ['True','Fake'], print_words=True, print_set=True)

### Findings
- No real news has * in titles.
- Some **fake news** have * in **title** to display **slangs**.
- Both real and fake news have **\* in texts**, 14 times **frequently occur in fake news**.
- When * is used **in the text**, it is **not always for slangs** (e.g. to emphasize). 
- It is not **hard to separate** usage of star between **slang and highlighting** based on text pattern.
- I'll mark both of those works as `_mytag_slang_` since they have a common meaning and function, **highlighting, anyway.

### Processing
- Tag words contain * as **slang** (only for title)

In [9]:
# replace works with *
df0.replace(to_replace='[^\s]*[\*]+[^\s]*', value='_mytag_slang_', regex=True, inplace=True)
df1.replace(to_replace='[^\s]*[\*]+[^\s]*', value='_mytag_slans_', regex=True, inplace=True)

# Other special characters
As seen from the slang character tagging, some special character replaces an alphabet character, therefore, blindly removing all special characters might leave some words meaningless.
Let's check how the other special characters are used.

In [10]:
%%script false --no-raise-error

sc_title = print_sentences_with_this_string('[^\s\w]', 'title', [df0,df1], ['True','Fake'], print_words=True, print_set=True)

In [11]:
%%script false --no-raise-error

c0 = Counter(sc_title[0])
c1 = Counter(sc_title[1])

print(c0)
print(c1)

print(set([x[0] for x in c0.most_common(5)]))


sc_fake_only = set([x[0] for x in c1]) - set([x[0] for x in c0.most_common(5)])


print(sc_fake_only)

### Findings
Fake news are more noisy having more kind of special characters. Special characters not used in real news might have a special function. Let's take a look.

In [12]:
%%script false --no-raise-error

for x in c0:
    
    #sc_regex = '[\\' + x + ']'
    sc_regex = '[^\s]*[\\' + x + '][^\s]*'

    print(x)
    print_sentences_with_this_string(sc_regex, 'title', [df0, df1], ['Real','Fake'], print_words=True, print_set=True)

In [13]:
%%script false --no-raise-error

for x in sc_fake_only:
    
    #sc_regex = '[\\' + x + ']'
    sc_regex = '[^\s]*[\\' + x + '][^\s]*'

    print(x)
    print_sentences_with_this_string(sc_regex, 'title', [df1], ['Fake'], print_words=True, print_set=True)

### Findings
Usages
- \# : hashtag, tv show episode, website address 
- % : percent, not used in real news, interestingly
- -- (longer than a hyphen) : hyphen, some slang but ignorable
- ! : exclamation, not used in real news, but remove it in case real news happen to have an exclamation mark is classified as fake
- [] (): clickbait, emphasis
- } : seems a typo of ] in case of title, in text, it looks like a script. Token with } in text better be removed.
- & : and or special words (e.g. Q&A, AT&T)
- \$ : dollor or slang
- \/ : 9/11, 24/7, or clickbait (e.g. video/image)

### Note for processings

- \/ : **replace to a space*
- [] () {} : **remove with enclosed text** to avoid a strong bias of this dataset
- : : **replace to a space** if it is between two numbers (time), **replace to ;** otherwise 
- ;, ... : **replace to \.**
- Abc. (abbreviation has one dot at the end): **remove dot**

Tokenize sentence. Then

- — : **replace to -** then do the same as below
- \- : **leave it** if it is hyphen (between two words without space), **remove** otherwise
- \$ : **remove** if followed by a number, **replace to \_** otherwise (slangs)
- \& : **replace to "and"** if spaced, **replace to \_ otherwise"
- \% : **replace to " percent "**

- \# : remove (words are either special noun or number)
- !, ?, , : remove
- \" : remove

Tokenize word. Then
- \' : remove
- handle abbreviation

# Capital letters

In [14]:
%%script false --no-raise-error

print_sentences_with_this_string('[\s^\w][A-Z][^\s]+', 'title', 
                                 [df0,df1], ['True','Fake'], print_words=True, print_set=True)

### Findings
As from EDA, words with capicalized first characters are proper nouns. Name entity recognition would recognize some of them (e.g. "Trump"), but some woudn't (e.g. "White", "House"). However, bigram or trigram would catch such case.

In [15]:
%%script false --no-raise-error

title0 = df0.sample(1000, random_state=9).title.tolist()
title1 = df1.sample(1000, random_state=9).title.tolist()

# Make a bigram list
# Subtract trigram
bi_real = ngram_tokenizer(title0, n=2)
bi_fake = ngram_tokenizer(title1, n=2)
tr_real = ngram_tokenizer(title0, n=3)
tr_fake = ngram_tokenizer(title1, n=3)
qd_real = ngram_tokenizer(title0, n=4)
qd_fake = ngram_tokenizer(title1, n=4)

def combine_uppercase_words(lst):

    rg = re.compile('[A-Z]')
    
    co = Counter(lst)  
    ngram_lst  = [x[0] for x in list(co.most_common(100))]
    
    lst_combine=[]

    for x in ngram_lst:
        # x: bigram or trigram tuple
        ngram = len(x)
        
        if set([bool(rg.search(x[i][0])) for i in range(ngram)])=={True}:
            lst_combine.append(x)
    
    return ngram_lst, lst_combine
            
print(combine_uppercase_words(bi_real),'\n')            
print(combine_uppercase_words(tr_real),'\n')            
print(combine_uppercase_words(qd_real),'\n')            
print(combine_uppercase_words(bi_fake),'\n')            
print(combine_uppercase_words(tr_fake),'\n')            
print(combine_uppercase_words(qd_fake),'\n') 

bi = combine_uppercase_words(bi_real)[1]
tr = combine_uppercase_words(tr_real)[1]

for t in tr:
    for b in bi:
        if set(b).issubset(set(t)):
            print(b, t)

### Note for preprocessings
- Combine \<Capital start\> + \<Capical start\> words (e.g. White+House, North+Korea, Puerto+Rico, Hong+Kong)

# Abbreviation

In [16]:
%%script false --no-raise-error

# US, USA, UN, UK... -> Add two "_" at the end
print_sentences_with_this_string('[A-Z][A-Z]+[\.]?', 'title', 
                                 [df0,df1], ['True','Fake'], print_words=True, print_set=True)

In [17]:
%%script false --no-raise-error

# Abbreviation with a space between, like U. S.?
print_sentences_with_this_string('[A-Z][\w]*[.][\s][A-Z][\w]*[.]', 'title', [df0,df1], ['True','Fake'], print_words=True, print_set=True)

In [18]:
%%script false --no-raise-error

# Abbreviation with lower cases?
print_sentences_with_this_string('[a-z][\w]*[.][a-z][\w]*[.][^\s]*', 'title', [df0,df1], ['True','Fake'], print_words=True, print_set=True)

In [19]:
%%script false --no-raise-error

# U.S., Dr.
print_sentences_with_this_string('[A-Z][\w]*[\.][\w\.]*', 'title', 
                                 [df0,df1], ['True','Fake'], print_words=True, print_set=True)

In [20]:
%%script false --no-raise-error

# single character words
print_sentences_with_this_string('[\s]+[A-Z][\.][\s]+', 'title', 
                                 [df0,df1], ['True','Fake'], print_words=True, print_set=True)

In [21]:
%%script false --no-raise-error

# two character words
print_sentences_with_this_string('[\s]+[A-Z][\.]?[A-Z][\.]?[\s]+', 'title', 
                                 [df0,df1], ['True','Fake'], print_words=True, print_set=True)

In [22]:
%%script false --no-raise-error

# Words have only one dot at the end
print_sentences_with_this_string('[A-Z][\w]+[\.][\s]+', 'title', 
                                 [df0,df1], ['True','Fake'], print_words=True, print_set=True)

In [23]:
%%script false --no-raise-error

# Check other dot examples
print_sentences_with_this_string('[^\s]*[\.][^\s]*', 'title', 
                                 [df0,df1], ['True','Fake'], print_words=True, print_set=True)

### Findings

- The dot in any abbreviation will interrupt sentence tokenization.
    - Abbreviation at the end of a sentence
        - 'I'm in the **U.S.** That is a news!' tokenized into **one** sentence.
        - 'I'm in the **Dept.** That is a news!' tokenized into **two** sentences.
    - Abbreviation in the middle of a sentence
        - 'I'm in the **U.S.** now!' tokenized into **one** sentence.
        - 'I'm in the **Dept.** now!' tokenized into **two** sentences.

- Due to **irregular capitalization/formatting rules in fake news**, distinguish these words relying on text format **without context** seems **impossible**.
- We can ignore possible spaces between abbreviation because they didn't happen (e.g. U. S.).
- May (month) vs may (modal verb) vs May (name) is hard to distinguish. I'll leave it up to the learning a context.

Here is a note about most frequent words.


#### Bigram/Trigram
- Some words start with upper case should be combined to have meaning (e.g. White+House, North+Korea, Puerto+Rico, Hong+Kong)
- No point to use fake news bi/trigram to replace Capital+Capical words because it is full of noise.
- For 'House', 'Speaker' and 'Ryan', 'Ryan' should be separated.
- For now, let's leave it up to learning

#### Examples of words that should be recognized as a same word
- **US, U.S, U.S. or U.S.A.**: variation of the United States, comes from fake news or typo. 

#### Examples of words that should be recognized as distinct words
- **PM (Prime Minister)**, **P.M. (Post Meridiem)**, and **p.m. (post meridiem)**
- **IS (Islamic State)** and **is (be verb)**
- **No. (number)** and **no (opposite of yes)**
- **IT (information technology) ** and **it (pronoun)**

#### Single or two characters words
- They can be removed by **stop word** removal. 
- Most of **single letters** for middle name except **N. in North Korea**. It's ok to drop middle names.
- For **two characters** words, add extra \_ at the end in order not to be disappeared.

#### Words have only one dot at the end
- These words can distort sentence tokenization.
- They are followed by a proper noun (**Dr., Sen., Jr.**), which is ok to be **removed**, or name of month (**Jul.**). Either case, **removing dot** would be enough.

### Note for preprocessings
- Change **N. Korea**, **N.Korea** to **North Korea**
- Combine \<Capital start\> + \<Capical start\> words (e.g. White+House, North+Korea, Puerto+Rico, Hong+Kong)
- Change abbreviations
    - U.N. : \_u_n_
    - Rep. : Rep
    - Sept. : Sep
    - Sen. : Sen
    - Gov. : Gov

# Preprocessings

#### 1. Least interfering preprocessings

1. Replace N. Korea, N.Korea to North Korea (frequently occuring topic)
2. Remove single letter capical word (e.g. middle name)
3. Abbreviation
    - Remove a dot at the end of a Month word starts with an upper case. (Jul. -> Jul)
    - No. to number
    - U.N. : \_u_n_
    - Rep. : Rep
    - Sept. : Sep
    - Sen. : Sen
    - Gov. : Gov
    - PM : \_p_m_
    - P.M. (Post Meridiem), and p.m. (post meridiem): \_mytag_pm_ (same for a.m.)
    - US, U.S, U.S. or U.S.A.: \_u_s_
4. Special characters 
    - / : a space
    - [] () {} : \_mytag_parentheses_
    - $ : remove (not care about a few usages for slang)
    - & : replace to "and" if spaced, replace to an underbar otherwise
    - % : replace to " percent "
    - \# : replace to an underbar  (words are either special noun or number)
5. Special characters (after abbreviation handling)
    - ... : a space
    - : : ~replace to a space if it is between two numbers (time), replace to . otherwise~ leave it to tokenizer
    - ; : ~replace to \.~ leave it to tokenizer


#### ~2. After sentence tokenization~ No sentenct tokenization for title
1. ~Bigram: words that have different meaning if used alone: Combine \<Capital start\> + \<Capical start\> words (e.g. White+House, North+Korea, Puerto+Rico, Hong+Kong) with an underbar, keep capicalization~ leave it to learning
        


#### 3. After word tokenization
1. Special characters
    - — (en dash? em dash?): replace to - (hyphen) then do the same as below
    - \- : replace it to an underbar if it is hyphen (between two words without space), remove otherwise
    - !, ?, , : remove
    - \" : remove
2. Abbreviation: AB, A.B -> \_a_b_


#### 4. After PoS tagging
    - \' : remove    
    - Uncapitalization

<Text items >
<Remove news id (location, reuters) >
<Replace month abbreviations>
<july, jul., Jul, Jul., July : \_mytag_month_july_ >
<May: \_mytag_month_may_ >


## Least interfering preprocessings
Includes handlings for better sentence tokenization

In [24]:
def replace_regex(old_exp, new_exp='', verbose=True):
    
    if verbose:
        print_sentences_with_this_string(old_exp, 'title', [df0,df1], ['True','Fake'], print_words=True, print_set=True)
    
    if not new_exp=='':
        df0.replace(to_replace=old_exp, value=new_exp, regex=True, inplace=True)
        df1.replace(to_replace=old_exp, value=new_exp, regex=True, inplace=True)
        if verbose:
            print(old_exp,'replaced to',new_exp)
            print_sentences_with_this_string(new_exp, 'title', [df0,df1], ['True','Fake'], print_words=True, print_set=True)


In [25]:
# Replace N. Korea, N.Korea to North Korea
old_exp = '(?:[\s]|^)[N][\.][\s]?Korea'
new_exp = ' North Korea'
replace_regex(old_exp, new_exp, verbose=False)

In [26]:
# Remove single letter capical word (e.g. middle name)
old_exp = '(?:[\s]|^)[A-Z][\.](?:[\s]|$)'
new_exp = ' '
replace_regex(old_exp, new_exp, verbose=False)


In [27]:
# No. to No
# U.N. : _u_n_
# Rep. : Rep
# Sept. : Sep
# Sen. : Sen
# Gov. : Gov
# PM : \_p_m_
# **P.M. (Post Meridiem)**, and **p.m. (post meridiem)**: \_mytag_pm_ (same for a.m.)
# **US, U.S, U.S. or U.S.A.**: \_u_s_

      
#old_exp = '(?:[\s]|^)[N][o][\.]'
#new_exp = ' No '
old_exp = '(?:[\s]|^)[N][o][\.]'
new_exp = ' No. '
replace_regex(old_exp, new_exp, verbose=False)

old_exp = '(?:[\s]|^)[U][\.]?[N][\.]?(?:[\s]|$)'
new_exp = ' _u_n_ '
#new_exp = ' UN '
replace_regex(old_exp, new_exp, verbose=False)

old_exp = '(?:[\s]|^)[Rr][Ee][Pp][\.]'
new_exp = ' Rep '
replace_regex(old_exp, new_exp, verbose=False)

old_exp = '(?:[\s]|^)[Ss][Ee][Pp][Tt][\.]'
new_exp = ' Sept '
replace_regex(old_exp, new_exp, verbose=False)

old_exp = '(?:[\s]|^)[Ss][Ee][Nn][\.]'
new_exp = ' Sen '
replace_regex(old_exp, new_exp, verbose=False)

old_exp = '(?:[\s]|^)[Gg][Oo][Vv][\.]'
new_exp = ' Gov '
replace_regex(old_exp, new_exp, verbose=False)

old_exp = '(?:[\s]|^)[P][M](?:[\s]|$)'
new_exp = ' _p_m_ '
#new_exp = ' PM '
replace_regex(old_exp,new_exp, verbose=False)

old_exp = '(?:[\s]|^)[Pp][\.][Mm][\.](?:[\s]|$)'
new_exp = ' _mytag_pm_ '
#new_exp = ' p.m. '
replace_regex(old_exp,new_exp, verbose=False)

old_exp = '(?:[\s]|^)[Aa][\.][Mm][\.](?:[\s]|$)'
new_exp = ' _mytag_am_ '
#new_exp = ' a.m. '
replace_regex(old_exp,new_exp, verbose=False)

old_exp = '(?:[\s]|^)[U][\.]?[S][\.]?[A]?[\.]?(?:[\s]|$)'
new_exp = ' _u_s_ '
#new_exp = ' U.S. '
replace_regex(old_exp,new_exp, verbose=False)  

In [28]:
# Remove a dot at the end of a word starts with an upper case.
old_exp = '(?:[\s]|^)[Jj][Aa][Nn][\.]'
new_exp = ' Jan '
replace_regex(old_exp, new_exp, verbose=False)

old_exp = '(?:[\s]|^)[Ff][Ee][Bb][\.]'
new_exp = ' Feb '
replace_regex(old_exp, new_exp, verbose=False)

old_exp = '(?:[\s]|^)[Mm][Aa][Rr][\.]'
new_exp = ' Mar '
replace_regex(old_exp, new_exp, verbose=False)

old_exp = '(?:[\s]|^)[Pp][Rr][\.]'
new_exp = ' Apr '
replace_regex(old_exp, new_exp, verbose=False)

old_exp = '(?:[\s]|^)[Jj][Uu][Nn][\.]'
new_exp = ' Jun '
replace_regex(old_exp, new_exp, verbose=False)

old_exp = '(?:[\s]|^)[Jj][Uu][Ll][\.]'
new_exp = ' Jul '
replace_regex(old_exp, new_exp, verbose=False)

old_exp = '(?:[\s]|^)[Aa][Uu][Gg][\.]'
new_exp = ' Aug '
replace_regex(old_exp, new_exp, verbose=False)

old_exp = '(?:[\s]|^)[Ss][Ee][Pp][\.]'
new_exp = ' Sep '
replace_regex(old_exp, new_exp, verbose=False)

old_exp = '(?:[\s]|^)[Oo][Cc][Tt][\.]'
new_exp = ' Oct '
replace_regex(old_exp, new_exp, verbose=False)

old_exp = '(?:[\s]|^)[Nn][Oo][Vv][\.]'
new_exp = ' Nov '
replace_regex(old_exp, new_exp, verbose=False)

old_exp = '(?:[\s]|^)[Dd][Ee][Cc][\.]'
new_exp = ' Dec '
replace_regex(old_exp, new_exp, verbose=False)

In [29]:
# / : a space
# $ : **remove** (not care about a few usages for slang)
# % : **replace to " percent "**
# # : **replace to an underbar**  (to tag proper noun, words are either special noun or number)
# & : **replace to "and"** if spaced, **replace to an underbar otherwise"

# [] () {} : \_mytag_parentheses_


old_exp = '[\/]'
new_exp = ' '
replace_regex(old_exp,new_exp, verbose=False)

old_exp = '[\$]'
new_exp = ' '
replace_regex(old_exp,new_exp, verbose=False)

old_exp = '[\%]'
new_exp = ' percent '
replace_regex(old_exp,new_exp, verbose=False)

old_exp = '[\#]'
new_exp = '_'
replace_regex(old_exp,new_exp, verbose=False)

old_exp = '[\s][\&][\s]'
new_exp = ' and '
replace_regex(old_exp,new_exp, verbose=False)

old_exp = '[\&]'
new_exp = '_'
replace_regex(old_exp,new_exp, verbose=False)

old_exp = '[\[\{\(][\s]?[\w]+[\s]?[\]\}\)]'
new_exp = ' _mytag_parentheses_ '
replace_regex(old_exp,new_exp, verbose=False)

In [30]:
# dot dot dot
old_exp = '[\.][\.]+'
new_exp = ' '
replace_regex(old_exp, new_exp, verbose=False)

## 2. Word tokenize and PoS Tagging

In [126]:
# Abbreviation: AB, A.B, A.B. -> _a_b_ already done for frequent words

def tokenizer(corpus, verbose=False):

    tb_tokenizer = TreebankWordTokenizer()
    
    words = tb_tokenizer.tokenize(corpus)
    
    pos = pos_tag(words)
        
    return pos
    

df0['pos'] = df0.title.apply(tokenizer)
df1['pos'] = df1.title.apply(tokenizer)

In [185]:
# For Naive Bayse model

c = df1.loc[10]

#print(c.title,'\n', c.pos)

def convert_pos(pos_only):
    
    #word_pos = pos_tag([word])

    tag = ''
    try:
        tag = pos_only[:2]
    except:
        tag = 'n'
    
    if tag == 'JJ':
        tag = 'a'
    elif tag == 'NN':
        tag = 'n'
    elif tag == 'RB':
        tag = 'r'
    elif tag == 'VB':
        tag = 'v'
    else:
        tag = 'n'
        
    return tag

def gen_organized_column(pos_tag_series):
    
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    rgx = re.compile('[\w]+[\'\w+]?|[\:\;\!\?]')
    
    col_words = []
    col_minimal_words = []
    col_pos = []

    for pos_tag_row in pos_tag_series:
        
        words_list = []
        minimal_words_list = []
        pos_list = []

        for pair in pos_tag_row:
    
            token_list = rgx.findall(pair[0].lower())
            pos = pair[1]
    
            # skip special character token
            if not bool(token_list):
                continue
            
            
            token = ' '.join(token_list)
            
            #print(token,pos)

            words_list.append(token)
            pos_list.append(pos)
            
            
            # Minimal words
            token_list = [x for x in token_list if not x in stop_words and len(x)>2]
            
            
            if bool(token_list):
                token = ' '.join(token_list)
                token = token.lower()
                minimal_words_list.append(lemmatizer.lemmatize(token, convert_pos(pos)))
        
        col_words.append(words_list)
        col_minimal_words.append(minimal_words_list)
        col_pos.append(pos_list)
        
    if not len(pos_tag_series)==len(col_words) or \
        not len(pos_tag_series)==len(col_minimal_words) or \
        not len(pos_tag_series)==len(col_pos):
        
        return 'Error: array length does not match'
    else:
        return  pd.Series(col_words, index=pos_tag_series.index), \
                pd.Series(col_minimal_words, index=pos_tag_series.index), \
                pd.Series(col_pos, index=pos_tag_series.index)

In [186]:
col_words0, col_minimal_words0, col_pos0 = gen_organized_column(df0.pos)
col_words1, col_minimal_words1, col_pos1 = gen_organized_column(df1.pos)

df0['cleaned_words'] = col_words0
df0['minimal_words'] = col_minimal_words0
df0['cleaned_pos'] = col_pos0

df1['cleaned_words'] = col_words1
df1['minimal_words'] = col_minimal_words1
df1['cleaned_pos'] = col_pos1

df0['org_title'] = df0_org.title
df1['org_title'] = df1_org.title

In [187]:
display(df0.sample(10))
display(df1.sample(10))


Unnamed: 0,title,pos,cleaned_words,minimal_words,cleaned_pos,org_title
16151,Kurdish oil flows to Turkey resume after technical stoppage: shipping source,"[(Kurdish, JJ), (oil, NN), (flows, VBZ), (to, TO), (Turkey, NNP), (resume, NN), (after, IN), (technical, JJ), (stoppage, NN), (:, :), (shipping, NN), (source, NN)]","[kurdish, oil, flows, to, turkey, resume, after, technical, stoppage, :, shipping, source]","[kurdish, oil, flow, turkey, resume, technical, stoppage, shipping, source]","[JJ, NN, VBZ, TO, NNP, NN, IN, JJ, NN, :, NN, NN]",Kurdish oil flows to Turkey resume after technical stoppage: shipping source
21001,Venezuelan President Maduro will not go to _u_n_ rights forum,"[(Venezuelan, NNP), (President, NNP), (Maduro, NNP), (will, MD), (not, RB), (go, VB), (to, TO), (_u_n_, VB), (rights, NNS), (forum, NN)]","[venezuelan, president, maduro, will, not, go, to, _u_n_, rights, forum]","[venezuelan, president, maduro, _u_n_, right, forum]","[NNP, NNP, NNP, MD, RB, VB, TO, VB, NNS, NN]",Venezuelan President Maduro will not go to U.N. rights forum
1679,Anti-Assad nations say no to Syria reconstruction until political process on track,"[(Anti-Assad, JJ), (nations, NNS), (say, VBP), (no, DT), (to, TO), (Syria, NNP), (reconstruction, NN), (until, IN), (political, JJ), (process, NN), (on, IN), (track, NN)]","[anti assad, nations, say, no, to, syria, reconstruction, until, political, process, on, track]","[anti assad, nation, say, syria, reconstruction, political, process, track]","[JJ, NNS, VBP, DT, TO, NNP, NN, IN, JJ, NN, IN, NN]",Anti-Assad nations say no to Syria reconstruction until political process on track
15718,"Fake meat, free markets ease North Koreans' hunger","[(Fake, NNP), (meat, NN), (,, ,), (free, JJ), (markets, NNS), (ease, VBP), (North, NNP), (Koreans, NNP), (', POS), (hunger, NN)]","[fake, meat, free, markets, ease, north, koreans, hunger]","[fake, meat, free, market, ease, north, korean, hunger]","[NNP, NN, JJ, NNS, VBP, NNP, NNP, NN]","Fake meat, free markets ease North Koreans' hunger"
1085,Abadi defends role of Iranian-backed paramiltaries at meeting with Tillerson,"[(Abadi, NNP), (defends, VBZ), (role, NN), (of, IN), (Iranian-backed, JJ), (paramiltaries, NNS), (at, IN), (meeting, VBG), (with, IN), (Tillerson, NNP)]","[abadi, defends, role, of, iranian backed, paramiltaries, at, meeting, with, tillerson]","[abadi, defend, role, iranian backed, paramiltaries, meet, tillerson]","[NNP, VBZ, NN, IN, JJ, NNS, IN, VBG, IN, NNP]",Abadi defends role of Iranian-backed paramiltaries at meeting with Tillerson
6906,_u_s_ top court weighs race challenges to legislative districts,"[(_u_s_, JJ), (top, JJ), (court, NN), (weighs, VBD), (race, NN), (challenges, NNS), (to, TO), (legislative, JJ), (districts, NNS)]","[_u_s_, top, court, weighs, race, challenges, to, legislative, districts]","[_u_s_, top, court, weigh, race, challenge, legislative, district]","[JJ, JJ, NN, VBD, NN, NNS, TO, JJ, NNS]",U.S. top court weighs race challenges to legislative districts
3223,Republicans retreat from plan to curb some press camera access in _u_s_ Capitol,"[(Republicans, NNPS), (retreat, VBP), (from, IN), (plan, NN), (to, TO), (curb, VB), (some, DT), (press, NN), (camera, NN), (access, NN), (in, IN), (_u_s_, NNP), (Capitol, NNP)]","[republicans, retreat, from, plan, to, curb, some, press, camera, access, in, _u_s_, capitol]","[republican, retreat, plan, curb, press, camera, access, _u_s_, capitol]","[NNPS, VBP, IN, NN, TO, VB, DT, NN, NN, NN, IN, NNP, NNP]",Republicans retreat from plan to curb some press camera access in U.S. Capitol
5930,Trump refugee order dashes hopes of Iraqis who helped the _u_s_,"[(Trump, NNP), (refugee, NN), (order, NN), (dashes, NNS), (hopes, NNS), (of, IN), (Iraqis, NNP), (who, WP), (helped, VBD), (the, DT), (_u_s_, NN)]","[trump, refugee, order, dashes, hopes, of, iraqis, who, helped, the, _u_s_]","[trump, refugee, order, dash, hope, iraqi, help, _u_s_]","[NNP, NN, NN, NNS, NNS, IN, NNP, WP, VBD, DT, NN]",Trump refugee order dashes hopes of Iraqis who helped the U.S.
1075,Trump: 'No one really knows' how much tax plan would generate,"[(Trump, NN), (:, :), ('No, CC), (one, CD), (really, RB), (knows, VBZ), (', POS), (how, WRB), (much, JJ), (tax, NN), (plan, NN), (would, MD), (generate, VB)]","[trump, :, no, one, really, knows, how, much, tax, plan, would, generate]","[trump, one, really, know, much, tax, plan, would, generate]","[NN, :, CC, CD, RB, VBZ, WRB, JJ, NN, NN, MD, VB]",Trump: 'No one really knows' how much tax plan would generate
16230,Yemeni Salafist imam killed in Aden: sources,"[(Yemeni, NNP), (Salafist, NNP), (imam, NN), (killed, VBN), (in, IN), (Aden, NNP), (:, :), (sources, NNS)]","[yemeni, salafist, imam, killed, in, aden, :, sources]","[yemeni, salafist, imam, kill, aden, source]","[NNP, NNP, NN, VBN, IN, NNP, :, NNS]",Yemeni Salafist imam killed in Aden: sources


Unnamed: 0,title,pos,cleaned_words,minimal_words,cleaned_pos,org_title
3915,Trump Gets His _mytag_slans_ Handed To Him For Running Off Stage Because Some Guy Had A Sign,"[(Trump, NNP), (Gets, VBZ), (His, PRP$), (_mytag_slans_, NN), (Handed, VBD), (To, TO), (Him, NNP), (For, IN), (Running, VBG), (Off, NNP), (Stage, NN), (Because, IN), (Some, DT), (Guy, NNP), (Had, ...","[trump, gets, his, _mytag_slans_, handed, to, him, for, running, off, stage, because, some, guy, had, a, sign]","[trump, get, _mytag_slans_, hand, run, stage, guy, sign]","[NNP, VBZ, PRP$, NN, VBD, TO, NNP, IN, VBG, NNP, NN, IN, DT, NNP, VBD, DT, NN]",Trump Gets His A** Handed To Him For Running Off Stage Because Some Guy Had A Sign
20409,OBAMA COMMENCEMENT SPEECH To Black Graduates: You’re Just Lucky,"[(OBAMA, NNP), (COMMENCEMENT, NNP), (SPEECH, NNP), (To, TO), (Black, NNP), (Graduates, NNP), (:, :), (You’re, NN), (Just, NNP), (Lucky, NNP)]","[obama, commencement, speech, to, black, graduates, :, you re, just, lucky]","[obama, commencement, speech, black, graduate, lucky]","[NNP, NNP, NNP, TO, NNP, NNP, :, NN, NNP, NNP]",OBAMA COMMENCEMENT SPEECH To Black Graduates: You’re Just Lucky
379,Trump Just Bragged About The Size Of His Hands At Hurricane Irma Relief Event _mytag_parentheses_,"[(Trump, NNP), (Just, NNP), (Bragged, NNP), (About, IN), (The, DT), (Size, NN), (Of, IN), (His, PRP$), (Hands, NNS), (At, IN), (Hurricane, NNP), (Irma, NNP), (Relief, NNP), (Event, NNP), (_mytag_p...","[trump, just, bragged, about, the, size, of, his, hands, at, hurricane, irma, relief, event, _mytag_parentheses_]","[trump, bragged, size, hand, hurricane, irma, relief, event, _mytag_parentheses_]","[NNP, NNP, NNP, IN, DT, NN, IN, PRP$, NNS, IN, NNP, NNP, NNP, NNP, NN]",Trump Just Bragged About The Size Of His Hands At Hurricane Irma Relief Event (VIDEO)
13583,BEWARE OF HILLARY CLINTON’S “Smart Power” Foreign Policy…It’s NOT Smart! _mytag_parentheses_,"[(BEWARE, NNP), (OF, NNP), (HILLARY, NNP), (CLINTON’S, NNP), (“Smart, NNP), (Power”, NNP), (Foreign, NNP), (Policy…It’s, NNP), (NOT, NNP), (Smart, NNP), (!, .), (_mytag_parentheses_, NN)]","[beware, of, hillary, clinton s, smart, power, foreign, policy it s, not, smart, !, _mytag_parentheses_]","[beware, hillary, clinton, smart, power, foreign, policy, smart, _mytag_parentheses_]","[NNP, NNP, NNP, NNP, NNP, NNP, NNP, NNP, NNP, NNP, ., NN]",BEWARE OF HILLARY CLINTON’S “Smart Power” Foreign Policy…It’s NOT Smart! [VIDEO]
3883,Trump Supporter Pulls Gun On Man Because He Refused To Vote For Trump,"[(Trump, NNP), (Supporter, NNP), (Pulls, NNP), (Gun, NNP), (On, IN), (Man, NNP), (Because, IN), (He, PRP), (Refused, VBD), (To, TO), (Vote, VB), (For, IN), (Trump, NNP)]","[trump, supporter, pulls, gun, on, man, because, he, refused, to, vote, for, trump]","[trump, supporter, pull, gun, man, refuse, vote, trump]","[NNP, NNP, NNP, NNP, IN, NNP, IN, PRP, VBD, TO, VB, IN, NNP]",Trump Supporter Pulls Gun On Man Because He Refused To Vote For Trump
2617,Sean Spicer Dreams Up Brand-New Fake Terrorist Attack To Defend Trump’s Muslim Ban,"[(Sean, JJ), (Spicer, NNP), (Dreams, NNP), (Up, NNP), (Brand-New, NNP), (Fake, NNP), (Terrorist, NNP), (Attack, NNP), (To, TO), (Defend, VB), (Trump’s, NNP), (Muslim, NNP), (Ban, NNP)]","[sean, spicer, dreams, up, brand new, fake, terrorist, attack, to, defend, trump s, muslim, ban]","[sean, spicer, dream, brand new, fake, terrorist, attack, defend, trump, muslim, ban]","[JJ, NNP, NNP, NNP, NNP, NNP, NNP, NNP, TO, VB, NNP, NNP, NNP]",Sean Spicer Dreams Up Brand-New Fake Terrorist Attack To Defend Trump’s Muslim Ban
18009,N KOREA JUST REVEALED Plans To Unleash An Unimaginable Attack That Could Lead To Electronic Armageddon,"[(N, NNP), (KOREA, NNP), (JUST, NNP), (REVEALED, NNP), (Plans, NNPS), (To, TO), (Unleash, VB), (An, DT), (Unimaginable, JJ), (Attack, NN), (That, WDT), (Could, NNP), (Lead, NNP), (To, TO), (Electr...","[n, korea, just, revealed, plans, to, unleash, an, unimaginable, attack, that, could, lead, to, electronic, armageddon]","[korea, revealed, plan, unleash, unimaginable, attack, could, lead, electronic, armageddon]","[NNP, NNP, NNP, NNP, NNPS, TO, VB, DT, JJ, NN, WDT, NNP, NNP, TO, NNP, NNP]",N KOREA JUST REVEALED Plans To Unleash An Unimaginable Attack That Could Lead To Electronic Armageddon
22023,Zakharova Slams CIA Chief Pompeo: Stop Making Up Anti-Russian Fiction,"[(Zakharova, NNP), (Slams, NNP), (CIA, NNP), (Chief, NNP), (Pompeo, NNP), (:, :), (Stop, NN), (Making, VBG), (Up, RP), (Anti-Russian, JJ), (Fiction, NN)]","[zakharova, slams, cia, chief, pompeo, :, stop, making, up, anti russian, fiction]","[zakharova, slam, cia, chief, pompeo, stop, make, anti russian, fiction]","[NNP, NNP, NNP, NNP, NNP, :, NN, VBG, RP, JJ, NN]",Zakharova Slams CIA Chief Pompeo: Stop Making Up Anti-Russian Fiction
15946,"Democrat Corey Booker Backs Single-Payer and Wide Open Borders: ‘Build Tunnels, Not Walls’ _mytag_parentheses_","[(Democrat, NNP), (Corey, NNP), (Booker, NNP), (Backs, NNP), (Single-Payer, NNP), (and, CC), (Wide, NNP), (Open, NNP), (Borders, NNS), (:, :), (‘Build, NN), (Tunnels, NNP), (,, ,), (Not, RB), (Wal...","[democrat, corey, booker, backs, single payer, and, wide, open, borders, :, build, tunnels, not, walls, _mytag_parentheses_]","[democrat, corey, booker, back, single payer, wide, open, border, build, tunnel, wall, _mytag_parentheses_]","[NNP, NNP, NNP, NNP, NNP, CC, NNP, NNP, NNS, :, NN, NNP, RB, NNP, NN]","Democrat Corey Booker Backs Single-Payer and Wide Open Borders: ‘Build Tunnels, Not Walls’ [Video]"
15729,HAS FACE BOOK SIDED WITH MUSLIM JIHADISTS AGAINST FREE SPEECH? Muhammed Cartoon Contest Winner Is Removed From Social Media Site,"[(HAS, NNP), (FACE, NNP), (BOOK, NNP), (SIDED, NNP), (WITH, NNP), (MUSLIM, NNP), (JIHADISTS, NNP), (AGAINST, NNP), (FREE, NNP), (SPEECH, NNP), (?, .), (Muhammed, NNP), (Cartoon, NNP), (Contest, NN...","[has, face, book, sided, with, muslim, jihadists, against, free, speech, ?, muhammed, cartoon, contest, winner, is, removed, from, social, media, site]","[face, book, sided, muslim, jihadist, free, speech, muhammed, cartoon, contest, winner, remove, social, medium, site]","[NNP, NNP, NNP, NNP, NNP, NNP, NNP, NNP, NNP, NNP, ., NNP, NNP, NNP, NNP, VBZ, VBN, IN, NNP, NNP, NNP]",HAS FACE BOOK SIDED WITH MUSLIM JIHADISTS AGAINST FREE SPEECH? Muhammed Cartoon Contest Winner Is Removed From Social Media Site


In [188]:
df0.to_csv('TrueOrganized')
df1.to_csv('FakeOrganized')

# Conclusion
We finished cleaning and organization.