In [1]:
import spacy
#!python -m spacy download en_core_web_sm
import re
# Loading libraries for sentiment analysis

In [2]:
def NLP_preprocess(
    nlp,
    text,
    min_token_len=3, # Tokens must be strictly longer than this many characters to be kept
    irrelevant_pos=["ADV", "PRON", "CCONJ", "PUNCT", "PART", "DET", "ADP", "SPACE"], # POS tags whose tokens are discarded
    stop_mywords = ["after", "years", "scorning", "political",  "process", "silicon", "valley"]
):
    """
    Clean a piece of text and return its filtered, lemmatized, lower-cased
    tokens joined by single spaces.

    The text is first normalized (whitespace runs collapsed, ``*`` and ``~``
    removed, slashes turned into spaces), then tokenized with ``nlp``. A token
    is kept only if it is not a stop word, is purely alphabetic, does not look
    like an e-mail address or URL, is not punctuation or a currency symbol, is
    longer than ``min_token_len``, has a POS tag outside ``irrelevant_pos`` and
    its surface form is not listed in ``stop_mywords``.

    Parameters
    -------------
    nlp : (callable)
        spacy pipeline (or any callable) that turns a string into an iterable
        of tokens exposing the spacy token attributes used below
    text : (str)
        text to be preprocessed; ``None`` and float NaN (how pandas represents
        an empty cell) are treated as missing and yield an empty string
    min_token_len : (int)
        tokens are kept only when ``len(token) > min_token_len``, i.e. with the
        default of 3 a token needs at least 4 characters
    irrelevant_pos : (list)
        a list of irrelevant pos tags; tokens carrying one of them are dropped
    stop_mywords : (list)
        additional custom stop words, compared case-insensitively against the
        token text (not against its lemma)
    Returns
    -------------
    (str) the preprocessed text; empty if the input is missing or no token
    survives the filters
    """
    # Missing values have nothing to tokenize; re.sub would raise TypeError on them.
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Build the lookup sets once per call instead of scanning the lists for every token.
    excluded_pos = frozenset(irrelevant_pos)
    # Tokens are lower-cased before the comparison, so the custom words must be too.
    excluded_words = frozenset(word.lower() for word in stop_mywords)

    # Replace a sequence of whitespaces by a single whitespace
    text = re.sub(r"\s+", " ", text)
    # Remove asterisks and tildes
    text = re.sub(r"""[\*\~]+""", "", text)
    # Replace slashes with spaces
    text = re.sub(r"""[\/]+""", " ", text)
    doc = nlp(text)
    clean_text = []
    for token in doc:
        if (
            not token.is_stop  # Not a stop word of the pipeline's language
            and token.is_alpha  # Consists of alphabetic characters only
            and not token.like_email  # Does not look like an e-mail address
            and not token.like_url  # Does not look like a URL
            and not token.is_punct  # Is not punctuation
            and not token.is_currency  # Is not a currency symbol
            and len(token) > min_token_len  # Strictly longer than the threshold
            and token.pos_ not in excluded_pos  # POS tag is not one of the irrelevant ones
            and token.text.lower() not in excluded_words  # Not a custom stop word
        ):
            # Keep the lower-cased lemma rather than the surface form
            clean_text.append(token.lemma_.lower())
    return " ".join(clean_text)

In [3]:
# Load the small English pipeline with the parser and NER components disabled;
# NLP_preprocess only reads token-level attributes (POS tag, lemma, stop-word/shape flags).
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

In [4]:
# Custom stop words intended for the `stop_mywords` argument of NLP_preprocess.
# NOTE(review): the example call below is commented out and refers to a `text`
# variable that is not defined in the cells shown here, so `stop_words` is
# currently unused.
stop_words = ["year","political"]
#NLP_preprocess(nlp,text, stop_mywords = stop_words)

In [5]:
# Load fake_news/test.csv (path relative to the notebook's working directory)
# into a DataFrame and display it.
import pandas as pd
df = pd.read_csv("fake_news/test.csv")
df

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...
...,...,...,...,...
5195,25995,The Bangladeshi Traffic Jam That Never Ends - ...,Jody Rosen,Of all the dysfunctions that plague the world’...
5196,25996,John Kasich Signs One Abortion Bill in Ohio bu...,Sheryl Gay Stolberg,WASHINGTON — Gov. John Kasich of Ohio on Tu...
5197,25997,"California Today: What, Exactly, Is in Your Su...",Mike McPhate,Good morning. (Want to get California Today by...
5198,25998,300 US Marines To Be Deployed To Russian Borde...,,« Previous - Next » 300 US Marines To Be Deplo...


In [6]:
# Pick the row at position 3 as a small sample; the double brackets keep the
# result a one-row DataFrame instead of collapsing it to a Series.
df_test = df.iloc[[3]]
df_test

Unnamed: 0,id,title,author,text
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."


In [7]:
# Show the raw, unprocessed article text of the sample row for comparison.
df_test.text.tolist()

['If at first you don’t succeed, try a different sport. Tim Tebow, who was a Heisman   quarterback at the University of Florida but was unable to hold an N. F. L. job, is pursuing a career in Major League Baseball. He will hold a workout for M. L. B. teams this month, his agents told ESPN and other news outlets. “This may sound like a publicity stunt, but nothing could be further from the truth,” said Brodie Van Wagenen,   of CAA Baseball, part of the sports agency CAA Sports, in the statement. “I have seen Tim’s workouts, and people inside and outside the industry  —   scouts, executives, players and fans  —   will be impressed by his talent. ” It’s been over a decade since Tebow, 28, has played baseball full time, which means a comeback would be no easy task. But the former major league catcher Chad Moeller, who said in the statement that he had been training Tebow in Arizona, said he was “beyond impressed with Tim’s athleticism and swing. ” “I see bat speed and power and real baseba

In [8]:
# Run the sample text through NLP_preprocess with its default settings
# (default min_token_len, irrelevant_pos and stop_mywords).
df_test["text"].map(lambda x : NLP_preprocess(nlp, x)).tolist()

['succeed different sport tebow heisman quarterback university florida unable hold pursue career major league baseball hold workout team month agent tell espn news outlet sound publicity stunt truth say brodie wagenen baseball sport agency sports statement see workout people industry scout executive player fan impress talent decade tebow play baseball time mean comeback easy task major league catcher chad moeller say statement train tebow arizona say impressed athleticism swing speed power real baseball talent moeller say believe skill potential achieve goal play major league base see past month happen gary sheffield outfielder news tebow attempt comeback baseball greet skepticism twitter junior nease high ponte vedra tebow draw attention major league scout batting home run left fielder ditch glove favor pigskin lead florida national championship scout angeles angels tell weei boston radio station tebow consideration high school junior want draft send information card say scout kotchma