In [1]:
import pandas as pd
import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/viti/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/viti/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/viti/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Testing set

In [2]:
df = pd.read_csv(
    '../data/train.csv', 
    encoding='utf-8', 
    on_bad_lines='warn',
    usecols=lambda column: column not in ['id']
)

In [3]:
df

Unnamed: 0,title,author,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...,...
20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0
20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0
20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0
20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1


In [4]:
class Preprocessing:
    def __init__(self, df, is_train_df=True):
        self._df = df
        self.is_train_df = is_train_df
    
    def _set_dtypes(self):
        self._df['author'][self._df['author'].notna()] = self._df['author'].astype('str')
        self._df['title'][self._df['title'].notna()] = self._df['title'].astype('str')
        self._df['text'][self._df['text'].notna()] = self._df['text'].astype('str')

        if self.is_train_df:
            self._df['label'][self._df['label'].notna()] = self._df['label'].astype('int')
    
    def _remove_duplicates(self):
        self._df = self._df.drop_duplicates()

    def _remove_empty_rows(self):
        self._df = self._df.dropna(subset=['title', 'author', 'text'])

    def _remove_punctuation(self):
        self._df['removed_punc'] = self._df['text'].apply(lambda x: ("".join([ch for ch in x if ch not in string.punctuation])))

    def _tokenize(self, col_name='removed_punc'):
        # We use the WhitespaceTokenizer to not separate the words with contractions (i.e keep don't, didn't, etc. as one token)
        self._df['tokens'] = self._df[col_name].apply(lambda x: nltk.WhitespaceTokenizer().tokenize(x.lower()))
    
    def _remove_short_words(self, col_name='tokens', min_len=3):
        self._df['filtered_tokens'] = self._df[col_name].apply(lambda x: [word for word in x if len(word) > min_len])

    def _remove_stop_words(self, col_name='filtered_tokens'):
        self._df['clean_tokens'] = self._df[col_name].apply(lambda x: [word for word in x if word not in (nltk.corpus.stopwords.words('english'))])

    def _lemmatize(self, col_name='clean_tokens'):
        self._df['lemma_words'] = self._df[col_name].apply(lambda x : [nltk.WordNetLemmatizer().lemmatize(word) for word in x])

    def _clean_text(self, col_name='lemma_words'):
        self._df['clean_text'] = self._df[col_name].apply(lambda x : " ".join([word for word in x]))

    def preprocess(self):
        self._set_dtypes()
        self._remove_duplicates()
        self._remove_empty_rows()

        if self.is_train_df:
            self._remove_punctuation()
            self._tokenize()
            self._remove_short_words()
            self._remove_stop_words()
            self._lemmatize()
            self._clean_text()
        
        return self._df

In [14]:
def preprocess(df, is_train_df=True):
    preprocessing = Preprocessing(df, is_train_df)
    df = preprocessing.preprocess()
    return df

In [6]:
train = preprocess(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._df['author'][self._df['author'].notna()] = self._df['author'].astype('str')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._df['title'][self._df['title'].notna()] = self._df['title'].astype('str')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._df['text'][self._df['text'].notna()] = self._df['text'].astype('str')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-d

In [7]:
train

Unnamed: 0,title,author,text,label,removed_punc,tokens,filtered_tokens,clean_tokens,lemma_words,clean_text
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,House Dem Aide We Didn’t Even See Comey’s Lett...,"[house, dem, aide, we, didn’t, even, see, come...","[house, aide, didn’t, even, comey’s, letter, u...","[house, aide, didn’t, even, comey’s, letter, j...","[house, aide, didn’t, even, comey’s, letter, j...",house aide didn’t even comey’s letter jason ch...
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,Ever get the feeling your life circles the rou...,"[ever, get, the, feeling, your, life, circles,...","[ever, feeling, your, life, circles, roundabou...","[ever, feeling, life, circles, roundabout, rat...","[ever, feeling, life, circle, roundabout, rath...",ever feeling life circle roundabout rather hea...
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Why the Truth Might Get You Fired October 29 2...,"[why, the, truth, might, get, you, fired, octo...","[truth, might, fired, october, 2016, tension, ...","[truth, might, fired, october, 2016, tension, ...","[truth, might, fired, october, 2016, tension, ...",truth might fired october 2016 tension intelli...
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,Videos 15 Civilians Killed In Single US Airstr...,"[videos, 15, civilians, killed, in, single, us...","[videos, civilians, killed, single, airstrike,...","[videos, civilians, killed, single, airstrike,...","[video, civilian, killed, single, airstrike, i...",video civilian killed single airstrike identif...
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Print \nAn Iranian woman has been sentenced to...,"[print, an, iranian, woman, has, been, sentenc...","[print, iranian, woman, been, sentenced, years...","[print, iranian, woman, sentenced, years, pris...","[print, iranian, woman, sentenced, year, priso...",print iranian woman sentenced year prison iran...
...,...,...,...,...,...,...,...,...,...,...
20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0,Rapper T I unloaded on black celebrities who m...,"[rapper, t, i, unloaded, on, black, celebritie...","[rapper, unloaded, black, celebrities, with, d...","[rapper, unloaded, black, celebrities, donald,...","[rapper, unloaded, black, celebrity, donald, t...",rapper unloaded black celebrity donald trump e...
20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0,When the Green Bay Packers lost to the Washing...,"[when, the, green, bay, packers, lost, to, the...","[when, green, packers, lost, washington, redsk...","[green, packers, lost, washington, redskins, w...","[green, packer, lost, washington, redskin, wee...",green packer lost washington redskin week drop...
20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0,The Macy’s of today grew from the union of sev...,"[the, macy’s, of, today, grew, from, the, unio...","[macy’s, today, grew, from, union, several, gr...","[macy’s, today, grew, union, several, great, n...","[macy’s, today, grew, union, several, great, n...",macy’s today grew union several great name ame...
20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1,NATO Russia To Hold Parallel Exercises In Balk...,"[nato, russia, to, hold, parallel, exercises, ...","[nato, russia, hold, parallel, exercises, balk...","[nato, russia, hold, parallel, exercises, balk...","[nato, russia, hold, parallel, exercise, balka...",nato russia hold parallel exercise balkan 1102...


In [8]:
print(f"Finished Preprocessing: \n\n{train.isna().sum()}\n\nrows with NaN values")

Finished Preprocessing: 

title              0
author             0
text               0
label              0
removed_punc       0
tokens             0
filtered_tokens    0
clean_tokens       0
lemma_words        0
clean_text         0
dtype: int64

rows with NaN values


In [9]:
train.to_csv('../data/preprocessed_train.csv', index=False)

# Training set

In [10]:
df = pd.read_csv(
    '../data/test.csv', 
    encoding='utf-8', 
    on_bad_lines='warn',
    usecols=lambda column: column not in ['id']
)

In [11]:
df

Unnamed: 0,title,author,text
0,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...
...,...,...,...
5195,The Bangladeshi Traffic Jam That Never Ends - ...,Jody Rosen,Of all the dysfunctions that plague the world’...
5196,John Kasich Signs One Abortion Bill in Ohio bu...,Sheryl Gay Stolberg,WASHINGTON — Gov. John Kasich of Ohio on Tu...
5197,"California Today: What, Exactly, Is in Your Su...",Mike McPhate,Good morning. (Want to get California Today by...
5198,300 US Marines To Be Deployed To Russian Borde...,,« Previous - Next » 300 US Marines To Be Deplo...


In [15]:
test = preprocess(df, is_train_df=False)

In [16]:
test

Unnamed: 0,title,author,text
0,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
2,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...
6,Pelosi Calls for FBI Investigation to Find Out...,Pam Key,"Sunday on NBC’s “Meet the Press,” House Minori..."
...,...,...,...
5194,Trump on If ’Tapes’ Exist of Comey Conversatio...,Pam Key,Pres. Trump on if “tapes” exist of his convers...
5195,The Bangladeshi Traffic Jam That Never Ends - ...,Jody Rosen,Of all the dysfunctions that plague the world’...
5196,John Kasich Signs One Abortion Bill in Ohio bu...,Sheryl Gay Stolberg,WASHINGTON — Gov. John Kasich of Ohio on Tu...
5197,"California Today: What, Exactly, Is in Your Su...",Mike McPhate,Good morning. (Want to get California Today by...


In [17]:
print(f"Finished Preprocessing: \n\n{test.isna().sum()}\n\nrows with NaN values")

Finished Preprocessing: 

title     0
author    0
text      0
dtype: int64

rows with NaN values


In [18]:
test.to_csv('../data/preprocessed_test.csv', index=False)