In [89]:
import pandas as pd

def process(f_input, f_output, excluded_subreddits=('r/me_irl',), title_tags=None):
    """Clean a raw reddit dump and write the result to a new CSV file.

    Steps, in order:
       1. Read the header-less input and attach column names
       2. Drop rows whose title duplicates an earlier row's title
       3. Drop every row belonging to an excluded subreddit
       4. Strip subreddit-specific tags (e.g. 'TIL') from titles

    Parameters
    ----------
    f_input : str or path-like
        CSV without a header row; columns are taken to be
        id, subreddit, title, ups, url, created_utc (in that order).
    f_output : str or path-like
        Destination CSV; written with a header row and without the index.
    excluded_subreddits : iterable of str, optional
        Subreddit names whose rows are removed. Defaults to ('r/me_irl',).
    title_tags : dict of {str: str}, optional
        Maps a subreddit name to the literal text removed from that
        subreddit's titles. Defaults to
        {'r/todayilearned': 'TIL', 'r/photoshopbattles': 'PsBattle:'}.
        Every occurrence of the text is removed (not only a leading one) and
        surrounding whitespace is left in place.

    Returns
    -------
    None

    Notes
    -----
    As a side effect this sets the global pandas option
    ``display.float_format`` to one decimal place.
    """
    if title_tags is None:
        title_tags = {
            'r/todayilearned': 'TIL',
            'r/photoshopbattles': 'PsBattle:',
        }

    # Global display setting kept for backward compatibility: later notebook
    # cells may rely on it. It has no influence on the CSV written below.
    pd.set_option('display.float_format', lambda x: '%.1f' % x)

    data = pd.read_csv(
        f_input,
        names=['id', 'subreddit', 'title', 'ups', 'url', 'created_utc'],
    )

    # Remove duplicate titles (the first occurrence is kept).
    data = data.drop_duplicates('title')

    # Remove unwanted subreddits. The explicit copy makes the frame independent
    # of the one it was filtered from, so the .loc assignments below are safe.
    data = data[~data['subreddit'].isin(list(excluded_subreddits))].copy()

    # Remove repetitive words from subreddit titles,
    # for instance 'TIL' from the r/todayilearned subreddit.
    for subreddit, tag in title_tags.items():
        rows = data['subreddit'] == subreddit
        if not rows.any():
            # Nothing to rewrite; also avoids calling .str on an empty selection.
            continue
        # regex=False pins literal matching: the default for `regex` differs
        # between pandas versions, and tags may contain regex metacharacters.
        data.loc[rows, 'title'] = data.loc[rows, 'title'].str.replace(
            tag, '', regex=False)

    data.to_csv(f_output, index=False)

In [80]:
# Disabled preview: prints the first (subreddit, title) row for each distinct
# subreddit. It expects a DataFrame named `data` at notebook scope; process()
# keeps its frame local and returns nothing, so `data` must be loaded
# separately before re-enabling this.
#for x in data.subreddit.unique():
#    print(data[data['subreddit'] == x][['subreddit', 'title']].head(1))

  subreddit                          title
0   r/funny  Horrible crash with funny end
        subreddit                                              title
2001  r/AskReddit  [Serious] What are important questions to ask ...
            subreddit                                              title
4002  r/todayilearned   Melanie Martinez had her home destroyed four ...
      subreddit                                              title
6003  r/science  Worry keeps us awake. A new study shows journa...
        subreddit                                              title
6754  r/worldnews  Saudi women allowed to attend football match f...
     subreddit  title
8755    r/pics  Fall.
      subreddit                                              title
10756    r/IAmA  [AMA Request] Animator/Artist specializing in ...
      subreddit                                              title
11088  r/gaming  MTG creator X blockchain = Nova Blitz, what do...
      subreddit                               

         subreddit                                              title
96411  r/lifehacks  When the chip reader and mag strip on your car...
             subreddit                                              title
98600  r/relationships  My (32M) wife (32F) is jealous because her fri...
       subreddit                                              title
100601   r/Games  Playerunknown confirms that they have actively...
       subreddit                                   title
101700     r/nba  Steven Adams, America's Next Top Model
            subreddit                                              title
103701  r/programming  OOPs, I FP’d again! (Advocating: use best of b...
        subreddit                                              title
105142  r/tattoos  New Gorilla Tattoo from Dino Nemec @ Lonewolf ...
       subreddit                       title
106300    r/nsfw  Julia Morse & Julri Waters
           subreddit                    title
107470  r/cringepics  When Your Ex Gets Dr