# 2.1 Data Cleaning

In [1]:
import re

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

#import time

In [2]:
# Notebook display settings: show every row, up to 200 columns, and up to
# 1000 characters per cell so long post bodies are not truncated.
# Option names are fully qualified because the short forms ('max_rows',
# 'max_columns') are treated as patterns and can match more than one option
# key in newer pandas releases, which raises OptionError.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_colwidth', 1000)

In [3]:
# Load the raw post dumps for the two subreddits, one frame per file.
travel, solotravel = (
    pd.read_csv(csv_path)
    for csv_path in ('./travel_posts.csv', './solotravel_posts.csv')
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
# dropping filler row from earlier
# Both frames get the same treatment: remove the row labelled 0, then
# renumber the index so it starts at 0 again.
for posts in (travel, solotravel):
    posts.drop(index=0, inplace=True)
    posts.reset_index(drop=True, inplace=True)

In [5]:
# setting removed and deleted posts as null
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on a `.loc[:, col]` selection: that selection is
# an intermediate object, so an in-place edit on it is not guaranteed to
# reach the parent frame (and is a silent no-op under pandas Copy-on-Write).
removed_markers = ['[removed]', '[deleted]']
travel['selftext'] = travel['selftext'].replace(removed_markers, np.nan)
solotravel['selftext'] = solotravel['selftext'].replace(removed_markers, np.nan)

In [6]:
# dropping all rows where the body text is null
# For each frame: collect the index labels whose body text is missing,
# drop those rows, then renumber the index.
for posts in (travel, solotravel):
    missing_body = posts.index[posts['selftext'].isna()]
    posts.drop(index=missing_body, inplace=True)
    posts.reset_index(drop=True, inplace=True)

### Let's look at the 20,000 most recent posts from each subreddit so that the sample sizes are equal

In [7]:
# Take the first 20,000 rows of each subreddit frame and stack them into a
# single frame with a fresh 0..n-1 index.
travel20k = travel.iloc[:20_000]
solo20k = solotravel.iloc[:20_000]
df = pd.concat([travel20k, solo20k], ignore_index=True)
df.head()

Unnamed: 0,title,author,created_utc,subreddit,selftext
0,"2-3 weeks in Europe in Summer 2022, from the US...west to east or east to west?",bobogator,1627526000.0,travel,"Hello. Planning a 2-3 week trip to Europe next summer, and our original plan was to travel west to east (Scotland, London, Normandy, Paris, Venice, Rome). Flights to Scotland are limited, so we're considering starting the trip in Italy and traveling west. Not sure how that'll work with Normandy between Paris and London...maybe Normandy is best handled as a day trip from Paris (maybe stay one night)?\n\nThe Scotland leg is kind of driving the bus here, since the plan is to play the Old Course, and our trip will either be the weeks before monthlong closure for The Open (course closes on 6/18 and reopens on 7/18) or the weeks after it. I figure the course will be busier in those weeks surrounding the tournament/closure, so pushing Scotland to early May might work better for us.\n\nIs east to west a crazy idea? :-) Thanks!"
1,Taking extensive time off,Vegetable_Amount4812,1627524000.0,travel,This might be weird but what do yall do for work? I see some of you going on long trips and all I can think of is how you can take that time off? I know in the US at least you can take 2 weeks off max 3. Which is not enough for me and weirdly it stresses me out.
2,The best historic places to go in Europe?,AbigailWilliams1692,1627521000.0,travel,"If you had only four full days to spend in Europe, one of which HAD to be spent near Stratford-Upon-Avon in England, what would you do and where would you go to see the most history?\n\nI am leaving the United States for the first time to attend an event in England, and I need recommendations for historic sites to visit in my down time. I am not opposed to taking brief flights between European countries."
3,Flights to and from places,benandfriendz,1627520000.0,travel,This might be a stupid question. Actually it is stupid but I can’t figure it out. I’m 18M and I’m traveling soon but I cant figure out how to book a flight to one place and have a different flight leaving somewhere else. Every website just gives me the offer of returning from same place or the same price for a one way ticket. Any help??
4,AMS - customs/Schengen border control time?,pescobar89,1627519000.0,travel,"Any comment from AMS fliers coming from outside the EU, is it reasonable to clear customs, get a train into the city and catch a train from Amsterdam Centraal in 90 minutes on a weekday if you have no checked baggage?\n\nflight arrival is at approx. 14:00, but the direct train I need leaves Centraal at 15:30. Otherwise it's 3-4 transfers and another 2hrs trip."


In [8]:
#binarize subreddit column, 1 for solotravel, 0 for travel
df.rename(columns = {'subreddit': 'solotravel'}, inplace = True)
# Assign the replaced column back rather than using inplace=True on a
# `.loc[:, col]` selection, which may modify a temporary copy and leave `df`
# unchanged. infer_objects() makes sure the 0/1 labels end up with a numeric
# dtype even when replace() itself leaves the column as object.
df['solotravel'] = (
    df['solotravel']
    .replace({'solotravel': 1, 'travel': 0})
    .infer_objects()
)

### The following cells clean up the `selftext` column: they remove stop words and non-alphabetic tokens, and stem the remaining words

In [None]:
# Porter stemmer instance used to reduce each token to its stem.
p_stemmer = PorterStemmer()

In [None]:
# Take the body text of the row labelled 1 as a single worked example.
sentence = df.loc[1, 'selftext']

In [None]:
# Split the text into a list of tokens with NLTK's word tokenizer.
sentence = word_tokenize(sentence)

In [None]:
# Build the English stop-word set once. The original called
# stopwords.words('english') inside the comprehension, rebuilding the whole
# list for every token and doing a linear scan for each membership test.
english_stopwords = set(stopwords.words('english'))
# Drop tokens whose lower-cased form is a stop word.
sentence = [token for token in sentence if token.lower() not in english_stopwords]

In [None]:
# Keep only tokens made up entirely of word characters that are not digits
# (letters and underscores); tokens containing digits or punctuation are dropped.
sentence = [t for t in sentence if re.match(r'[^\W\d]*$', t)]

In [None]:
# Replace every remaining token with its Porter stem.
sentence = list(map(p_stemmer.stem, sentence))

In [None]:
# Re-join the tokens into one space-separated string.
sentence = ' '.join(sentence)

In [None]:
# Overwrite the original body text of row 1 with the cleaned string.
df.loc[1, 'selftext'] = sentence

In [None]:
# Preview the first rows to inspect the cleaned text written to row 1.
df.head()

### Let's turn these steps into a for loop

In [None]:
# Trial run of the cleaning pipeline on the first 100 rows only.
p_stemmer = PorterStemmer()
# Loop-invariant work is hoisted out of the loop: the stop-word list was
# previously rebuilt for every single token, and the pattern re-looked-up
# on every call.
english_stopwords = set(stopwords.words('english'))
# Matches tokens made only of non-digit word characters (letters/underscores).
alpha_token = re.compile(r'[^\W\d]*$')
for i in range(100):
    sentence = df.loc[i, 'selftext']
    sentence = word_tokenize(sentence)
    sentence = [token for token in sentence if token.lower() not in english_stopwords]
    sentence = [t for t in sentence if alpha_token.match(t)]
    sentence = [p_stemmer.stem(token) for token in sentence]
    sentence = ' '.join(sentence)
    # Overwrite the raw text with the cleaned, stemmed version.
    df.loc[i, 'selftext'] = sentence

In [None]:
# Row 1001 is outside the 100 rows processed by the loop above, so this shows
# text the loop did not touch — presumably kept as a before/after comparison.
df.loc[1001, 'selftext']

### alright time to do it for the full df

In [9]:
# Porter stemmer instance used by the full-dataset cleaning loop.
p_stemmer = PorterStemmer()

In [10]:
%%time
# Clean every post body in `df`: tokenize, drop English stop words, keep only
# alphabetic tokens, stem, and write the re-joined text back in place.
# Loop-invariant work is hoisted: the original rebuilt the NLTK stop-word list
# for every token (a linear scan each time), which dominated the runtime.
english_stopwords = set(stopwords.words('english'))
# Matches tokens made only of non-digit word characters (letters/underscores).
alpha_token = re.compile(r'[^\W\d]*$')

#putting counter in just to make sure that the program is actually running properly
count = 0
for i in range(len(df)):
    sentence = df.loc[i, 'selftext']
    sentence = word_tokenize(sentence)
    sentence = [token for token in sentence if token.lower() not in english_stopwords]
    sentence = [t for t in sentence if alpha_token.match(t)]
    sentence = [p_stemmer.stem(token) for token in sentence]
    sentence = ' '.join(sentence)
    df.loc[i, 'selftext'] = sentence

    # Progress indicator: one line per 100 processed rows.
    count += 1
    if count % 100 == 0:
        print(count)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 6.91 µs
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
14800
14900
15000
15100
15200
15300
15400
15500
15600
15700
15800
15900
16000
16100
16200
16300
16400
16500
16600
16700
16800
16900
17000
17100
17200
17300
17

In [11]:
# Spot-check 100 random cleaned rows. A fixed seed keeps the displayed sample
# identical across Restart & Run All, so the saved output is reproducible.
df.sample(100, random_state=42)

Unnamed: 0,title,author,created_utc,solotravel,selftext
1319,Fat guy with a question about flying,AlmightyMoira,1625331000.0,0,fli north carolina septemb wonder anyon might abl tell airlin allow refund extra ticket thank advanc
37050,Moving to a new city (US) alone,dalishjade,1561647000.0,1,look advic year old femal done lot solo travel visa australia nz backpack trip mostli stay hostel work odd job back us ohio two year feel like readi chang least year two virtual job mobil within us desktop comput set requir hard drive internet connect work ca work kind transient state get fulli set perman locat wish could work laptop backpack around countri oh well think move entir new citi seem pretti daunt though would need find apart actual go could take day get earli get internet set move etc biggest issu would realli like roommat would huge hit bank account pay one bedroom anybodi experi find roommat ahead time citi present add bit detail current focus new orlean ideal choic read peopl friendli long put may issu make social connect love heat like parti cultur realli appeal advic citi particular would consid safe place solo move made decis yet look advic similar person experi might help
7325,Is it rude to wear earphones on long journeys when someone is with you?,elliehagin,1603798000.0,0,hour train journey come partner go want keep convers go whole time rude put earphon ex partner use long journey time bother quit easi go want know seen rude peopl thank
18443,England and Scotland,Larax22,1544102000.0,0,hello love travel england scotland especi edinburgh mother heard everi servic hotel restaur train pretti expens look hotel pretti expens major citi expect like ask suggest citi good connect coastal citi sorri spell incorrect hast brighton whitstabl kentberi probabl would go one hotel switch hotel everi servic night someth travel around bit like spend probabl one half time england day somewher near citi mention move somewher close edinburgh expens would trip approxim much hotel travel would cost know citi good connect rel cheap hotel approach bad expens way could suggest trip like take mom love english garden palac know like know know lot found googl go research start might easier someon live answer question thank read way point thank advanc answer wish nice day
24747,Where are you going as soon as travel bans are lifted?,loosebumhole,1586208000.0,1,read lot stori peopl amaz plan spent month year prepar plan cancel due coronaviru go soon travel prohibit safe wait time immedi make adventur miss anyon still unsur whether cancel trip year
6171,Short layover times?,sjtx1366,1609864000.0,0,hi look flight austin barcelona everyth delta airlin worri short layov time first time fli intern advic navig airport also greatli appreci au gt jfk hr min layov jfk jfk gt cdg hr min layov cdg cdg gt bcn
34392,Big differences between SE Asia and Eastern Europe,lilsamustafa,1566577000.0,1,decid se asia eastern europ next trip would love hear peopl broad big major differ two rel inexperienc travel one solo trip far western europ excit
11945,"r/travel Region of the Week: 'Far North - Nunavut, Greenland, Svalbard'",AutoModerator,1571912000.0,0,hey travel new seri weekli thread want focu region lot offer travel town natur interest place whether lesser known known provid depth suggest like tour thing place eat etc pleas contribut question thought suggest idea stori highlight travel destin whether place want see experi post archiv wiki destin page http link sidebar futur refer pleas direct repetit question http indonesia amp amp amp pleas click http list date futur destin notic area region list like futur topic may prior topic countri citi pleas focu specif region submiss unless prior futur topic http guidelin link extern site make sure relev help someon travel citi pleas includ adequ text link explain describ content help travel perspect gt exampl realli enjoy monterey bay aquarium california enough keep entertain whole day bear mind park site quit pricey go hill three day car park monterey aquarium http gt gt unhelp read blog http gt gt help favourit part drive pch waysid park wrote blog post best place stop http includ b...
16218,Rate our Thailand &amp; Vietnam Itinerary,lefang,1553489000.0,0,group travel thailand vietnam may pleas give us feedback itinerari far april arriv chiang mai april chiang mai may head pai may pai may leav pai chiang mai fli hanoi may hanoi may fli ho chi minh citi may ho chi minh may fli phuket get phi phi may phi phi may island may bangkok may leav bangkok return home
6362,Is there such a thing as a day trip from Rome to Sicily?,Creative20something,1609108000.0,0,time far futur spend day rome would possibl travel sicili back day two arriv depart rome perhap messina area


In [13]:
# Write the cleaned frame to disk without the index column.
# NOTE(review): the target path has no `.csv` extension — confirm that
# downstream readers expect exactly './cleaned_data' before renaming it.
df.to_csv('./cleaned_data', index = False)

In [15]:
# Count missing values per column in the in-memory frame.
# NOTE(review): a post whose text was reduced to an empty string is not NaN
# here, but would presumably be read back as NaN from the saved CSV — verify
# when loading the file downstream.
df.isna().sum()

title          0
author         0
created_utc    0
solotravel     0
selftext       0
dtype: int64